Updated to fedora-glibc-20051020T0651
[glibc.git] / posix / regcomp.c
blobd35ae5f7d0251a675ce3960f7b2022fc4f6934f2
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
219 reg_errcode_t ret;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 #ifdef _LIBC
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
258 reg_syntax_t ret = re_syntax_options;
260 re_syntax_options = syntax;
261 return ret;
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
285 #ifdef _LIBC
286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
301 static void
302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
313 if (type == CHARACTER)
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
336 #endif
338 else if (type == SIMPLE_BRACKET)
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
353 int i;
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
356 || cset->nranges || cset->nchar_classes)
358 # ifdef _LIBC
359 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
361 /* In this case we want to catch the bytes which are
362 the first byte of any collation elements.
363 e.g. In da_DK, we want to catch 'a' since "aa"
364 is a valid collation element, and don't catch
365 'b' since 'b' is the only collation element
366 which starts from 'b'. */
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # else
374 if (dfa->mb_cur_max > 1)
375 for (i = 0; i < SBC_MAX; ++i)
376 if (__btowc (i) == WEOF)
377 re_set_fastmap (fastmap, icase, i);
378 # endif /* not _LIBC */
380 for (i = 0; i < cset->nmbchars; ++i)
382 char buf[256];
383 mbstate_t state;
384 memset (&state, '\0', sizeof (state));
385 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
386 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
387 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
389 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
390 != (size_t) -1)
391 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
395 #endif /* RE_ENABLE_I18N */
396 else if (type == OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 || type == OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 || type == END_OF_RE)
402 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
403 if (type == END_OF_RE)
404 bufp->can_be_null = 1;
405 return;
410 /* Entry point for POSIX code. */
411 /* regcomp takes a regular expression as a string and compiles it.
413 PREG is a regex_t *. We do not expect any fields to be initialized,
414 since POSIX says we shouldn't. Thus, we set
416 `buffer' to the compiled pattern;
417 `used' to the length of the compiled pattern;
418 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419 REG_EXTENDED bit in CFLAGS is set; otherwise, to
420 RE_SYNTAX_POSIX_BASIC;
421 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422 `fastmap' to an allocated space for the fastmap;
423 `fastmap_accurate' to zero;
424 `re_nsub' to the number of subexpressions in PATTERN.
426 PATTERN is the address of the pattern string.
428 CFLAGS is a series of bits which affect compilation.
430 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431 use POSIX basic syntax.
433 If REG_NEWLINE is set, then . and [^...] don't match newline.
434 Also, regexec will try a match beginning after every newline.
436 If REG_ICASE is set, then we considers upper- and lowercase
437 versions of letters to be equivalent when matching.
439 If REG_NOSUB is set, then when PREG is passed to regexec, that
440 routine will report only success or failure, and nothing about the
441 registers.
443 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
444 the return codes and their meanings.) */
447 regcomp (preg, pattern, cflags)
448 regex_t *__restrict preg;
449 const char *__restrict pattern;
450 int cflags;
452 reg_errcode_t ret;
453 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
454 : RE_SYNTAX_POSIX_BASIC);
456 preg->buffer = NULL;
457 preg->allocated = 0;
458 preg->used = 0;
460 /* Try to allocate space for the fastmap. */
461 preg->fastmap = re_malloc (char, SBC_MAX);
462 if (BE (preg->fastmap == NULL, 0))
463 return REG_ESPACE;
465 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
467 /* If REG_NEWLINE is set, newlines are treated differently. */
468 if (cflags & REG_NEWLINE)
469 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
470 syntax &= ~RE_DOT_NEWLINE;
471 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
472 /* It also changes the matching behavior. */
473 preg->newline_anchor = 1;
475 else
476 preg->newline_anchor = 0;
477 preg->no_sub = !!(cflags & REG_NOSUB);
478 preg->translate = NULL;
480 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
482 /* POSIX doesn't distinguish between an unmatched open-group and an
483 unmatched close-group: both are REG_EPAREN. */
484 if (ret == REG_ERPAREN)
485 ret = REG_EPAREN;
487 /* We have already checked preg->fastmap != NULL. */
488 if (BE (ret == REG_NOERROR, 1))
489 /* Compute the fastmap now, since regexec cannot modify the pattern
490 buffer. This function never fails in this implementation. */
491 (void) re_compile_fastmap (preg);
492 else
494 /* Some error occurred while compiling the expression. */
495 re_free (preg->fastmap);
496 preg->fastmap = NULL;
499 return (int) ret;
501 #ifdef _LIBC
502 weak_alias (__regcomp, regcomp)
503 #endif
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506 from either regcomp or regexec. We don't use PREG here. */
508 size_t
509 regerror (errcode, preg, errbuf, errbuf_size)
510 int errcode;
511 const regex_t *__restrict preg;
512 char *__restrict errbuf;
513 size_t errbuf_size;
515 const char *msg;
516 size_t msg_size;
518 if (BE (errcode < 0
519 || errcode >= (int) (sizeof (__re_error_msgid_idx)
520 / sizeof (__re_error_msgid_idx[0])), 0))
521 /* Only error codes returned by the rest of the code should be passed
522 to this routine. If we are given anything else, or if other regex
523 code generates an invalid error code, then the program has a bug.
524 Dump core so we can fix it. */
525 abort ();
527 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
529 msg_size = strlen (msg) + 1; /* Includes the null. */
531 if (BE (errbuf_size != 0, 1))
533 if (BE (msg_size > errbuf_size, 0))
535 #if defined HAVE_MEMPCPY || defined _LIBC
536 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
537 #else
538 memcpy (errbuf, msg, errbuf_size - 1);
539 errbuf[errbuf_size - 1] = 0;
540 #endif
542 else
543 memcpy (errbuf, msg, msg_size);
546 return msg_size;
548 #ifdef _LIBC
549 weak_alias (__regerror, regerror)
550 #endif
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 #endif
566 static void
567 free_dfa_content (re_dfa_t *dfa)
569 int i, j;
571 if (dfa->nodes)
572 for (i = 0; i < dfa->nodes_len; ++i)
573 free_token (dfa->nodes + i);
574 re_free (dfa->nexts);
575 for (i = 0; i < dfa->nodes_len; ++i)
577 if (dfa->eclosures != NULL)
578 re_node_set_free (dfa->eclosures + i);
579 if (dfa->inveclosures != NULL)
580 re_node_set_free (dfa->inveclosures + i);
581 if (dfa->edests != NULL)
582 re_node_set_free (dfa->edests + i);
584 re_free (dfa->edests);
585 re_free (dfa->eclosures);
586 re_free (dfa->inveclosures);
587 re_free (dfa->nodes);
589 if (dfa->state_table)
590 for (i = 0; i <= dfa->state_hash_mask; ++i)
592 struct re_state_table_entry *entry = dfa->state_table + i;
593 for (j = 0; j < entry->num; ++j)
595 re_dfastate_t *state = entry->array[j];
596 free_state (state);
598 re_free (entry->array);
600 re_free (dfa->state_table);
601 #ifdef RE_ENABLE_I18N
602 if (dfa->sb_char != utf8_sb_map)
603 re_free (dfa->sb_char);
604 #endif
605 re_free (dfa->subexp_map);
606 #ifdef DEBUG
607 re_free (dfa->re_str);
608 #endif
610 re_free (dfa);
614 /* Free dynamically allocated space used by PREG. */
616 void
617 regfree (preg)
618 regex_t *preg;
620 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
621 if (BE (dfa != NULL, 1))
622 free_dfa_content (dfa);
623 preg->buffer = NULL;
624 preg->allocated = 0;
626 re_free (preg->fastmap);
627 preg->fastmap = NULL;
629 re_free (preg->translate);
630 preg->translate = NULL;
632 #ifdef _LIBC
633 weak_alias (__regfree, regfree)
634 #endif
636 /* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
639 #if defined _REGEX_RE_COMP || defined _LIBC
641 /* BSD has one and only one pattern buffer. */
642 static struct re_pattern_buffer re_comp_buf;
644 char *
645 # ifdef _LIBC
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647 these names if they don't use our functions, and still use
648 regcomp/regexec above without link errors. */
649 weak_function
650 # endif
651 re_comp (s)
652 const char *s;
654 reg_errcode_t ret;
655 char *fastmap;
657 if (!s)
659 if (!re_comp_buf.buffer)
660 return gettext ("No previous regular expression");
661 return 0;
664 if (re_comp_buf.buffer)
666 fastmap = re_comp_buf.fastmap;
667 re_comp_buf.fastmap = NULL;
668 __regfree (&re_comp_buf);
669 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
670 re_comp_buf.fastmap = fastmap;
673 if (re_comp_buf.fastmap == NULL)
675 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
676 if (re_comp_buf.fastmap == NULL)
677 return (char *) gettext (__re_error_msgid
678 + __re_error_msgid_idx[(int) REG_ESPACE]);
681 /* Since `re_exec' always passes NULL for the `regs' argument, we
682 don't need to initialize the pattern buffer fields which affect it. */
684 /* Match anchors at newlines. */
685 re_comp_buf.newline_anchor = 1;
687 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
689 if (!ret)
690 return NULL;
692 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
693 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
696 #ifdef _LIBC
697 libc_freeres_fn (free_mem)
699 __regfree (&re_comp_buf);
701 #endif
703 #endif /* _REGEX_RE_COMP */
705 /* Internal entry point.
706 Compile the regular expression PATTERN, whose length is LENGTH.
707 SYNTAX indicate regular expression's syntax. */
709 static reg_errcode_t
710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
711 reg_syntax_t syntax)
713 reg_errcode_t err = REG_NOERROR;
714 re_dfa_t *dfa;
715 re_string_t regexp;
717 /* Initialize the pattern buffer. */
718 preg->fastmap_accurate = 0;
719 preg->syntax = syntax;
720 preg->not_bol = preg->not_eol = 0;
721 preg->used = 0;
722 preg->re_nsub = 0;
723 preg->can_be_null = 0;
724 preg->regs_allocated = REGS_UNALLOCATED;
726 /* Initialize the dfa. */
727 dfa = (re_dfa_t *) preg->buffer;
728 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
730 /* If zero allocated, but buffer is non-null, try to realloc
731 enough space. This loses if buffer's address is bogus, but
732 that is the user's responsibility. If ->buffer is NULL this
733 is a simple allocation. */
734 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
735 if (dfa == NULL)
736 return REG_ESPACE;
737 preg->allocated = sizeof (re_dfa_t);
738 preg->buffer = (unsigned char *) dfa;
740 preg->used = sizeof (re_dfa_t);
742 err = init_dfa (dfa, length);
743 if (BE (err != REG_NOERROR, 0))
745 free_dfa_content (dfa);
746 preg->buffer = NULL;
747 preg->allocated = 0;
748 return err;
750 #ifdef DEBUG
751 /* Note: length+1 will not overflow since it is checked in init_dfa. */
752 dfa->re_str = re_malloc (char, length + 1);
753 strncpy (dfa->re_str, pattern, length + 1);
754 #endif
756 __libc_lock_init (dfa->lock);
758 err = re_string_construct (&regexp, pattern, length, preg->translate,
759 syntax & RE_ICASE, dfa);
760 if (BE (err != REG_NOERROR, 0))
762 re_compile_internal_free_return:
763 free_workarea_compile (preg);
764 re_string_destruct (&regexp);
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
771 /* Parse the regular expression, and build a structure tree. */
772 preg->re_nsub = 0;
773 dfa->str_tree = parse (&regexp, preg, syntax, &err);
774 if (BE (dfa->str_tree == NULL, 0))
775 goto re_compile_internal_free_return;
777 /* Analyze the tree and create the nfa. */
778 err = analyze (preg);
779 if (BE (err != REG_NOERROR, 0))
780 goto re_compile_internal_free_return;
782 #ifdef RE_ENABLE_I18N
783 /* If possible, do searching in single byte encoding to speed things up. */
784 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
785 optimize_utf8 (dfa);
786 #endif
788 /* Then create the initial state of the dfa. */
789 err = create_initial_state (dfa);
791 /* Release work areas. */
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
795 if (BE (err != REG_NOERROR, 0))
797 free_dfa_content (dfa);
798 preg->buffer = NULL;
799 preg->allocated = 0;
802 return err;
805 /* Initialize DFA. We use the length of the regular expression PAT_LEN
806 as the initial length of some arrays. */
808 static reg_errcode_t
809 init_dfa (re_dfa_t *dfa, size_t pat_len)
811 unsigned int table_size;
812 #ifndef _LIBC
813 char *codeset_name;
814 #endif
816 memset (dfa, '\0', sizeof (re_dfa_t));
818 /* Force allocation of str_tree_storage the first time. */
819 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
821 /* Avoid overflows. */
822 if (pat_len == SIZE_MAX)
823 return REG_ESPACE;
825 dfa->nodes_alloc = pat_len + 1;
826 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828 /* table_size = 2 ^ ceil(log pat_len) */
829 for (table_size = 1; ; table_size <<= 1)
830 if (table_size > pat_len)
831 break;
833 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
834 dfa->state_hash_mask = table_size - 1;
836 dfa->mb_cur_max = MB_CUR_MAX;
837 #ifdef _LIBC
838 if (dfa->mb_cur_max == 6
839 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
840 dfa->is_utf8 = 1;
841 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
842 != 0);
843 #else
844 # ifdef HAVE_LANGINFO_CODESET
845 codeset_name = nl_langinfo (CODESET);
846 # else
847 codeset_name = getenv ("LC_ALL");
848 if (codeset_name == NULL || codeset_name[0] == '\0')
849 codeset_name = getenv ("LC_CTYPE");
850 if (codeset_name == NULL || codeset_name[0] == '\0')
851 codeset_name = getenv ("LANG");
852 if (codeset_name == NULL)
853 codeset_name = "";
854 else if (strchr (codeset_name, '.') != NULL)
855 codeset_name = strchr (codeset_name, '.') + 1;
856 # endif
858 if (strcasecmp (codeset_name, "UTF-8") == 0
859 || strcasecmp (codeset_name, "UTF8") == 0)
860 dfa->is_utf8 = 1;
862 /* We check exhaustively in the loop below if this charset is a
863 superset of ASCII. */
864 dfa->map_notascii = 0;
865 #endif
867 #ifdef RE_ENABLE_I18N
868 if (dfa->mb_cur_max > 1)
870 if (dfa->is_utf8)
871 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
872 else
874 int i, j, ch;
876 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
877 if (BE (dfa->sb_char == NULL, 0))
878 return REG_ESPACE;
880 /* Set the bits corresponding to single byte chars. */
881 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
882 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
884 wint_t wch = __btowc (ch);
885 if (wch != WEOF)
886 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
887 # ifndef _LIBC
888 if (isascii (ch) && wch != ch)
889 dfa->map_notascii = 1;
890 # endif
894 #endif
896 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
897 return REG_ESPACE;
898 return REG_NOERROR;
901 /* Initialize WORD_CHAR table, which indicate which character is
902 "word". In this case "word" means that it is the word construction
903 character used by some operators like "\<", "\>", etc. */
905 static void
906 internal_function
907 init_word_char (re_dfa_t *dfa)
909 int i, j, ch;
910 dfa->word_ops_used = 1;
911 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
912 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
913 if (isalnum (ch) || ch == '_')
914 dfa->word_char[i] |= (bitset_word_t) 1 << j;
917 /* Free the work area which are only used while compiling. */
919 static void
920 free_workarea_compile (regex_t *preg)
922 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
923 bin_tree_storage_t *storage, *next;
924 for (storage = dfa->str_tree_storage; storage; storage = next)
926 next = storage->next;
927 re_free (storage);
929 dfa->str_tree_storage = NULL;
930 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
931 dfa->str_tree = NULL;
932 re_free (dfa->org_indices);
933 dfa->org_indices = NULL;
936 /* Create initial states for all contexts. */
938 static reg_errcode_t
939 create_initial_state (re_dfa_t *dfa)
941 int first, i;
942 reg_errcode_t err;
943 re_node_set init_nodes;
945 /* Initial states have the epsilon closure of the node which is
946 the first node of the regular expression. */
947 first = dfa->str_tree->first->node_idx;
948 dfa->init_node = first;
949 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
950 if (BE (err != REG_NOERROR, 0))
951 return err;
953 /* The back-references which are in initial states can epsilon transit,
954 since in this case all of the subexpressions can be null.
955 Then we add epsilon closures of the nodes which are the next nodes of
956 the back-references. */
957 if (dfa->nbackref > 0)
958 for (i = 0; i < init_nodes.nelem; ++i)
960 int node_idx = init_nodes.elems[i];
961 re_token_type_t type = dfa->nodes[node_idx].type;
963 int clexp_idx;
964 if (type != OP_BACK_REF)
965 continue;
966 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
968 re_token_t *clexp_node;
969 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
970 if (clexp_node->type == OP_CLOSE_SUBEXP
971 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
972 break;
974 if (clexp_idx == init_nodes.nelem)
975 continue;
977 if (type == OP_BACK_REF)
979 int dest_idx = dfa->edests[node_idx].elems[0];
980 if (!re_node_set_contains (&init_nodes, dest_idx))
982 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
983 i = 0;
988 /* It must be the first time to invoke acquire_state. */
989 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
990 /* We don't check ERR here, since the initial state must not be NULL. */
991 if (BE (dfa->init_state == NULL, 0))
992 return err;
993 if (dfa->init_state->has_constraint)
995 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
996 CONTEXT_WORD);
997 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
998 CONTEXT_NEWLINE);
999 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000 &init_nodes,
1001 CONTEXT_NEWLINE
1002 | CONTEXT_BEGBUF);
1003 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004 || dfa->init_state_begbuf == NULL, 0))
1005 return err;
1007 else
1008 dfa->init_state_word = dfa->init_state_nl
1009 = dfa->init_state_begbuf = dfa->init_state;
1011 re_node_set_free (&init_nodes);
1012 return REG_NOERROR;
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018 DFA nodes where needed. */
1020 static void
1021 optimize_utf8 (re_dfa_t *dfa)
1023 int node, i, mb_chars = 0, has_period = 0;
1025 for (node = 0; node < dfa->nodes_len; ++node)
1026 switch (dfa->nodes[node].type)
1028 case CHARACTER:
1029 if (dfa->nodes[node].opr.c >= 0x80)
1030 mb_chars = 1;
1031 break;
1032 case ANCHOR:
1033 switch (dfa->nodes[node].opr.idx)
1035 case LINE_FIRST:
1036 case LINE_LAST:
1037 case BUF_FIRST:
1038 case BUF_LAST:
1039 break;
1040 default:
1041 /* Word anchors etc. cannot be handled. */
1042 return;
1044 break;
1045 case OP_PERIOD:
1046 has_period = 1;
1047 break;
1048 case OP_BACK_REF:
1049 case OP_ALT:
1050 case END_OF_RE:
1051 case OP_DUP_ASTERISK:
1052 case OP_OPEN_SUBEXP:
1053 case OP_CLOSE_SUBEXP:
1054 break;
1055 case COMPLEX_BRACKET:
1056 return;
1057 case SIMPLE_BRACKET:
1058 /* Just double check. The non-ASCII range starts at 0x80. */
1059 assert (0x80 % BITSET_WORD_BITS == 0);
1060 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061 if (dfa->nodes[node].opr.sbcset[i])
1062 return;
1063 break;
1064 default:
1065 abort ();
1068 if (mb_chars || has_period)
1069 for (node = 0; node < dfa->nodes_len; ++node)
1071 if (dfa->nodes[node].type == CHARACTER
1072 && dfa->nodes[node].opr.c >= 0x80)
1073 dfa->nodes[node].mb_partial = 0;
1074 else if (dfa->nodes[node].type == OP_PERIOD)
1075 dfa->nodes[node].type = OP_UTF8_PERIOD;
1078 /* The search can be in single byte locale. */
1079 dfa->mb_cur_max = 1;
1080 dfa->is_utf8 = 0;
1081 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1083 #endif
1085 /* Analyze the structure tree, and calculate "first", "next", "edest",
1086 "eclosure", and "inveclosure". */
1088 static reg_errcode_t
1089 analyze (regex_t *preg)
1091 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1092 reg_errcode_t ret;
1094 /* Allocate arrays. */
1095 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1096 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1097 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1098 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1099 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1100 || dfa->eclosures == NULL, 0))
1101 return REG_ESPACE;
1103 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1104 if (dfa->subexp_map != NULL)
1106 int i;
1107 for (i = 0; i < preg->re_nsub; i++)
1108 dfa->subexp_map[i] = i;
1109 preorder (dfa->str_tree, optimize_subexps, dfa);
1110 for (i = 0; i < preg->re_nsub; i++)
1111 if (dfa->subexp_map[i] != i)
1112 break;
1113 if (i == preg->re_nsub)
1115 free (dfa->subexp_map);
1116 dfa->subexp_map = NULL;
1120 ret = postorder (dfa->str_tree, lower_subexps, preg);
1121 if (BE (ret != REG_NOERROR, 0))
1122 return ret;
1123 ret = postorder (dfa->str_tree, calc_first, dfa);
1124 if (BE (ret != REG_NOERROR, 0))
1125 return ret;
1126 preorder (dfa->str_tree, calc_next, dfa);
1127 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1128 if (BE (ret != REG_NOERROR, 0))
1129 return ret;
1130 ret = calc_eclosure (dfa);
1131 if (BE (ret != REG_NOERROR, 0))
1132 return ret;
1134 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1135 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1136 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1137 || dfa->nbackref)
1139 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1140 if (BE (dfa->inveclosures == NULL, 0))
1141 return REG_ESPACE;
1142 ret = calc_inveclosure (dfa);
1145 return ret;
1148 /* Our parse trees are very unbalanced, so we cannot use a stack to
1149 implement parse tree visits. Instead, we use parent pointers and
1150 some hairy code in these two functions. */
1151 static reg_errcode_t
1152 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1153 void *extra)
1155 bin_tree_t *node, *prev;
1157 for (node = root; ; )
1159 /* Descend down the tree, preferably to the left (or to the right
1160 if that's the only child). */
1161 while (node->left || node->right)
1162 if (node->left)
1163 node = node->left;
1164 else
1165 node = node->right;
1169 reg_errcode_t err = fn (extra, node);
1170 if (BE (err != REG_NOERROR, 0))
1171 return err;
1172 if (node->parent == NULL)
1173 return REG_NOERROR;
1174 prev = node;
1175 node = node->parent;
1177 /* Go up while we have a node that is reached from the right. */
1178 while (node->right == prev || node->right == NULL);
1179 node = node->right;
1183 static reg_errcode_t
1184 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1185 void *extra)
1187 bin_tree_t *node;
1189 for (node = root; ; )
1191 reg_errcode_t err = fn (extra, node);
1192 if (BE (err != REG_NOERROR, 0))
1193 return err;
1195 /* Go to the left node, or up and to the right. */
1196 if (node->left)
1197 node = node->left;
1198 else
1200 bin_tree_t *prev = NULL;
1201 while (node->right == prev || node->right == NULL)
1203 prev = node;
1204 node = node->parent;
1205 if (!node)
1206 return REG_NOERROR;
1208 node = node->right;
1213 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1214 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1215 backreferences as well. Requires a preorder visit. */
1216 static reg_errcode_t
1217 optimize_subexps (void *extra, bin_tree_t *node)
1219 re_dfa_t *dfa = (re_dfa_t *) extra;
1221 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1223 int idx = node->token.opr.idx;
1224 node->token.opr.idx = dfa->subexp_map[idx];
1225 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1228 else if (node->token.type == SUBEXP
1229 && node->left && node->left->token.type == SUBEXP)
1231 int other_idx = node->left->token.opr.idx;
1233 node->left = node->left->left;
1234 if (node->left)
1235 node->left->parent = node;
1237 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1238 if (other_idx < BITSET_WORD_BITS)
1239 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1242 return REG_NOERROR;
1245 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1246 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1247 static reg_errcode_t
1248 lower_subexps (void *extra, bin_tree_t *node)
1250 regex_t *preg = (regex_t *) extra;
1251 reg_errcode_t err = REG_NOERROR;
1253 if (node->left && node->left->token.type == SUBEXP)
1255 node->left = lower_subexp (&err, preg, node->left);
1256 if (node->left)
1257 node->left->parent = node;
1259 if (node->right && node->right->token.type == SUBEXP)
1261 node->right = lower_subexp (&err, preg, node->right);
1262 if (node->right)
1263 node->right->parent = node;
1266 return err;
1269 static bin_tree_t *
1270 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1272 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1273 bin_tree_t *body = node->left;
1274 bin_tree_t *op, *cls, *tree1, *tree;
1276 if (preg->no_sub
1277 /* We do not optimize empty subexpressions, because otherwise we may
1278 have bad CONCAT nodes with NULL children. This is obviously not
1279 very common, so we do not lose much. An example that triggers
1280 this case is the sed "script" /\(\)/x. */
1281 && node->left != NULL
1282 && (node->token.opr.idx >= BITSET_WORD_BITS
1283 || !(dfa->used_bkref_map
1284 & ((bitset_word_t) 1 << node->token.opr.idx))))
1285 return node->left;
1287 /* Convert the SUBEXP node to the concatenation of an
1288 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1289 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1290 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1291 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1292 tree = create_tree (dfa, op, tree1, CONCAT);
1293 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1295 *err = REG_ESPACE;
1296 return NULL;
1299 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1300 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1301 return tree;
1304 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1305 nodes. Requires a postorder visit. */
1306 static reg_errcode_t
1307 calc_first (void *extra, bin_tree_t *node)
1309 re_dfa_t *dfa = (re_dfa_t *) extra;
1310 if (node->token.type == CONCAT)
1312 node->first = node->left->first;
1313 node->node_idx = node->left->node_idx;
1315 else
1317 node->first = node;
1318 node->node_idx = re_dfa_add_node (dfa, node->token);
1319 if (BE (node->node_idx == -1, 0))
1320 return REG_ESPACE;
1322 return REG_NOERROR;
1325 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1326 static reg_errcode_t
1327 calc_next (void *extra, bin_tree_t *node)
1329 switch (node->token.type)
1331 case OP_DUP_ASTERISK:
1332 node->left->next = node;
1333 break;
1334 case CONCAT:
1335 node->left->next = node->right->first;
1336 node->right->next = node->next;
1337 break;
1338 default:
1339 if (node->left)
1340 node->left->next = node->next;
1341 if (node->right)
1342 node->right->next = node->next;
1343 break;
1345 return REG_NOERROR;
1348 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1349 static reg_errcode_t
1350 link_nfa_nodes (void *extra, bin_tree_t *node)
1352 re_dfa_t *dfa = (re_dfa_t *) extra;
1353 int idx = node->node_idx;
1354 reg_errcode_t err = REG_NOERROR;
1356 switch (node->token.type)
1358 case CONCAT:
1359 break;
1361 case END_OF_RE:
1362 assert (node->next == NULL);
1363 break;
1365 case OP_DUP_ASTERISK:
1366 case OP_ALT:
1368 int left, right;
1369 dfa->has_plural_match = 1;
1370 if (node->left != NULL)
1371 left = node->left->first->node_idx;
1372 else
1373 left = node->next->node_idx;
1374 if (node->right != NULL)
1375 right = node->right->first->node_idx;
1376 else
1377 right = node->next->node_idx;
1378 assert (left > -1);
1379 assert (right > -1);
1380 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1382 break;
1384 case ANCHOR:
1385 case OP_OPEN_SUBEXP:
1386 case OP_CLOSE_SUBEXP:
1387 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1388 break;
1390 case OP_BACK_REF:
1391 dfa->nexts[idx] = node->next->node_idx;
1392 if (node->token.type == OP_BACK_REF)
1393 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1394 break;
1396 default:
1397 assert (!IS_EPSILON_NODE (node->token.type));
1398 dfa->nexts[idx] = node->next->node_idx;
1399 break;
1402 return err;
1405 /* Duplicate the epsilon closure of the node ROOT_NODE.
1406 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1407 to their own constraint. */
1409 static reg_errcode_t
1410 internal_function
1411 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1412 int root_node, unsigned int init_constraint)
1414 int org_node, clone_node, ret;
1415 unsigned int constraint = init_constraint;
1416 for (org_node = top_org_node, clone_node = top_clone_node;;)
1418 int org_dest, clone_dest;
1419 if (dfa->nodes[org_node].type == OP_BACK_REF)
1421 /* If the back reference epsilon-transit, its destination must
1422 also have the constraint. Then duplicate the epsilon closure
1423 of the destination of the back reference, and store it in
1424 edests of the back reference. */
1425 org_dest = dfa->nexts[org_node];
1426 re_node_set_empty (dfa->edests + clone_node);
1427 clone_dest = duplicate_node (dfa, org_dest, constraint);
1428 if (BE (clone_dest == -1, 0))
1429 return REG_ESPACE;
1430 dfa->nexts[clone_node] = dfa->nexts[org_node];
1431 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1432 if (BE (ret < 0, 0))
1433 return REG_ESPACE;
1435 else if (dfa->edests[org_node].nelem == 0)
1437 /* In case of the node can't epsilon-transit, don't duplicate the
1438 destination and store the original destination as the
1439 destination of the node. */
1440 dfa->nexts[clone_node] = dfa->nexts[org_node];
1441 break;
1443 else if (dfa->edests[org_node].nelem == 1)
1445 /* In case of the node can epsilon-transit, and it has only one
1446 destination. */
1447 org_dest = dfa->edests[org_node].elems[0];
1448 re_node_set_empty (dfa->edests + clone_node);
1449 if (dfa->nodes[org_node].type == ANCHOR)
1451 /* In case of the node has another constraint, append it. */
1452 if (org_node == root_node && clone_node != org_node)
1454 /* ...but if the node is root_node itself, it means the
1455 epsilon closure have a loop, then tie it to the
1456 destination of the root_node. */
1457 ret = re_node_set_insert (dfa->edests + clone_node,
1458 org_dest);
1459 if (BE (ret < 0, 0))
1460 return REG_ESPACE;
1461 break;
1463 constraint |= dfa->nodes[org_node].opr.ctx_type;
1465 clone_dest = duplicate_node (dfa, org_dest, constraint);
1466 if (BE (clone_dest == -1, 0))
1467 return REG_ESPACE;
1468 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1469 if (BE (ret < 0, 0))
1470 return REG_ESPACE;
1472 else /* dfa->edests[org_node].nelem == 2 */
1474 /* In case of the node can epsilon-transit, and it has two
1475 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1476 org_dest = dfa->edests[org_node].elems[0];
1477 re_node_set_empty (dfa->edests + clone_node);
1478 /* Search for a duplicated node which satisfies the constraint. */
1479 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1480 if (clone_dest == -1)
1482 /* There are no such a duplicated node, create a new one. */
1483 reg_errcode_t err;
1484 clone_dest = duplicate_node (dfa, org_dest, constraint);
1485 if (BE (clone_dest == -1, 0))
1486 return REG_ESPACE;
1487 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 if (BE (ret < 0, 0))
1489 return REG_ESPACE;
1490 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1491 root_node, constraint);
1492 if (BE (err != REG_NOERROR, 0))
1493 return err;
1495 else
1497 /* There are a duplicated node which satisfy the constraint,
1498 use it to avoid infinite loop. */
1499 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500 if (BE (ret < 0, 0))
1501 return REG_ESPACE;
1504 org_dest = dfa->edests[org_node].elems[1];
1505 clone_dest = duplicate_node (dfa, org_dest, constraint);
1506 if (BE (clone_dest == -1, 0))
1507 return REG_ESPACE;
1508 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1512 org_node = org_dest;
1513 clone_node = clone_dest;
1515 return REG_NOERROR;
1518 /* Search for a node which is duplicated from the node ORG_NODE, and
1519 satisfies the constraint CONSTRAINT. */
1521 static int
1522 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1523 unsigned int constraint)
1525 int idx;
1526 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1528 if (org_node == dfa->org_indices[idx]
1529 && constraint == dfa->nodes[idx].constraint)
1530 return idx; /* Found. */
1532 return -1; /* Not found. */
1535 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1536 Return the index of the new node, or -1 if insufficient storage is
1537 available. */
1539 static int
1540 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1542 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1543 if (BE (dup_idx != -1, 1))
1545 dfa->nodes[dup_idx].constraint = constraint;
1546 if (dfa->nodes[org_idx].type == ANCHOR)
1547 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1548 dfa->nodes[dup_idx].duplicated = 1;
1550 /* Store the index of the original node. */
1551 dfa->org_indices[dup_idx] = org_idx;
1553 return dup_idx;
1556 static reg_errcode_t
1557 calc_inveclosure (re_dfa_t *dfa)
1559 int src, idx, ret;
1560 for (idx = 0; idx < dfa->nodes_len; ++idx)
1561 re_node_set_init_empty (dfa->inveclosures + idx);
1563 for (src = 0; src < dfa->nodes_len; ++src)
1565 int *elems = dfa->eclosures[src].elems;
1566 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1568 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1569 if (BE (ret == -1, 0))
1570 return REG_ESPACE;
1574 return REG_NOERROR;
1577 /* Calculate "eclosure" for all the node in DFA. */
1579 static reg_errcode_t
1580 calc_eclosure (re_dfa_t *dfa)
1582 int node_idx, incomplete;
1583 #ifdef DEBUG
1584 assert (dfa->nodes_len > 0);
1585 #endif
1586 incomplete = 0;
1587 /* For each nodes, calculate epsilon closure. */
1588 for (node_idx = 0; ; ++node_idx)
1590 reg_errcode_t err;
1591 re_node_set eclosure_elem;
1592 if (node_idx == dfa->nodes_len)
1594 if (!incomplete)
1595 break;
1596 incomplete = 0;
1597 node_idx = 0;
1600 #ifdef DEBUG
1601 assert (dfa->eclosures[node_idx].nelem != -1);
1602 #endif
1604 /* If we have already calculated, skip it. */
1605 if (dfa->eclosures[node_idx].nelem != 0)
1606 continue;
1607 /* Calculate epsilon closure of `node_idx'. */
1608 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1609 if (BE (err != REG_NOERROR, 0))
1610 return err;
1612 if (dfa->eclosures[node_idx].nelem == 0)
1614 incomplete = 1;
1615 re_node_set_free (&eclosure_elem);
1618 return REG_NOERROR;
1621 /* Calculate epsilon closure of NODE. */
1623 static reg_errcode_t
1624 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1626 reg_errcode_t err;
1627 unsigned int constraint;
1628 int i, incomplete;
1629 re_node_set eclosure;
1630 incomplete = 0;
1631 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1632 if (BE (err != REG_NOERROR, 0))
1633 return err;
1635 /* This indicates that we are calculating this node now.
1636 We reference this value to avoid infinite loop. */
1637 dfa->eclosures[node].nelem = -1;
1639 constraint = ((dfa->nodes[node].type == ANCHOR)
1640 ? dfa->nodes[node].opr.ctx_type : 0);
1641 /* If the current node has constraints, duplicate all nodes.
1642 Since they must inherit the constraints. */
1643 if (constraint
1644 && dfa->edests[node].nelem
1645 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1647 int org_node, cur_node;
1648 org_node = cur_node = node;
1649 err = duplicate_node_closure (dfa, node, node, node, constraint);
1650 if (BE (err != REG_NOERROR, 0))
1651 return err;
1654 /* Expand each epsilon destination nodes. */
1655 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1656 for (i = 0; i < dfa->edests[node].nelem; ++i)
1658 re_node_set eclosure_elem;
1659 int edest = dfa->edests[node].elems[i];
1660 /* If calculating the epsilon closure of `edest' is in progress,
1661 return intermediate result. */
1662 if (dfa->eclosures[edest].nelem == -1)
1664 incomplete = 1;
1665 continue;
1667 /* If we haven't calculated the epsilon closure of `edest' yet,
1668 calculate now. Otherwise use calculated epsilon closure. */
1669 if (dfa->eclosures[edest].nelem == 0)
1671 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1672 if (BE (err != REG_NOERROR, 0))
1673 return err;
1675 else
1676 eclosure_elem = dfa->eclosures[edest];
1677 /* Merge the epsilon closure of `edest'. */
1678 re_node_set_merge (&eclosure, &eclosure_elem);
1679 /* If the epsilon closure of `edest' is incomplete,
1680 the epsilon closure of this node is also incomplete. */
1681 if (dfa->eclosures[edest].nelem == 0)
1683 incomplete = 1;
1684 re_node_set_free (&eclosure_elem);
1688 /* Epsilon closures include itself. */
1689 re_node_set_insert (&eclosure, node);
1690 if (incomplete && !root)
1691 dfa->eclosures[node].nelem = 0;
1692 else
1693 dfa->eclosures[node] = eclosure;
1694 *new_set = eclosure;
1695 return REG_NOERROR;
1698 /* Functions for token which are used in the parser. */
1700 /* Fetch a token from INPUT.
1701 We must not use this function inside bracket expressions. */
1703 static void
1704 internal_function
1705 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1707 re_string_skip_bytes (input, peek_token (result, input, syntax));
1710 /* Peek a token from INPUT, and return the length of the token.
1711 We must not use this function inside bracket expressions. */
1713 static int
1714 internal_function
1715 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1717 unsigned char c;
1719 if (re_string_eoi (input))
1721 token->type = END_OF_RE;
1722 return 0;
1725 c = re_string_peek_byte (input, 0);
1726 token->opr.c = c;
1728 token->word_char = 0;
1729 #ifdef RE_ENABLE_I18N
1730 token->mb_partial = 0;
1731 if (input->mb_cur_max > 1 &&
1732 !re_string_first_byte (input, re_string_cur_idx (input)))
1734 token->type = CHARACTER;
1735 token->mb_partial = 1;
1736 return 1;
1738 #endif
1739 if (c == '\\')
1741 unsigned char c2;
1742 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1744 token->type = BACK_SLASH;
1745 return 1;
1748 c2 = re_string_peek_byte_case (input, 1);
1749 token->opr.c = c2;
1750 token->type = CHARACTER;
1751 #ifdef RE_ENABLE_I18N
1752 if (input->mb_cur_max > 1)
1754 wint_t wc = re_string_wchar_at (input,
1755 re_string_cur_idx (input) + 1);
1756 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1758 else
1759 #endif
1760 token->word_char = IS_WORD_CHAR (c2) != 0;
1762 switch (c2)
1764 case '|':
1765 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1766 token->type = OP_ALT;
1767 break;
1768 case '1': case '2': case '3': case '4': case '5':
1769 case '6': case '7': case '8': case '9':
1770 if (!(syntax & RE_NO_BK_REFS))
1772 token->type = OP_BACK_REF;
1773 token->opr.idx = c2 - '1';
1775 break;
1776 case '<':
1777 if (!(syntax & RE_NO_GNU_OPS))
1779 token->type = ANCHOR;
1780 token->opr.ctx_type = WORD_FIRST;
1782 break;
1783 case '>':
1784 if (!(syntax & RE_NO_GNU_OPS))
1786 token->type = ANCHOR;
1787 token->opr.ctx_type = WORD_LAST;
1789 break;
1790 case 'b':
1791 if (!(syntax & RE_NO_GNU_OPS))
1793 token->type = ANCHOR;
1794 token->opr.ctx_type = WORD_DELIM;
1796 break;
1797 case 'B':
1798 if (!(syntax & RE_NO_GNU_OPS))
1800 token->type = ANCHOR;
1801 token->opr.ctx_type = NOT_WORD_DELIM;
1803 break;
1804 case 'w':
1805 if (!(syntax & RE_NO_GNU_OPS))
1806 token->type = OP_WORD;
1807 break;
1808 case 'W':
1809 if (!(syntax & RE_NO_GNU_OPS))
1810 token->type = OP_NOTWORD;
1811 break;
1812 case 's':
1813 if (!(syntax & RE_NO_GNU_OPS))
1814 token->type = OP_SPACE;
1815 break;
1816 case 'S':
1817 if (!(syntax & RE_NO_GNU_OPS))
1818 token->type = OP_NOTSPACE;
1819 break;
1820 case '`':
1821 if (!(syntax & RE_NO_GNU_OPS))
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = BUF_FIRST;
1826 break;
1827 case '\'':
1828 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = BUF_LAST;
1833 break;
1834 case '(':
1835 if (!(syntax & RE_NO_BK_PARENS))
1836 token->type = OP_OPEN_SUBEXP;
1837 break;
1838 case ')':
1839 if (!(syntax & RE_NO_BK_PARENS))
1840 token->type = OP_CLOSE_SUBEXP;
1841 break;
1842 case '+':
1843 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1844 token->type = OP_DUP_PLUS;
1845 break;
1846 case '?':
1847 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1848 token->type = OP_DUP_QUESTION;
1849 break;
1850 case '{':
1851 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1852 token->type = OP_OPEN_DUP_NUM;
1853 break;
1854 case '}':
1855 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1856 token->type = OP_CLOSE_DUP_NUM;
1857 break;
1858 default:
1859 break;
1861 return 2;
1864 token->type = CHARACTER;
1865 #ifdef RE_ENABLE_I18N
1866 if (input->mb_cur_max > 1)
1868 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1869 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1871 else
1872 #endif
1873 token->word_char = IS_WORD_CHAR (token->opr.c);
1875 switch (c)
1877 case '\n':
1878 if (syntax & RE_NEWLINE_ALT)
1879 token->type = OP_ALT;
1880 break;
1881 case '|':
1882 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1883 token->type = OP_ALT;
1884 break;
1885 case '*':
1886 token->type = OP_DUP_ASTERISK;
1887 break;
1888 case '+':
1889 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1890 token->type = OP_DUP_PLUS;
1891 break;
1892 case '?':
1893 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_QUESTION;
1895 break;
1896 case '{':
1897 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1898 token->type = OP_OPEN_DUP_NUM;
1899 break;
1900 case '}':
1901 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1902 token->type = OP_CLOSE_DUP_NUM;
1903 break;
1904 case '(':
1905 if (syntax & RE_NO_BK_PARENS)
1906 token->type = OP_OPEN_SUBEXP;
1907 break;
1908 case ')':
1909 if (syntax & RE_NO_BK_PARENS)
1910 token->type = OP_CLOSE_SUBEXP;
1911 break;
1912 case '[':
1913 token->type = OP_OPEN_BRACKET;
1914 break;
1915 case '.':
1916 token->type = OP_PERIOD;
1917 break;
1918 case '^':
1919 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1920 re_string_cur_idx (input) != 0)
1922 char prev = re_string_peek_byte (input, -1);
1923 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1924 break;
1926 token->type = ANCHOR;
1927 token->opr.ctx_type = LINE_FIRST;
1928 break;
1929 case '$':
1930 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1931 re_string_cur_idx (input) + 1 != re_string_length (input))
1933 re_token_t next;
1934 re_string_skip_bytes (input, 1);
1935 peek_token (&next, input, syntax);
1936 re_string_skip_bytes (input, -1);
1937 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1938 break;
1940 token->type = ANCHOR;
1941 token->opr.ctx_type = LINE_LAST;
1942 break;
1943 default:
1944 break;
1946 return 1;
1949 /* Peek a token from INPUT, and return the length of the token.
1950 We must not use this function out of bracket expressions. */
1952 static int
1953 internal_function
1954 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1956 unsigned char c;
1957 if (re_string_eoi (input))
1959 token->type = END_OF_RE;
1960 return 0;
1962 c = re_string_peek_byte (input, 0);
1963 token->opr.c = c;
1965 #ifdef RE_ENABLE_I18N
1966 if (input->mb_cur_max > 1 &&
1967 !re_string_first_byte (input, re_string_cur_idx (input)))
1969 token->type = CHARACTER;
1970 return 1;
1972 #endif /* RE_ENABLE_I18N */
1974 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1975 && re_string_cur_idx (input) + 1 < re_string_length (input))
1977 /* In this case, '\' escape a character. */
1978 unsigned char c2;
1979 re_string_skip_bytes (input, 1);
1980 c2 = re_string_peek_byte (input, 0);
1981 token->opr.c = c2;
1982 token->type = CHARACTER;
1983 return 1;
1985 if (c == '[') /* '[' is a special char in a bracket exps. */
1987 unsigned char c2;
1988 int token_len;
1989 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1990 c2 = re_string_peek_byte (input, 1);
1991 else
1992 c2 = 0;
1993 token->opr.c = c2;
1994 token_len = 2;
1995 switch (c2)
1997 case '.':
1998 token->type = OP_OPEN_COLL_ELEM;
1999 break;
2000 case '=':
2001 token->type = OP_OPEN_EQUIV_CLASS;
2002 break;
2003 case ':':
2004 if (syntax & RE_CHAR_CLASSES)
2006 token->type = OP_OPEN_CHAR_CLASS;
2007 break;
2009 /* else fall through. */
2010 default:
2011 token->type = CHARACTER;
2012 token->opr.c = c;
2013 token_len = 1;
2014 break;
2016 return token_len;
2018 switch (c)
2020 case '-':
2021 token->type = OP_CHARSET_RANGE;
2022 break;
2023 case ']':
2024 token->type = OP_CLOSE_BRACKET;
2025 break;
2026 case '^':
2027 token->type = OP_NON_MATCH_LIST;
2028 break;
2029 default:
2030 token->type = CHARACTER;
2032 return 1;
2035 /* Functions for parser. */
2037 /* Entry point of the parser.
2038 Parse the regular expression REGEXP and return the structure tree.
2039 If an error is occured, ERR is set by error code, and return NULL.
2040 This function build the following tree, from regular expression <reg_exp>:
2044 <reg_exp> EOR
2046 CAT means concatenation.
2047 EOR means end of regular expression. */
2049 static bin_tree_t *
2050 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2051 reg_errcode_t *err)
2053 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2054 bin_tree_t *tree, *eor, *root;
2055 re_token_t current_token;
2056 dfa->syntax = syntax;
2057 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2058 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2059 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2060 return NULL;
2061 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2062 if (tree != NULL)
2063 root = create_tree (dfa, tree, eor, CONCAT);
2064 else
2065 root = eor;
2066 if (BE (eor == NULL || root == NULL, 0))
2068 *err = REG_ESPACE;
2069 return NULL;
2071 return root;
2074 /* This function build the following tree, from regular expression
2075 <branch1>|<branch2>:
2079 <branch1> <branch2>
2081 ALT means alternative, which represents the operator `|'. */
2083 static bin_tree_t *
2084 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2085 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2087 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2088 bin_tree_t *tree, *branch = NULL;
2089 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2090 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2091 return NULL;
2093 while (token->type == OP_ALT)
2095 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2096 if (token->type != OP_ALT && token->type != END_OF_RE
2097 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2099 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2100 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2101 return NULL;
2103 else
2104 branch = NULL;
2105 tree = create_tree (dfa, tree, branch, OP_ALT);
2106 if (BE (tree == NULL, 0))
2108 *err = REG_ESPACE;
2109 return NULL;
2112 return tree;
2115 /* This function build the following tree, from regular expression
2116 <exp1><exp2>:
2120 <exp1> <exp2>
2122 CAT means concatenation. */
2124 static bin_tree_t *
2125 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2126 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2128 bin_tree_t *tree, *exp;
2129 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2131 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2132 return NULL;
2134 while (token->type != OP_ALT && token->type != END_OF_RE
2135 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2137 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2138 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2140 return NULL;
2142 if (tree != NULL && exp != NULL)
2144 tree = create_tree (dfa, tree, exp, CONCAT);
2145 if (tree == NULL)
2147 *err = REG_ESPACE;
2148 return NULL;
2151 else if (tree == NULL)
2152 tree = exp;
2153 /* Otherwise exp == NULL, we don't need to create new tree. */
2155 return tree;
2158 /* This function build the following tree, from regular expression a*:
2164 static bin_tree_t *
2165 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2168 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2169 bin_tree_t *tree;
2170 switch (token->type)
2172 case CHARACTER:
2173 tree = create_token_tree (dfa, NULL, NULL, token);
2174 if (BE (tree == NULL, 0))
2176 *err = REG_ESPACE;
2177 return NULL;
2179 #ifdef RE_ENABLE_I18N
2180 if (dfa->mb_cur_max > 1)
2182 while (!re_string_eoi (regexp)
2183 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2185 bin_tree_t *mbc_remain;
2186 fetch_token (token, regexp, syntax);
2187 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2188 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2189 if (BE (mbc_remain == NULL || tree == NULL, 0))
2191 *err = REG_ESPACE;
2192 return NULL;
2196 #endif
2197 break;
2198 case OP_OPEN_SUBEXP:
2199 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2200 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2201 return NULL;
2202 break;
2203 case OP_OPEN_BRACKET:
2204 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2205 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2206 return NULL;
2207 break;
2208 case OP_BACK_REF:
2209 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2211 *err = REG_ESUBREG;
2212 return NULL;
2214 dfa->used_bkref_map |= 1 << token->opr.idx;
2215 tree = create_token_tree (dfa, NULL, NULL, token);
2216 if (BE (tree == NULL, 0))
2218 *err = REG_ESPACE;
2219 return NULL;
2221 ++dfa->nbackref;
2222 dfa->has_mb_node = 1;
2223 break;
2224 case OP_OPEN_DUP_NUM:
2225 if (syntax & RE_CONTEXT_INVALID_DUP)
2227 *err = REG_BADRPT;
2228 return NULL;
2230 /* FALLTHROUGH */
2231 case OP_DUP_ASTERISK:
2232 case OP_DUP_PLUS:
2233 case OP_DUP_QUESTION:
2234 if (syntax & RE_CONTEXT_INVALID_OPS)
2236 *err = REG_BADRPT;
2237 return NULL;
2239 else if (syntax & RE_CONTEXT_INDEP_OPS)
2241 fetch_token (token, regexp, syntax);
2242 return parse_expression (regexp, preg, token, syntax, nest, err);
2244 /* else fall through */
2245 case OP_CLOSE_SUBEXP:
2246 if ((token->type == OP_CLOSE_SUBEXP) &&
2247 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2249 *err = REG_ERPAREN;
2250 return NULL;
2252 /* else fall through */
2253 case OP_CLOSE_DUP_NUM:
2254 /* We treat it as a normal character. */
2256 /* Then we can these characters as normal characters. */
2257 token->type = CHARACTER;
2258 /* mb_partial and word_char bits should be initialized already
2259 by peek_token. */
2260 tree = create_token_tree (dfa, NULL, NULL, token);
2261 if (BE (tree == NULL, 0))
2263 *err = REG_ESPACE;
2264 return NULL;
2266 break;
2267 case ANCHOR:
2268 if ((token->opr.ctx_type
2269 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2270 && dfa->word_ops_used == 0)
2271 init_word_char (dfa);
2272 if (token->opr.ctx_type == WORD_DELIM
2273 || token->opr.ctx_type == NOT_WORD_DELIM)
2275 bin_tree_t *tree_first, *tree_last;
2276 if (token->opr.ctx_type == WORD_DELIM)
2278 token->opr.ctx_type = WORD_FIRST;
2279 tree_first = create_token_tree (dfa, NULL, NULL, token);
2280 token->opr.ctx_type = WORD_LAST;
2282 else
2284 token->opr.ctx_type = INSIDE_WORD;
2285 tree_first = create_token_tree (dfa, NULL, NULL, token);
2286 token->opr.ctx_type = INSIDE_NOTWORD;
2288 tree_last = create_token_tree (dfa, NULL, NULL, token);
2289 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2290 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2292 *err = REG_ESPACE;
2293 return NULL;
2296 else
2298 tree = create_token_tree (dfa, NULL, NULL, token);
2299 if (BE (tree == NULL, 0))
2301 *err = REG_ESPACE;
2302 return NULL;
2305 /* We must return here, since ANCHORs can't be followed
2306 by repetition operators.
2307 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2308 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2309 fetch_token (token, regexp, syntax);
2310 return tree;
2311 case OP_PERIOD:
2312 tree = create_token_tree (dfa, NULL, NULL, token);
2313 if (BE (tree == NULL, 0))
2315 *err = REG_ESPACE;
2316 return NULL;
2318 if (dfa->mb_cur_max > 1)
2319 dfa->has_mb_node = 1;
2320 break;
2321 case OP_WORD:
2322 case OP_NOTWORD:
2323 tree = build_charclass_op (dfa, regexp->trans,
2324 (const unsigned char *) "alnum",
2325 (const unsigned char *) "_",
2326 token->type == OP_NOTWORD, err);
2327 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2328 return NULL;
2329 break;
2330 case OP_SPACE:
2331 case OP_NOTSPACE:
2332 tree = build_charclass_op (dfa, regexp->trans,
2333 (const unsigned char *) "space",
2334 (const unsigned char *) "",
2335 token->type == OP_NOTSPACE, err);
2336 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2337 return NULL;
2338 break;
2339 case OP_ALT:
2340 case END_OF_RE:
2341 return NULL;
2342 case BACK_SLASH:
2343 *err = REG_EESCAPE;
2344 return NULL;
2345 default:
2346 /* Must not happen? */
2347 #ifdef DEBUG
2348 assert (0);
2349 #endif
2350 return NULL;
2352 fetch_token (token, regexp, syntax);
2354 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2355 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2357 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2358 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2359 return NULL;
2360 /* In BRE consecutive duplications are not allowed. */
2361 if ((syntax & RE_CONTEXT_INVALID_DUP)
2362 && (token->type == OP_DUP_ASTERISK
2363 || token->type == OP_OPEN_DUP_NUM))
2365 *err = REG_BADRPT;
2366 return NULL;
2370 return tree;
2373 /* This function build the following tree, from regular expression
2374 (<reg_exp>):
2375 SUBEXP
2377 <reg_exp>
2380 static bin_tree_t *
2381 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2382 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2384 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2385 bin_tree_t *tree;
2386 size_t cur_nsub;
2387 cur_nsub = preg->re_nsub++;
2389 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2391 /* The subexpression may be a null string. */
2392 if (token->type == OP_CLOSE_SUBEXP)
2393 tree = NULL;
2394 else
2396 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2397 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2398 *err = REG_EPAREN;
2399 if (BE (*err != REG_NOERROR, 0))
2400 return NULL;
2403 if (cur_nsub <= '9' - '1')
2404 dfa->completed_bkref_map |= 1 << cur_nsub;
2406 tree = create_tree (dfa, tree, NULL, SUBEXP);
2407 if (BE (tree == NULL, 0))
2409 *err = REG_ESPACE;
2410 return NULL;
2412 tree->token.opr.idx = cur_nsub;
2413 return tree;
2416 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2418 static bin_tree_t *
2419 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2420 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2422 bin_tree_t *tree = NULL, *old_tree = NULL;
2423 int i, start, end, start_idx = re_string_cur_idx (regexp);
2424 re_token_t start_token = *token;
2426 if (token->type == OP_OPEN_DUP_NUM)
2428 end = 0;
2429 start = fetch_number (regexp, token, syntax);
2430 if (start == -1)
2432 if (token->type == CHARACTER && token->opr.c == ',')
2433 start = 0; /* We treat "{,m}" as "{0,m}". */
2434 else
2436 *err = REG_BADBR; /* <re>{} is invalid. */
2437 return NULL;
2440 if (BE (start != -2, 1))
2442 /* We treat "{n}" as "{n,n}". */
2443 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2444 : ((token->type == CHARACTER && token->opr.c == ',')
2445 ? fetch_number (regexp, token, syntax) : -2));
2447 if (BE (start == -2 || end == -2, 0))
2449 /* Invalid sequence. */
2450 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2452 if (token->type == END_OF_RE)
2453 *err = REG_EBRACE;
2454 else
2455 *err = REG_BADBR;
2457 return NULL;
2460 /* If the syntax bit is set, rollback. */
2461 re_string_set_index (regexp, start_idx);
2462 *token = start_token;
2463 token->type = CHARACTER;
2464 /* mb_partial and word_char bits should be already initialized by
2465 peek_token. */
2466 return elem;
2469 if (BE (end != -1 && start > end, 0))
2471 /* First number greater than second. */
2472 *err = REG_BADBR;
2473 return NULL;
2476 else
2478 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2479 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2482 fetch_token (token, regexp, syntax);
2484 if (BE (elem == NULL, 0))
2485 return NULL;
2486 if (BE (start == 0 && end == 0, 0))
2488 postorder (elem, free_tree, NULL);
2489 return NULL;
2492 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2493 if (BE (start > 0, 0))
2495 tree = elem;
2496 for (i = 2; i <= start; ++i)
2498 elem = duplicate_tree (elem, dfa);
2499 tree = create_tree (dfa, tree, elem, CONCAT);
2500 if (BE (elem == NULL || tree == NULL, 0))
2501 goto parse_dup_op_espace;
2504 if (start == end)
2505 return tree;
2507 /* Duplicate ELEM before it is marked optional. */
2508 elem = duplicate_tree (elem, dfa);
2509 old_tree = tree;
2511 else
2512 old_tree = NULL;
2514 if (elem->token.type == SUBEXP)
2515 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2517 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2518 if (BE (tree == NULL, 0))
2519 goto parse_dup_op_espace;
2521 /* This loop is actually executed only when end != -1,
2522 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2523 already created the start+1-th copy. */
2524 for (i = start + 2; i <= end; ++i)
2526 elem = duplicate_tree (elem, dfa);
2527 tree = create_tree (dfa, tree, elem, CONCAT);
2528 if (BE (elem == NULL || tree == NULL, 0))
2529 goto parse_dup_op_espace;
2531 tree = create_tree (dfa, tree, NULL, OP_ALT);
2532 if (BE (tree == NULL, 0))
2533 goto parse_dup_op_espace;
2536 if (old_tree)
2537 tree = create_tree (dfa, old_tree, tree, CONCAT);
2539 return tree;
2541 parse_dup_op_espace:
2542 *err = REG_ESPACE;
2543 return NULL;
2546 /* Size of the names for collating symbol/equivalence_class/character_class.
2547 I'm not sure, but maybe enough. */
2548 #define BRACKET_NAME_BUF_SIZE 32
2550 #ifndef _LIBC
2551 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2552 Build the range expression which starts from START_ELEM, and ends
2553 at END_ELEM. The result are written to MBCSET and SBCSET.
2554 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2555 mbcset->range_ends, is a pointer argument sinse we may
2556 update it. */
2558 static reg_errcode_t
2559 internal_function
2560 # ifdef RE_ENABLE_I18N
2561 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2562 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2563 # else /* not RE_ENABLE_I18N */
2564 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2565 bracket_elem_t *end_elem)
2566 # endif /* not RE_ENABLE_I18N */
2568 unsigned int start_ch, end_ch;
2569 /* Equivalence Classes and Character Classes can't be a range start/end. */
2570 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2571 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2573 return REG_ERANGE;
2575 /* We can handle no multi character collating elements without libc
2576 support. */
2577 if (BE ((start_elem->type == COLL_SYM
2578 && strlen ((char *) start_elem->opr.name) > 1)
2579 || (end_elem->type == COLL_SYM
2580 && strlen ((char *) end_elem->opr.name) > 1), 0))
2581 return REG_ECOLLATE;
2583 # ifdef RE_ENABLE_I18N
2585 wchar_t wc;
2586 wint_t start_wc;
2587 wint_t end_wc;
2588 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2590 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2591 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2592 : 0));
2593 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2594 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2595 : 0));
2596 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2597 ? __btowc (start_ch) : start_elem->opr.wch);
2598 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2599 ? __btowc (end_ch) : end_elem->opr.wch);
2600 if (start_wc == WEOF || end_wc == WEOF)
2601 return REG_ECOLLATE;
2602 cmp_buf[0] = start_wc;
2603 cmp_buf[4] = end_wc;
2604 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2605 return REG_ERANGE;
2607 /* Got valid collation sequence values, add them as a new entry.
2608 However, for !_LIBC we have no collation elements: if the
2609 character set is single byte, the single byte character set
2610 that we build below suffices. parse_bracket_exp passes
2611 no MBCSET if dfa->mb_cur_max == 1. */
2612 if (mbcset)
2614 /* Check the space of the arrays. */
2615 if (BE (*range_alloc == mbcset->nranges, 0))
2617 /* There is not enough space, need realloc. */
2618 wchar_t *new_array_start, *new_array_end;
2619 int new_nranges;
2621 /* +1 in case of mbcset->nranges is 0. */
2622 new_nranges = 2 * mbcset->nranges + 1;
2623 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2624 are NULL if *range_alloc == 0. */
2625 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2626 new_nranges);
2627 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2628 new_nranges);
2630 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2631 return REG_ESPACE;
2633 mbcset->range_starts = new_array_start;
2634 mbcset->range_ends = new_array_end;
2635 *range_alloc = new_nranges;
2638 mbcset->range_starts[mbcset->nranges] = start_wc;
2639 mbcset->range_ends[mbcset->nranges++] = end_wc;
2642 /* Build the table for single byte characters. */
2643 for (wc = 0; wc < SBC_MAX; ++wc)
2645 cmp_buf[2] = wc;
2646 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2647 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2648 bitset_set (sbcset, wc);
2651 # else /* not RE_ENABLE_I18N */
2653 unsigned int ch;
2654 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2655 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656 : 0));
2657 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2658 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659 : 0));
2660 if (start_ch > end_ch)
2661 return REG_ERANGE;
2662 /* Build the table for single byte characters. */
2663 for (ch = 0; ch < SBC_MAX; ++ch)
2664 if (start_ch <= ch && ch <= end_ch)
2665 bitset_set (sbcset, ch);
2667 # endif /* not RE_ENABLE_I18N */
2668 return REG_NOERROR;
2670 #endif /* not _LIBC */
2672 #ifndef _LIBC
2673 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2674 Build the collating element which is represented by NAME.
2675 The result are written to MBCSET and SBCSET.
2676 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2677 pointer argument since we may update it. */
2679 static reg_errcode_t
2680 internal_function
2681 # ifdef RE_ENABLE_I18N
2682 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2683 int *coll_sym_alloc, const unsigned char *name)
2684 # else /* not RE_ENABLE_I18N */
2685 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2686 # endif /* not RE_ENABLE_I18N */
2688 size_t name_len = strlen ((const char *) name);
2689 if (BE (name_len != 1, 0))
2690 return REG_ECOLLATE;
2691 else
2693 bitset_set (sbcset, name[0]);
2694 return REG_NOERROR;
2697 #endif /* not _LIBC */
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2700 "[[.a-a.]]" etc. */
2702 static bin_tree_t *
2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704 reg_syntax_t syntax, reg_errcode_t *err)
2706 #ifdef _LIBC
2707 const unsigned char *collseqmb;
2708 const char *collseqwc;
2709 uint32_t nrules;
2710 int32_t table_size;
2711 const int32_t *symb_table;
2712 const unsigned char *extra;
2714 /* Local function for parse_bracket_exp used in _LIBC environement.
2715 Seek the collating symbol entry correspondings to NAME.
2716 Return the index of the symbol in the SYMB_TABLE. */
2718 auto inline int32_t
2719 __attribute ((always_inline))
2720 seek_collating_symbol_entry (name, name_len)
2721 const unsigned char *name;
2722 size_t name_len;
2724 int32_t hash = elem_hash ((const char *) name, name_len);
2725 int32_t elem = hash % table_size;
2726 if (symb_table[2 * elem] != 0)
2728 int32_t second = hash % (table_size - 2) + 1;
2732 /* First compare the hashing value. */
2733 if (symb_table[2 * elem] == hash
2734 /* Compare the length of the name. */
2735 && name_len == extra[symb_table[2 * elem + 1]]
2736 /* Compare the name. */
2737 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2738 name_len) == 0)
2740 /* Yep, this is the entry. */
2741 break;
2744 /* Next entry. */
2745 elem += second;
2747 while (symb_table[2 * elem] != 0);
2749 return elem;
2752 /* Local function for parse_bracket_exp used in _LIBC environement.
2753 Look up the collation sequence value of BR_ELEM.
2754 Return the value if succeeded, UINT_MAX otherwise. */
2756 auto inline unsigned int
2757 __attribute ((always_inline))
2758 lookup_collation_sequence_value (br_elem)
2759 bracket_elem_t *br_elem;
2761 if (br_elem->type == SB_CHAR)
2764 if (MB_CUR_MAX == 1)
2766 if (nrules == 0)
2767 return collseqmb[br_elem->opr.ch];
2768 else
2770 wint_t wc = __btowc (br_elem->opr.ch);
2771 return __collseq_table_lookup (collseqwc, wc);
2774 else if (br_elem->type == MB_CHAR)
2776 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2778 else if (br_elem->type == COLL_SYM)
2780 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2781 if (nrules != 0)
2783 int32_t elem, idx;
2784 elem = seek_collating_symbol_entry (br_elem->opr.name,
2785 sym_name_len);
2786 if (symb_table[2 * elem] != 0)
2788 /* We found the entry. */
2789 idx = symb_table[2 * elem + 1];
2790 /* Skip the name of collating element name. */
2791 idx += 1 + extra[idx];
2792 /* Skip the byte sequence of the collating element. */
2793 idx += 1 + extra[idx];
2794 /* Adjust for the alignment. */
2795 idx = (idx + 3) & ~3;
2796 /* Skip the multibyte collation sequence value. */
2797 idx += sizeof (unsigned int);
2798 /* Skip the wide char sequence of the collating element. */
2799 idx += sizeof (unsigned int) *
2800 (1 + *(unsigned int *) (extra + idx));
2801 /* Return the collation sequence value. */
2802 return *(unsigned int *) (extra + idx);
2804 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2806 /* No valid character. Match it as a single byte
2807 character. */
2808 return collseqmb[br_elem->opr.name[0]];
2811 else if (sym_name_len == 1)
2812 return collseqmb[br_elem->opr.name[0]];
2814 return UINT_MAX;
2817 /* Local function for parse_bracket_exp used in _LIBC environement.
2818 Build the range expression which starts from START_ELEM, and ends
2819 at END_ELEM. The result are written to MBCSET and SBCSET.
2820 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2821 mbcset->range_ends, is a pointer argument sinse we may
2822 update it. */
2824 auto inline reg_errcode_t
2825 __attribute ((always_inline))
2826 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2827 re_charset_t *mbcset;
2828 int *range_alloc;
2829 bitset_t sbcset;
2830 bracket_elem_t *start_elem, *end_elem;
2832 unsigned int ch;
2833 uint32_t start_collseq;
2834 uint32_t end_collseq;
2836 /* Equivalence Classes and Character Classes can't be a range
2837 start/end. */
2838 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2839 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2841 return REG_ERANGE;
2843 start_collseq = lookup_collation_sequence_value (start_elem);
2844 end_collseq = lookup_collation_sequence_value (end_elem);
2845 /* Check start/end collation sequence values. */
2846 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2847 return REG_ECOLLATE;
2848 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2849 return REG_ERANGE;
2851 /* Got valid collation sequence values, add them as a new entry.
2852 However, if we have no collation elements, and the character set
2853 is single byte, the single byte character set that we
2854 build below suffices. */
2855 if (nrules > 0 || dfa->mb_cur_max > 1)
2857 /* Check the space of the arrays. */
2858 if (BE (*range_alloc == mbcset->nranges, 0))
2860 /* There is not enough space, need realloc. */
2861 uint32_t *new_array_start;
2862 uint32_t *new_array_end;
2863 int new_nranges;
2865 /* +1 in case of mbcset->nranges is 0. */
2866 new_nranges = 2 * mbcset->nranges + 1;
2867 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2868 new_nranges);
2869 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2870 new_nranges);
2872 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2873 return REG_ESPACE;
2875 mbcset->range_starts = new_array_start;
2876 mbcset->range_ends = new_array_end;
2877 *range_alloc = new_nranges;
2880 mbcset->range_starts[mbcset->nranges] = start_collseq;
2881 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2884 /* Build the table for single byte characters. */
2885 for (ch = 0; ch < SBC_MAX; ch++)
2887 uint32_t ch_collseq;
2889 if (MB_CUR_MAX == 1)
2891 if (nrules == 0)
2892 ch_collseq = collseqmb[ch];
2893 else
2894 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2895 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2896 bitset_set (sbcset, ch);
2898 return REG_NOERROR;
2901 /* Local function for parse_bracket_exp used in _LIBC environement.
2902 Build the collating element which is represented by NAME.
2903 The result are written to MBCSET and SBCSET.
2904 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2905 pointer argument sinse we may update it. */
2907 auto inline reg_errcode_t
2908 __attribute ((always_inline))
2909 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2910 re_charset_t *mbcset;
2911 int *coll_sym_alloc;
2912 bitset_t sbcset;
2913 const unsigned char *name;
2915 int32_t elem, idx;
2916 size_t name_len = strlen ((const char *) name);
2917 if (nrules != 0)
2919 elem = seek_collating_symbol_entry (name, name_len);
2920 if (symb_table[2 * elem] != 0)
2922 /* We found the entry. */
2923 idx = symb_table[2 * elem + 1];
2924 /* Skip the name of collating element name. */
2925 idx += 1 + extra[idx];
2927 else if (symb_table[2 * elem] == 0 && name_len == 1)
2929 /* No valid character, treat it as a normal
2930 character. */
2931 bitset_set (sbcset, name[0]);
2932 return REG_NOERROR;
2934 else
2935 return REG_ECOLLATE;
2937 /* Got valid collation sequence, add it as a new entry. */
2938 /* Check the space of the arrays. */
2939 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2941 /* Not enough, realloc it. */
2942 /* +1 in case of mbcset->ncoll_syms is 0. */
2943 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2944 /* Use realloc since mbcset->coll_syms is NULL
2945 if *alloc == 0. */
2946 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2947 new_coll_sym_alloc);
2948 if (BE (new_coll_syms == NULL, 0))
2949 return REG_ESPACE;
2950 mbcset->coll_syms = new_coll_syms;
2951 *coll_sym_alloc = new_coll_sym_alloc;
2953 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2954 return REG_NOERROR;
2956 else
2958 if (BE (name_len != 1, 0))
2959 return REG_ECOLLATE;
2960 else
2962 bitset_set (sbcset, name[0]);
2963 return REG_NOERROR;
2967 #endif
2969 re_token_t br_token;
2970 re_bitset_ptr_t sbcset;
2971 #ifdef RE_ENABLE_I18N
2972 re_charset_t *mbcset;
2973 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2974 int equiv_class_alloc = 0, char_class_alloc = 0;
2975 #endif /* not RE_ENABLE_I18N */
2976 int non_match = 0;
2977 bin_tree_t *work_tree;
2978 int token_len;
2979 int first_round = 1;
2980 #ifdef _LIBC
2981 collseqmb = (const unsigned char *)
2982 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2983 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2984 if (nrules)
2987 if (MB_CUR_MAX > 1)
2989 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2990 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2991 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2992 _NL_COLLATE_SYMB_TABLEMB);
2993 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2994 _NL_COLLATE_SYMB_EXTRAMB);
2996 #endif
2997 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2998 #ifdef RE_ENABLE_I18N
2999 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3000 #endif /* RE_ENABLE_I18N */
3001 #ifdef RE_ENABLE_I18N
3002 if (BE (sbcset == NULL || mbcset == NULL, 0))
3003 #else
3004 if (BE (sbcset == NULL, 0))
3005 #endif /* RE_ENABLE_I18N */
3007 *err = REG_ESPACE;
3008 return NULL;
3011 token_len = peek_token_bracket (token, regexp, syntax);
3012 if (BE (token->type == END_OF_RE, 0))
3014 *err = REG_BADPAT;
3015 goto parse_bracket_exp_free_return;
3017 if (token->type == OP_NON_MATCH_LIST)
3019 #ifdef RE_ENABLE_I18N
3020 mbcset->non_match = 1;
3021 #endif /* not RE_ENABLE_I18N */
3022 non_match = 1;
3023 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3024 bitset_set (sbcset, '\0');
3025 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3026 token_len = peek_token_bracket (token, regexp, syntax);
3027 if (BE (token->type == END_OF_RE, 0))
3029 *err = REG_BADPAT;
3030 goto parse_bracket_exp_free_return;
3034 /* We treat the first ']' as a normal character. */
3035 if (token->type == OP_CLOSE_BRACKET)
3036 token->type = CHARACTER;
3038 while (1)
3040 bracket_elem_t start_elem, end_elem;
3041 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3042 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3043 reg_errcode_t ret;
3044 int token_len2 = 0, is_range_exp = 0;
3045 re_token_t token2;
3047 start_elem.opr.name = start_name_buf;
3048 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3049 syntax, first_round);
3050 if (BE (ret != REG_NOERROR, 0))
3052 *err = ret;
3053 goto parse_bracket_exp_free_return;
3055 first_round = 0;
3057 /* Get information about the next token. We need it in any case. */
3058 token_len = peek_token_bracket (token, regexp, syntax);
3060 /* Do not check for ranges if we know they are not allowed. */
3061 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3063 if (BE (token->type == END_OF_RE, 0))
3065 *err = REG_EBRACK;
3066 goto parse_bracket_exp_free_return;
3068 if (token->type == OP_CHARSET_RANGE)
3070 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3071 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3072 if (BE (token2.type == END_OF_RE, 0))
3074 *err = REG_EBRACK;
3075 goto parse_bracket_exp_free_return;
3077 if (token2.type == OP_CLOSE_BRACKET)
3079 /* We treat the last '-' as a normal character. */
3080 re_string_skip_bytes (regexp, -token_len);
3081 token->type = CHARACTER;
3083 else
3084 is_range_exp = 1;
3088 if (is_range_exp == 1)
3090 end_elem.opr.name = end_name_buf;
3091 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3092 dfa, syntax, 1);
3093 if (BE (ret != REG_NOERROR, 0))
3095 *err = ret;
3096 goto parse_bracket_exp_free_return;
3099 token_len = peek_token_bracket (token, regexp, syntax);
3101 #ifdef _LIBC
3102 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3103 &start_elem, &end_elem);
3104 #else
3105 # ifdef RE_ENABLE_I18N
3106 *err = build_range_exp (sbcset,
3107 dfa->mb_cur_max > 1 ? mbcset : NULL,
3108 &range_alloc, &start_elem, &end_elem);
3109 # else
3110 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3111 # endif
3112 #endif /* RE_ENABLE_I18N */
3113 if (BE (*err != REG_NOERROR, 0))
3114 goto parse_bracket_exp_free_return;
3116 else
3118 switch (start_elem.type)
3120 case SB_CHAR:
3121 bitset_set (sbcset, start_elem.opr.ch);
3122 break;
3123 #ifdef RE_ENABLE_I18N
3124 case MB_CHAR:
3125 /* Check whether the array has enough space. */
3126 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3128 wchar_t *new_mbchars;
3129 /* Not enough, realloc it. */
3130 /* +1 in case of mbcset->nmbchars is 0. */
3131 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3132 /* Use realloc since array is NULL if *alloc == 0. */
3133 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3134 mbchar_alloc);
3135 if (BE (new_mbchars == NULL, 0))
3136 goto parse_bracket_exp_espace;
3137 mbcset->mbchars = new_mbchars;
3139 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3140 break;
3141 #endif /* RE_ENABLE_I18N */
3142 case EQUIV_CLASS:
3143 *err = build_equiv_class (sbcset,
3144 #ifdef RE_ENABLE_I18N
3145 mbcset, &equiv_class_alloc,
3146 #endif /* RE_ENABLE_I18N */
3147 start_elem.opr.name);
3148 if (BE (*err != REG_NOERROR, 0))
3149 goto parse_bracket_exp_free_return;
3150 break;
3151 case COLL_SYM:
3152 *err = build_collating_symbol (sbcset,
3153 #ifdef RE_ENABLE_I18N
3154 mbcset, &coll_sym_alloc,
3155 #endif /* RE_ENABLE_I18N */
3156 start_elem.opr.name);
3157 if (BE (*err != REG_NOERROR, 0))
3158 goto parse_bracket_exp_free_return;
3159 break;
3160 case CHAR_CLASS:
3161 *err = build_charclass (regexp->trans, sbcset,
3162 #ifdef RE_ENABLE_I18N
3163 mbcset, &char_class_alloc,
3164 #endif /* RE_ENABLE_I18N */
3165 start_elem.opr.name, syntax);
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3168 break;
3169 default:
3170 assert (0);
3171 break;
3174 if (BE (token->type == END_OF_RE, 0))
3176 *err = REG_EBRACK;
3177 goto parse_bracket_exp_free_return;
3179 if (token->type == OP_CLOSE_BRACKET)
3180 break;
3183 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3185 /* If it is non-matching list. */
3186 if (non_match)
3187 bitset_not (sbcset);
3189 #ifdef RE_ENABLE_I18N
3190 /* Ensure only single byte characters are set. */
3191 if (dfa->mb_cur_max > 1)
3192 bitset_mask (sbcset, dfa->sb_char);
3194 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3195 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3196 || mbcset->non_match)))
3198 bin_tree_t *mbc_tree;
3199 int sbc_idx;
3200 /* Build a tree for complex bracket. */
3201 dfa->has_mb_node = 1;
3202 br_token.type = COMPLEX_BRACKET;
3203 br_token.opr.mbcset = mbcset;
3204 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205 if (BE (mbc_tree == NULL, 0))
3206 goto parse_bracket_exp_espace;
3207 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3208 if (sbcset[sbc_idx])
3209 break;
3210 /* If there are no bits set in sbcset, there is no point
3211 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3212 if (sbc_idx < BITSET_WORDS)
3214 /* Build a tree for simple bracket. */
3215 br_token.type = SIMPLE_BRACKET;
3216 br_token.opr.sbcset = sbcset;
3217 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3221 /* Then join them by ALT node. */
3222 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3223 if (BE (work_tree == NULL, 0))
3224 goto parse_bracket_exp_espace;
3226 else
3228 re_free (sbcset);
3229 work_tree = mbc_tree;
3232 else
3233 #endif /* not RE_ENABLE_I18N */
3235 #ifdef RE_ENABLE_I18N
3236 free_charset (mbcset);
3237 #endif
3238 /* Build a tree for simple bracket. */
3239 br_token.type = SIMPLE_BRACKET;
3240 br_token.opr.sbcset = sbcset;
3241 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3242 if (BE (work_tree == NULL, 0))
3243 goto parse_bracket_exp_espace;
3245 return work_tree;
3247 parse_bracket_exp_espace:
3248 *err = REG_ESPACE;
3249 parse_bracket_exp_free_return:
3250 re_free (sbcset);
3251 #ifdef RE_ENABLE_I18N
3252 free_charset (mbcset);
3253 #endif /* RE_ENABLE_I18N */
3254 return NULL;
3257 /* Parse an element in the bracket expression. */
3259 static reg_errcode_t
3260 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3261 re_token_t *token, int token_len, re_dfa_t *dfa,
3262 reg_syntax_t syntax, int accept_hyphen)
3264 #ifdef RE_ENABLE_I18N
3265 int cur_char_size;
3266 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3267 if (cur_char_size > 1)
3269 elem->type = MB_CHAR;
3270 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3271 re_string_skip_bytes (regexp, cur_char_size);
3272 return REG_NOERROR;
3274 #endif /* RE_ENABLE_I18N */
3275 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3276 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3277 || token->type == OP_OPEN_EQUIV_CLASS)
3278 return parse_bracket_symbol (elem, regexp, token);
3279 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3281 /* A '-' must only appear as anything but a range indicator before
3282 the closing bracket. Everything else is an error. */
3283 re_token_t token2;
3284 (void) peek_token_bracket (&token2, regexp, syntax);
3285 if (token2.type != OP_CLOSE_BRACKET)
3286 /* The actual error value is not standardized since this whole
3287 case is undefined. But ERANGE makes good sense. */
3288 return REG_ERANGE;
3290 elem->type = SB_CHAR;
3291 elem->opr.ch = token->opr.c;
3292 return REG_NOERROR;
3295 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3296 such as [:<character_class>:], [.<collating_element>.], and
3297 [=<equivalent_class>=]. */
3299 static reg_errcode_t
3300 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3301 re_token_t *token)
3303 unsigned char ch, delim = token->opr.c;
3304 int i = 0;
3305 if (re_string_eoi(regexp))
3306 return REG_EBRACK;
3307 for (;; ++i)
3309 if (i >= BRACKET_NAME_BUF_SIZE)
3310 return REG_EBRACK;
3311 if (token->type == OP_OPEN_CHAR_CLASS)
3312 ch = re_string_fetch_byte_case (regexp);
3313 else
3314 ch = re_string_fetch_byte (regexp);
3315 if (re_string_eoi(regexp))
3316 return REG_EBRACK;
3317 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3318 break;
3319 elem->opr.name[i] = ch;
3321 re_string_skip_bytes (regexp, 1);
3322 elem->opr.name[i] = '\0';
3323 switch (token->type)
3325 case OP_OPEN_COLL_ELEM:
3326 elem->type = COLL_SYM;
3327 break;
3328 case OP_OPEN_EQUIV_CLASS:
3329 elem->type = EQUIV_CLASS;
3330 break;
3331 case OP_OPEN_CHAR_CLASS:
3332 elem->type = CHAR_CLASS;
3333 break;
3334 default:
3335 break;
3337 return REG_NOERROR;
3340 /* Helper function for parse_bracket_exp.
3341 Build the equivalence class which is represented by NAME.
3342 The result are written to MBCSET and SBCSET.
3343 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3344 is a pointer argument sinse we may update it. */
3346 static reg_errcode_t
3347 #ifdef RE_ENABLE_I18N
3348 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3349 int *equiv_class_alloc, const unsigned char *name)
3350 #else /* not RE_ENABLE_I18N */
3351 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3352 #endif /* not RE_ENABLE_I18N */
3354 #ifdef _LIBC
3355 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3356 if (nrules != 0)
3358 const int32_t *table, *indirect;
3359 const unsigned char *weights, *extra, *cp;
3360 unsigned char char_buf[2];
3361 int32_t idx1, idx2;
3362 unsigned int ch;
3363 size_t len;
3364 /* This #include defines a local function! */
3365 # include <locale/weight.h>
3366 /* Calculate the index for equivalence class. */
3367 cp = name;
3368 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3369 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3370 _NL_COLLATE_WEIGHTMB);
3371 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3372 _NL_COLLATE_EXTRAMB);
3373 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3374 _NL_COLLATE_INDIRECTMB);
3375 idx1 = findidx (&cp);
3376 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3377 /* This isn't a valid character. */
3378 return REG_ECOLLATE;
3380 /* Build single byte matcing table for this equivalence class. */
3381 char_buf[1] = (unsigned char) '\0';
3382 len = weights[idx1];
3383 for (ch = 0; ch < SBC_MAX; ++ch)
3385 char_buf[0] = ch;
3386 cp = char_buf;
3387 idx2 = findidx (&cp);
3389 idx2 = table[ch];
3391 if (idx2 == 0)
3392 /* This isn't a valid character. */
3393 continue;
3394 if (len == weights[idx2])
3396 int cnt = 0;
3397 while (cnt <= len &&
3398 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3399 ++cnt;
3401 if (cnt > len)
3402 bitset_set (sbcset, ch);
3405 /* Check whether the array has enough space. */
3406 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3408 /* Not enough, realloc it. */
3409 /* +1 in case of mbcset->nequiv_classes is 0. */
3410 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3411 /* Use realloc since the array is NULL if *alloc == 0. */
3412 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3413 int32_t,
3414 new_equiv_class_alloc);
3415 if (BE (new_equiv_classes == NULL, 0))
3416 return REG_ESPACE;
3417 mbcset->equiv_classes = new_equiv_classes;
3418 *equiv_class_alloc = new_equiv_class_alloc;
3420 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3422 else
3423 #endif /* _LIBC */
3425 if (BE (strlen ((const char *) name) != 1, 0))
3426 return REG_ECOLLATE;
3427 bitset_set (sbcset, *name);
3429 return REG_NOERROR;
3432 /* Helper function for parse_bracket_exp.
3433 Build the character class which is represented by NAME.
3434 The result are written to MBCSET and SBCSET.
3435 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3436 is a pointer argument sinse we may update it. */
3438 static reg_errcode_t
3439 #ifdef RE_ENABLE_I18N
3440 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3441 re_charset_t *mbcset, int *char_class_alloc,
3442 const unsigned char *class_name, reg_syntax_t syntax)
3443 #else /* not RE_ENABLE_I18N */
3444 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3445 const unsigned char *class_name, reg_syntax_t syntax)
3446 #endif /* not RE_ENABLE_I18N */
3448 int i;
3449 const char *name = (const char *) class_name;
3451 /* In case of REG_ICASE "upper" and "lower" match the both of
3452 upper and lower cases. */
3453 if ((syntax & RE_ICASE)
3454 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3455 name = "alpha";
3457 #ifdef RE_ENABLE_I18N
3458 /* Check the space of the arrays. */
3459 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3461 /* Not enough, realloc it. */
3462 /* +1 in case of mbcset->nchar_classes is 0. */
3463 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3464 /* Use realloc since array is NULL if *alloc == 0. */
3465 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3466 new_char_class_alloc);
3467 if (BE (new_char_classes == NULL, 0))
3468 return REG_ESPACE;
3469 mbcset->char_classes = new_char_classes;
3470 *char_class_alloc = new_char_class_alloc;
3472 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3473 #endif /* RE_ENABLE_I18N */
3475 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3476 do { \
3477 if (BE (trans != NULL, 0)) \
3479 for (i = 0; i < SBC_MAX; ++i) \
3480 if (ctype_func (i)) \
3481 bitset_set (sbcset, trans[i]); \
3483 else \
3485 for (i = 0; i < SBC_MAX; ++i) \
3486 if (ctype_func (i)) \
3487 bitset_set (sbcset, i); \
3489 } while (0)
3491 if (strcmp (name, "alnum") == 0)
3492 BUILD_CHARCLASS_LOOP (isalnum);
3493 else if (strcmp (name, "cntrl") == 0)
3494 BUILD_CHARCLASS_LOOP (iscntrl);
3495 else if (strcmp (name, "lower") == 0)
3496 BUILD_CHARCLASS_LOOP (islower);
3497 else if (strcmp (name, "space") == 0)
3498 BUILD_CHARCLASS_LOOP (isspace);
3499 else if (strcmp (name, "alpha") == 0)
3500 BUILD_CHARCLASS_LOOP (isalpha);
3501 else if (strcmp (name, "digit") == 0)
3502 BUILD_CHARCLASS_LOOP (isdigit);
3503 else if (strcmp (name, "print") == 0)
3504 BUILD_CHARCLASS_LOOP (isprint);
3505 else if (strcmp (name, "upper") == 0)
3506 BUILD_CHARCLASS_LOOP (isupper);
3507 else if (strcmp (name, "blank") == 0)
3508 BUILD_CHARCLASS_LOOP (isblank);
3509 else if (strcmp (name, "graph") == 0)
3510 BUILD_CHARCLASS_LOOP (isgraph);
3511 else if (strcmp (name, "punct") == 0)
3512 BUILD_CHARCLASS_LOOP (ispunct);
3513 else if (strcmp (name, "xdigit") == 0)
3514 BUILD_CHARCLASS_LOOP (isxdigit);
3515 else
3516 return REG_ECTYPE;
3518 return REG_NOERROR;
3521 static bin_tree_t *
3522 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3523 const unsigned char *class_name,
3524 const unsigned char *extra, int non_match,
3525 reg_errcode_t *err)
3527 re_bitset_ptr_t sbcset;
3528 #ifdef RE_ENABLE_I18N
3529 re_charset_t *mbcset;
3530 int alloc = 0;
3531 #endif /* not RE_ENABLE_I18N */
3532 reg_errcode_t ret;
3533 re_token_t br_token;
3534 bin_tree_t *tree;
3536 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3537 #ifdef RE_ENABLE_I18N
3538 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3539 #endif /* RE_ENABLE_I18N */
3541 #ifdef RE_ENABLE_I18N
3542 if (BE (sbcset == NULL || mbcset == NULL, 0))
3543 #else /* not RE_ENABLE_I18N */
3544 if (BE (sbcset == NULL, 0))
3545 #endif /* not RE_ENABLE_I18N */
3547 *err = REG_ESPACE;
3548 return NULL;
3551 if (non_match)
3553 #ifdef RE_ENABLE_I18N
3555 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3556 bitset_set(cset->sbcset, '\0');
3558 mbcset->non_match = 1;
3559 #endif /* not RE_ENABLE_I18N */
3562 /* We don't care the syntax in this case. */
3563 ret = build_charclass (trans, sbcset,
3564 #ifdef RE_ENABLE_I18N
3565 mbcset, &alloc,
3566 #endif /* RE_ENABLE_I18N */
3567 class_name, 0);
3569 if (BE (ret != REG_NOERROR, 0))
3571 re_free (sbcset);
3572 #ifdef RE_ENABLE_I18N
3573 free_charset (mbcset);
3574 #endif /* RE_ENABLE_I18N */
3575 *err = ret;
3576 return NULL;
3578 /* \w match '_' also. */
3579 for (; *extra; extra++)
3580 bitset_set (sbcset, *extra);
3582 /* If it is non-matching list. */
3583 if (non_match)
3584 bitset_not (sbcset);
3586 #ifdef RE_ENABLE_I18N
3587 /* Ensure only single byte characters are set. */
3588 if (dfa->mb_cur_max > 1)
3589 bitset_mask (sbcset, dfa->sb_char);
3590 #endif
3592 /* Build a tree for simple bracket. */
3593 br_token.type = SIMPLE_BRACKET;
3594 br_token.opr.sbcset = sbcset;
3595 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3596 if (BE (tree == NULL, 0))
3597 goto build_word_op_espace;
3599 #ifdef RE_ENABLE_I18N
3600 if (dfa->mb_cur_max > 1)
3602 bin_tree_t *mbc_tree;
3603 /* Build a tree for complex bracket. */
3604 br_token.type = COMPLEX_BRACKET;
3605 br_token.opr.mbcset = mbcset;
3606 dfa->has_mb_node = 1;
3607 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3608 if (BE (mbc_tree == NULL, 0))
3609 goto build_word_op_espace;
3610 /* Then join them by ALT node. */
3611 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3612 if (BE (mbc_tree != NULL, 1))
3613 return tree;
3615 else
3617 free_charset (mbcset);
3618 return tree;
3620 #else /* not RE_ENABLE_I18N */
3621 return tree;
3622 #endif /* not RE_ENABLE_I18N */
3624 build_word_op_espace:
3625 re_free (sbcset);
3626 #ifdef RE_ENABLE_I18N
3627 free_charset (mbcset);
3628 #endif /* RE_ENABLE_I18N */
3629 *err = REG_ESPACE;
3630 return NULL;
3633 /* This is intended for the expressions like "a{1,3}".
3634 Fetch a number from `input', and return the number.
3635 Return -1, if the number field is empty like "{,1}".
3636 Return -2, If an error is occured. */
3638 static int
3639 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3641 int num = -1;
3642 unsigned char c;
3643 while (1)
3645 fetch_token (token, input, syntax);
3646 c = token->opr.c;
3647 if (BE (token->type == END_OF_RE, 0))
3648 return -2;
3649 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3650 break;
3651 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3652 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3653 num = (num > RE_DUP_MAX) ? -2 : num;
3655 return num;
3658 #ifdef RE_ENABLE_I18N
3659 static void
3660 free_charset (re_charset_t *cset)
3662 re_free (cset->mbchars);
3663 # ifdef _LIBC
3664 re_free (cset->coll_syms);
3665 re_free (cset->equiv_classes);
3666 re_free (cset->range_starts);
3667 re_free (cset->range_ends);
3668 # endif
3669 re_free (cset->char_classes);
3670 re_free (cset);
3672 #endif /* RE_ENABLE_I18N */
3674 /* Functions for binary tree operation. */
3676 /* Create a tree node. */
3678 static bin_tree_t *
3679 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3680 re_token_type_t type)
3682 re_token_t t;
3683 t.type = type;
3684 return create_token_tree (dfa, left, right, &t);
3687 static bin_tree_t *
3688 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3689 const re_token_t *token)
3691 bin_tree_t *tree;
3692 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3694 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3696 if (storage == NULL)
3697 return NULL;
3698 storage->next = dfa->str_tree_storage;
3699 dfa->str_tree_storage = storage;
3700 dfa->str_tree_storage_idx = 0;
3702 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3704 tree->parent = NULL;
3705 tree->left = left;
3706 tree->right = right;
3707 tree->token = *token;
3708 tree->token.duplicated = 0;
3709 tree->token.opt_subexp = 0;
3710 tree->first = NULL;
3711 tree->next = NULL;
3712 tree->node_idx = -1;
3714 if (left != NULL)
3715 left->parent = tree;
3716 if (right != NULL)
3717 right->parent = tree;
3718 return tree;
3721 /* Mark the tree SRC as an optional subexpression.
3722 To be called from preorder or postorder. */
3724 static reg_errcode_t
3725 mark_opt_subexp (void *extra, bin_tree_t *node)
3727 int idx = (int) (long) extra;
3728 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3729 node->token.opt_subexp = 1;
3731 return REG_NOERROR;
3734 /* Free the allocated memory inside NODE. */
3736 static void
3737 free_token (re_token_t *node)
3739 #ifdef RE_ENABLE_I18N
3740 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3741 free_charset (node->opr.mbcset);
3742 else
3743 #endif /* RE_ENABLE_I18N */
3744 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3745 re_free (node->opr.sbcset);
3748 /* Worker function for tree walking. Free the allocated memory inside NODE
3749 and its children. */
3751 static reg_errcode_t
3752 free_tree (void *extra, bin_tree_t *node)
3754 free_token (&node->token);
3755 return REG_NOERROR;
3759 /* Duplicate the node SRC, and return new node. This is a preorder
3760 visit similar to the one implemented by the generic visitor, but
3761 we need more infrastructure to maintain two parallel trees --- so,
3762 it's easier to duplicate. */
3764 static bin_tree_t *
3765 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3767 const bin_tree_t *node;
3768 bin_tree_t *dup_root;
3769 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3771 for (node = root; ; )
3773 /* Create a new tree and link it back to the current parent. */
3774 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3775 if (*p_new == NULL)
3776 return NULL;
3777 (*p_new)->parent = dup_node;
3778 (*p_new)->token.duplicated = 1;
3779 dup_node = *p_new;
3781 /* Go to the left node, or up and to the right. */
3782 if (node->left)
3784 node = node->left;
3785 p_new = &dup_node->left;
3787 else
3789 const bin_tree_t *prev = NULL;
3790 while (node->right == prev || node->right == NULL)
3792 prev = node;
3793 node = node->parent;
3794 dup_node = dup_node->parent;
3795 if (!node)
3796 return dup_root;
3798 node = node->right;
3799 p_new = &dup_node->right;