Update
[glibc.git] / posix / regcomp.c
blob5de5bf725ac66f00400d9d82d6e03b449618ff97
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 struct subexp_optimize
38 re_dfa_t *dfa;
39 re_token_t *nodes;
40 int no_sub, re_nsub;
42 static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
43 bin_tree_t *node, int sidx, int depth);
44 static reg_errcode_t analyze (re_dfa_t *dfa);
45 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
46 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
47 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
48 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
49 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
50 int top_clone_node, int root_node,
51 unsigned int constraint);
52 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
53 unsigned int constraint);
54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 int node, int root);
59 static void calc_inveclosure (re_dfa_t *dfa);
60 static int fetch_number (re_string_t *input, re_token_t *token,
61 reg_syntax_t syntax);
62 static void fetch_token (re_token_t *result, re_string_t *input,
63 reg_syntax_t syntax);
64 static int peek_token (re_token_t *token, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
69 reg_syntax_t syntax, reg_errcode_t *err);
70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
80 re_token_t *token, reg_syntax_t syntax,
81 int nest, reg_errcode_t *err);
82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
83 re_dfa_t *dfa, re_token_t *token,
84 reg_syntax_t syntax, reg_errcode_t *err);
85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
86 re_token_t *token, reg_syntax_t syntax,
87 reg_errcode_t *err);
88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token, int token_len,
91 re_dfa_t *dfa,
92 reg_syntax_t syntax,
93 int accept_hyphen);
94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
95 re_string_t *regexp,
96 re_token_t *token);
97 #ifndef _LIBC
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 re_charset_t *mbcset, int *range_alloc,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *coll_sym_alloc,
106 const unsigned char *name);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
109 bracket_elem_t *start_elem,
110 bracket_elem_t *end_elem);
111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
112 const unsigned char *name);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
117 re_charset_t *mbcset,
118 int *equiv_class_alloc,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 re_charset_t *mbcset,
123 int *char_class_alloc,
124 const unsigned char *class_name,
125 reg_syntax_t syntax);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
128 const unsigned char *name);
129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
130 re_bitset_ptr_t sbcset,
131 const unsigned char *class_name,
132 reg_syntax_t syntax);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
135 unsigned RE_TRANSLATE_TYPE trans,
136 const unsigned char *class_name,
137 const unsigned char *extra,
138 int non_match, reg_errcode_t *err);
139 static bin_tree_t *create_tree (re_dfa_t *dfa,
140 bin_tree_t *left, bin_tree_t *right,
141 re_token_type_t type, int index);
142 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
143 bin_tree_t *left, bin_tree_t *right,
144 const re_token_t *token)
145 __attribute ((noinline));
146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
147 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
148 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid[] attribute_hidden =
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
159 "\0"
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
162 "\0"
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
165 "\0"
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
168 "\0"
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
171 "\0"
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
174 "\0"
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
177 "\0"
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
180 "\0"
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
183 "\0"
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
186 "\0"
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
189 "\0"
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
192 "\0"
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
195 "\0"
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
198 "\0"
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
201 "\0"
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
204 "\0"
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx[] attribute_hidden =
211 REG_NOERROR_IDX,
212 REG_NOMATCH_IDX,
213 REG_BADPAT_IDX,
214 REG_ECOLLATE_IDX,
215 REG_ECTYPE_IDX,
216 REG_EESCAPE_IDX,
217 REG_ESUBREG_IDX,
218 REG_EBRACK_IDX,
219 REG_EPAREN_IDX,
220 REG_EBRACE_IDX,
221 REG_BADBR_IDX,
222 REG_ERANGE_IDX,
223 REG_ESPACE_IDX,
224 REG_BADRPT_IDX,
225 REG_EEND_IDX,
226 REG_ESIZE_IDX,
227 REG_ERPAREN_IDX
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
239 const char *
240 re_compile_pattern (pattern, length, bufp)
241 const char *pattern;
242 size_t length;
243 struct re_pattern_buffer *bufp;
245 reg_errcode_t ret;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
252 /* Match anchors at newline. */
253 bufp->newline_anchor = 1;
255 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
257 if (!ret)
258 return NULL;
259 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
261 #ifdef _LIBC
262 weak_alias (__re_compile_pattern, re_compile_pattern)
263 #endif
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
280 reg_syntax_t
281 re_set_syntax (syntax)
282 reg_syntax_t syntax;
284 reg_syntax_t ret = re_syntax_options;
286 re_syntax_options = syntax;
287 return ret;
289 #ifdef _LIBC
290 weak_alias (__re_set_syntax, re_set_syntax)
291 #endif
294 re_compile_fastmap (bufp)
295 struct re_pattern_buffer *bufp;
297 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
298 char *fastmap = bufp->fastmap;
300 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
301 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
302 if (dfa->init_state != dfa->init_state_word)
303 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
304 if (dfa->init_state != dfa->init_state_nl)
305 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
306 if (dfa->init_state != dfa->init_state_begbuf)
307 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
308 bufp->fastmap_accurate = 1;
309 return 0;
311 #ifdef _LIBC
312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
313 #endif
315 static inline void
316 __attribute ((always_inline))
317 re_set_fastmap (char *fastmap, int icase, int ch)
319 fastmap[ch] = 1;
320 if (icase)
321 fastmap[tolower (ch)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
327 static void
328 re_compile_fastmap_iter (bufp, init_state, fastmap)
329 regex_t *bufp;
330 const re_dfastate_t *init_state;
331 char *fastmap;
333 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
334 int node_cnt;
335 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
336 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
338 int node = init_state->nodes.elems[node_cnt];
339 re_token_type_t type = dfa->nodes[node].type;
341 if (type == CHARACTER)
343 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
347 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
348 wchar_t wc;
349 mbstate_t state;
351 p = buf;
352 *p++ = dfa->nodes[node].opr.c;
353 while (++node < dfa->nodes_len
354 && dfa->nodes[node].type == CHARACTER
355 && dfa->nodes[node].mb_partial)
356 *p++ = dfa->nodes[node].opr.c;
357 memset (&state, 0, sizeof (state));
358 if (mbrtowc (&wc, (const char *) buf, p - buf,
359 &state) == p - buf
360 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
361 re_set_fastmap (fastmap, 0, buf[0]);
363 #endif
365 else if (type == SIMPLE_BRACKET)
367 int i, j, ch;
368 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369 for (j = 0; j < UINT_BITS; ++j, ++ch)
370 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371 re_set_fastmap (fastmap, icase, ch);
373 #ifdef RE_ENABLE_I18N
374 else if (type == COMPLEX_BRACKET)
376 int i;
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379 || cset->nranges || cset->nchar_classes)
381 # ifdef _LIBC
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
390 int j, ch;
391 const int32_t *table = (const int32_t *)
392 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394 for (j = 0; j < UINT_BITS; ++j, ++ch)
395 if (table[ch] < 0)
396 re_set_fastmap (fastmap, icase, ch);
398 # else
399 if (dfa->mb_cur_max > 1)
400 for (i = 0; i < SBC_MAX; ++i)
401 if (__btowc (i) == WEOF)
402 re_set_fastmap (fastmap, icase, i);
403 # endif /* not _LIBC */
405 for (i = 0; i < cset->nmbchars; ++i)
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 __wcrtomb (buf, cset->mbchars[i], &state);
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
415 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
419 #endif /* RE_ENABLE_I18N */
420 else if (type == OP_PERIOD
421 #ifdef RE_ENABLE_I18N
422 || type == OP_UTF8_PERIOD
423 #endif /* RE_ENABLE_I18N */
424 || type == END_OF_RE)
426 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
427 if (type == END_OF_RE)
428 bufp->can_be_null = 1;
429 return;
434 /* Entry point for POSIX code. */
435 /* regcomp takes a regular expression as a string and compiles it.
437 PREG is a regex_t *. We do not expect any fields to be initialized,
438 since POSIX says we shouldn't. Thus, we set
440 `buffer' to the compiled pattern;
441 `used' to the length of the compiled pattern;
442 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
443 REG_EXTENDED bit in CFLAGS is set; otherwise, to
444 RE_SYNTAX_POSIX_BASIC;
445 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
446 `fastmap' to an allocated space for the fastmap;
447 `fastmap_accurate' to zero;
448 `re_nsub' to the number of subexpressions in PATTERN.
450 PATTERN is the address of the pattern string.
452 CFLAGS is a series of bits which affect compilation.
454 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
455 use POSIX basic syntax.
457 If REG_NEWLINE is set, then . and [^...] don't match newline.
458 Also, regexec will try a match beginning after every newline.
460 If REG_ICASE is set, then we considers upper- and lowercase
461 versions of letters to be equivalent when matching.
463 If REG_NOSUB is set, then when PREG is passed to regexec, that
464 routine will report only success or failure, and nothing about the
465 registers.
467 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
468 the return codes and their meanings.) */
471 regcomp (preg, pattern, cflags)
472 regex_t *__restrict preg;
473 const char *__restrict pattern;
474 int cflags;
476 reg_errcode_t ret;
477 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
478 : RE_SYNTAX_POSIX_BASIC);
480 preg->buffer = NULL;
481 preg->allocated = 0;
482 preg->used = 0;
484 /* Try to allocate space for the fastmap. */
485 preg->fastmap = re_malloc (char, SBC_MAX);
486 if (BE (preg->fastmap == NULL, 0))
487 return REG_ESPACE;
489 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491 /* If REG_NEWLINE is set, newlines are treated differently. */
492 if (cflags & REG_NEWLINE)
493 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
494 syntax &= ~RE_DOT_NEWLINE;
495 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
496 /* It also changes the matching behavior. */
497 preg->newline_anchor = 1;
499 else
500 preg->newline_anchor = 0;
501 preg->no_sub = !!(cflags & REG_NOSUB);
502 preg->translate = NULL;
504 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506 /* POSIX doesn't distinguish between an unmatched open-group and an
507 unmatched close-group: both are REG_EPAREN. */
508 if (ret == REG_ERPAREN)
509 ret = REG_EPAREN;
511 /* We have already checked preg->fastmap != NULL. */
512 if (BE (ret == REG_NOERROR, 1))
513 /* Compute the fastmap now, since regexec cannot modify the pattern
514 buffer. This function never fails in this implementation. */
515 (void) re_compile_fastmap (preg);
516 else
518 /* Some error occurred while compiling the expression. */
519 re_free (preg->fastmap);
520 preg->fastmap = NULL;
523 return (int) ret;
525 #ifdef _LIBC
526 weak_alias (__regcomp, regcomp)
527 #endif
529 /* Returns a message corresponding to an error code, ERRCODE, returned
530 from either regcomp or regexec. We don't use PREG here. */
532 size_t
533 regerror (errcode, preg, errbuf, errbuf_size)
534 int errcode;
535 const regex_t *preg;
536 char *errbuf;
537 size_t errbuf_size;
539 const char *msg;
540 size_t msg_size;
542 if (BE (errcode < 0
543 || errcode >= (int) (sizeof (__re_error_msgid_idx)
544 / sizeof (__re_error_msgid_idx[0])), 0))
545 /* Only error codes returned by the rest of the code should be passed
546 to this routine. If we are given anything else, or if other regex
547 code generates an invalid error code, then the program has a bug.
548 Dump core so we can fix it. */
549 abort ();
551 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
553 msg_size = strlen (msg) + 1; /* Includes the null. */
555 if (BE (errbuf_size != 0, 1))
557 if (BE (msg_size > errbuf_size, 0))
559 #if defined HAVE_MEMPCPY || defined _LIBC
560 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
561 #else
562 memcpy (errbuf, msg, errbuf_size - 1);
563 errbuf[errbuf_size - 1] = 0;
564 #endif
566 else
567 memcpy (errbuf, msg, msg_size);
570 return msg_size;
572 #ifdef _LIBC
573 weak_alias (__regerror, regerror)
574 #endif
577 #ifdef RE_ENABLE_I18N
578 /* This static array is used for the map to single-byte characters when
579 UTF-8 is used. Otherwise we would allocate memory just to initialize
580 it the same all the time. UTF-8 is the preferred encoding so this is
581 a worthwhile optimization. */
582 static const bitset utf8_sb_map =
584 /* Set the first 128 bits. */
585 # if UINT_MAX == 0xffffffff
586 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
587 # else
588 # error "Add case for new unsigned int size"
589 # endif
591 #endif
594 static void
595 free_dfa_content (re_dfa_t *dfa)
597 int i, j;
599 if (dfa->nodes)
600 for (i = 0; i < dfa->nodes_len; ++i)
602 re_token_t *node = dfa->nodes + i;
603 #ifdef RE_ENABLE_I18N
604 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
605 free_charset (node->opr.mbcset);
606 else
607 #endif /* RE_ENABLE_I18N */
608 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
609 re_free (node->opr.sbcset);
611 re_free (dfa->nexts);
612 for (i = 0; i < dfa->nodes_len; ++i)
614 if (dfa->eclosures != NULL)
615 re_node_set_free (dfa->eclosures + i);
616 if (dfa->inveclosures != NULL)
617 re_node_set_free (dfa->inveclosures + i);
618 if (dfa->edests != NULL)
619 re_node_set_free (dfa->edests + i);
621 re_free (dfa->edests);
622 re_free (dfa->eclosures);
623 re_free (dfa->inveclosures);
624 re_free (dfa->nodes);
626 if (dfa->state_table)
627 for (i = 0; i <= dfa->state_hash_mask; ++i)
629 struct re_state_table_entry *entry = dfa->state_table + i;
630 for (j = 0; j < entry->num; ++j)
632 re_dfastate_t *state = entry->array[j];
633 free_state (state);
635 re_free (entry->array);
637 re_free (dfa->state_table);
638 #ifdef RE_ENABLE_I18N
639 if (dfa->sb_char != utf8_sb_map)
640 re_free (dfa->sb_char);
641 #endif
642 re_free (dfa->subexp_map);
643 #ifdef DEBUG
644 re_free (dfa->re_str);
645 #endif
647 re_free (dfa);
651 /* Free dynamically allocated space used by PREG. */
653 void
654 regfree (preg)
655 regex_t *preg;
657 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
658 if (BE (dfa != NULL, 1))
659 free_dfa_content (dfa);
660 preg->buffer = NULL;
661 preg->allocated = 0;
663 re_free (preg->fastmap);
664 preg->fastmap = NULL;
666 re_free (preg->translate);
667 preg->translate = NULL;
669 #ifdef _LIBC
670 weak_alias (__regfree, regfree)
671 #endif
673 /* Entry points compatible with 4.2 BSD regex library. We don't define
674 them unless specifically requested. */
676 #if defined _REGEX_RE_COMP || defined _LIBC
678 /* BSD has one and only one pattern buffer. */
679 static struct re_pattern_buffer re_comp_buf;
681 char *
682 # ifdef _LIBC
683 /* Make these definitions weak in libc, so POSIX programs can redefine
684 these names if they don't use our functions, and still use
685 regcomp/regexec above without link errors. */
686 weak_function
687 # endif
688 re_comp (s)
689 const char *s;
691 reg_errcode_t ret;
692 char *fastmap;
694 if (!s)
696 if (!re_comp_buf.buffer)
697 return gettext ("No previous regular expression");
698 return 0;
701 if (re_comp_buf.buffer)
703 fastmap = re_comp_buf.fastmap;
704 re_comp_buf.fastmap = NULL;
705 __regfree (&re_comp_buf);
706 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
707 re_comp_buf.fastmap = fastmap;
710 if (re_comp_buf.fastmap == NULL)
712 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
713 if (re_comp_buf.fastmap == NULL)
714 return (char *) gettext (__re_error_msgid
715 + __re_error_msgid_idx[(int) REG_ESPACE]);
718 /* Since `re_exec' always passes NULL for the `regs' argument, we
719 don't need to initialize the pattern buffer fields which affect it. */
721 /* Match anchors at newlines. */
722 re_comp_buf.newline_anchor = 1;
724 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
726 if (!ret)
727 return NULL;
729 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
730 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
733 #ifdef _LIBC
734 libc_freeres_fn (free_mem)
736 __regfree (&re_comp_buf);
738 #endif
740 #endif /* _REGEX_RE_COMP */
742 /* Internal entry point.
743 Compile the regular expression PATTERN, whose length is LENGTH.
744 SYNTAX indicate regular expression's syntax. */
746 static reg_errcode_t
747 re_compile_internal (preg, pattern, length, syntax)
748 regex_t *preg;
749 const char * pattern;
750 int length;
751 reg_syntax_t syntax;
753 reg_errcode_t err = REG_NOERROR;
754 re_dfa_t *dfa;
755 re_string_t regexp;
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
761 preg->used = 0;
762 preg->re_nsub = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775 if (dfa == NULL)
776 return REG_ESPACE;
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
780 preg->used = sizeof (re_dfa_t);
782 err = init_dfa (dfa, length);
783 if (BE (err != REG_NOERROR, 0))
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
790 #ifdef DEBUG
791 dfa->re_str = re_malloc (char, length + 1);
792 strncpy (dfa->re_str, pattern, length + 1);
793 #endif
795 err = re_string_construct (&regexp, pattern, length, preg->translate,
796 syntax & RE_ICASE, dfa);
797 if (BE (err != REG_NOERROR, 0))
799 re_compile_internal_free_return:
800 free_workarea_compile (preg);
801 re_string_destruct (&regexp);
802 free_dfa_content (dfa);
803 preg->buffer = NULL;
804 preg->allocated = 0;
805 return err;
808 /* Parse the regular expression, and build a structure tree. */
809 preg->re_nsub = 0;
810 dfa->str_tree = parse (&regexp, preg, syntax, &err);
811 if (BE (dfa->str_tree == NULL, 0))
812 goto re_compile_internal_free_return;
814 #ifdef RE_ENABLE_I18N
815 /* If possible, do searching in single byte encoding to speed things up. */
816 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
817 optimize_utf8 (dfa);
818 #endif
820 if (preg->re_nsub > 0)
822 struct subexp_optimize so;
824 so.dfa = dfa;
825 so.nodes = dfa->nodes;
826 so.no_sub = preg->no_sub;
827 so.re_nsub = preg->re_nsub;
828 dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
831 /* Analyze the tree and collect information which is necessary to
832 create the dfa. */
833 err = analyze (dfa);
834 if (BE (err != REG_NOERROR, 0))
835 goto re_compile_internal_free_return;
837 /* Then create the initial state of the dfa. */
838 err = create_initial_state (dfa);
840 /* Release work areas. */
841 free_workarea_compile (preg);
842 re_string_destruct (&regexp);
844 if (BE (err != REG_NOERROR, 0))
846 free_dfa_content (dfa);
847 preg->buffer = NULL;
848 preg->allocated = 0;
851 return err;
854 /* Initialize DFA. We use the length of the regular expression PAT_LEN
855 as the initial length of some arrays. */
857 static reg_errcode_t
858 init_dfa (dfa, pat_len)
859 re_dfa_t *dfa;
860 int pat_len;
862 int table_size;
863 #ifndef _LIBC
864 char *codeset_name;
865 #endif
867 memset (dfa, '\0', sizeof (re_dfa_t));
869 /* Force allocation of str_tree_storage the first time. */
870 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
872 dfa->nodes_alloc = pat_len + 1;
873 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
875 dfa->states_alloc = pat_len + 1;
877 /* table_size = 2 ^ ceil(log pat_len) */
878 for (table_size = 1; table_size > 0; table_size <<= 1)
879 if (table_size > pat_len)
880 break;
882 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
883 dfa->state_hash_mask = table_size - 1;
885 dfa->mb_cur_max = MB_CUR_MAX;
886 #ifdef _LIBC
887 if (dfa->mb_cur_max == 6
888 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
889 dfa->is_utf8 = 1;
890 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
891 != 0);
892 #else
893 # ifdef HAVE_LANGINFO_CODESET
894 codeset_name = nl_langinfo (CODESET);
895 # else
896 codeset_name = getenv ("LC_ALL");
897 if (codeset_name == NULL || codeset[0] == '\0')
898 codeset_name = getenv ("LC_CTYPE");
899 if (codeset_name == NULL || codeset[0] == '\0')
900 codeset_name = getenv ("LANG");
901 if (codeset_name == NULL)
902 codeset_name = "";
903 else if (strchr (codeset_name, '.') != NULL)
904 codeset_name = strchr (codeset_name, '.') + 1;
905 # endif
907 if (strcasecmp (codeset_name, "UTF-8") == 0
908 || strcasecmp (codeset_name, "UTF8") == 0)
909 dfa->is_utf8 = 1;
911 /* We check exhaustively in the loop below if this charset is a
912 superset of ASCII. */
913 dfa->map_notascii = 0;
914 #endif
916 #ifdef RE_ENABLE_I18N
917 if (dfa->mb_cur_max > 1)
919 if (dfa->is_utf8)
920 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
921 else
923 int i, j, ch;
925 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
926 if (BE (dfa->sb_char == NULL, 0))
927 return REG_ESPACE;
929 /* Clear all bits by, then set those corresponding to single
930 byte chars. */
931 bitset_empty (dfa->sb_char);
933 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
934 for (j = 0; j < UINT_BITS; ++j, ++ch)
936 wchar_t wch = __btowc (ch);
937 if (wch != WEOF)
938 dfa->sb_char[i] |= 1 << j;
939 # ifndef _LIBC
940 if (isascii (ch) && wch != (wchar_t) ch)
941 dfa->map_notascii = 1;
942 # endif
946 #endif
948 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
949 return REG_ESPACE;
950 return REG_NOERROR;
953 /* Initialize WORD_CHAR table, which indicate which character is
954 "word". In this case "word" means that it is the word construction
955 character used by some operators like "\<", "\>", etc. */
957 static void
958 init_word_char (dfa)
959 re_dfa_t *dfa;
961 int i, j, ch;
962 dfa->word_ops_used = 1;
963 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
964 for (j = 0; j < UINT_BITS; ++j, ++ch)
965 if (isalnum (ch) || ch == '_')
966 dfa->word_char[i] |= 1 << j;
969 /* Free the work area which are only used while compiling. */
971 static void
972 free_workarea_compile (preg)
973 regex_t *preg;
975 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
976 bin_tree_storage_t *storage, *next;
977 for (storage = dfa->str_tree_storage; storage; storage = next)
979 next = storage->next;
980 re_free (storage);
982 dfa->str_tree_storage = NULL;
983 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
984 dfa->str_tree = NULL;
985 re_free (dfa->org_indices);
986 dfa->org_indices = NULL;
989 /* Create initial states for all contexts. */
991 static reg_errcode_t
992 create_initial_state (dfa)
993 re_dfa_t *dfa;
995 int first, i;
996 reg_errcode_t err;
997 re_node_set init_nodes;
999 /* Initial states have the epsilon closure of the node which is
1000 the first node of the regular expression. */
1001 first = dfa->str_tree->first;
1002 dfa->init_node = first;
1003 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1004 if (BE (err != REG_NOERROR, 0))
1005 return err;
1007 /* The back-references which are in initial states can epsilon transit,
1008 since in this case all of the subexpressions can be null.
1009 Then we add epsilon closures of the nodes which are the next nodes of
1010 the back-references. */
1011 if (dfa->nbackref > 0)
1012 for (i = 0; i < init_nodes.nelem; ++i)
1014 int node_idx = init_nodes.elems[i];
1015 re_token_type_t type = dfa->nodes[node_idx].type;
1017 int clexp_idx;
1018 if (type != OP_BACK_REF)
1019 continue;
1020 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1022 re_token_t *clexp_node;
1023 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1024 if (clexp_node->type == OP_CLOSE_SUBEXP
1025 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1026 break;
1028 if (clexp_idx == init_nodes.nelem)
1029 continue;
1031 if (type == OP_BACK_REF)
1033 int dest_idx = dfa->edests[node_idx].elems[0];
1034 if (!re_node_set_contains (&init_nodes, dest_idx))
1036 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1037 i = 0;
1042 /* It must be the first time to invoke acquire_state. */
1043 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1044 /* We don't check ERR here, since the initial state must not be NULL. */
1045 if (BE (dfa->init_state == NULL, 0))
1046 return err;
1047 if (dfa->init_state->has_constraint)
1049 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1050 CONTEXT_WORD);
1051 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1052 CONTEXT_NEWLINE);
1053 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1054 &init_nodes,
1055 CONTEXT_NEWLINE
1056 | CONTEXT_BEGBUF);
1057 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1058 || dfa->init_state_begbuf == NULL, 0))
1059 return err;
1061 else
1062 dfa->init_state_word = dfa->init_state_nl
1063 = dfa->init_state_begbuf = dfa->init_state;
1065 re_node_set_free (&init_nodes);
1066 return REG_NOERROR;
1069 #ifdef RE_ENABLE_I18N
1070 /* If it is possible to do searching in single byte encoding instead of UTF-8
1071 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1072 DFA nodes where needed. */
1074 static void
1075 optimize_utf8 (dfa)
1076 re_dfa_t *dfa;
1078 int node, i, mb_chars = 0, has_period = 0;
1080 for (node = 0; node < dfa->nodes_len; ++node)
1081 switch (dfa->nodes[node].type)
1083 case CHARACTER:
1084 if (dfa->nodes[node].opr.c >= 0x80)
1085 mb_chars = 1;
1086 break;
1087 case ANCHOR:
1088 switch (dfa->nodes[node].opr.idx)
1090 case LINE_FIRST:
1091 case LINE_LAST:
1092 case BUF_FIRST:
1093 case BUF_LAST:
1094 break;
1095 default:
1096 /* Word anchors etc. cannot be handled. */
1097 return;
1099 break;
1100 case OP_PERIOD:
1101 has_period = 1;
1102 break;
1103 case OP_BACK_REF:
1104 case OP_ALT:
1105 case END_OF_RE:
1106 case OP_DUP_ASTERISK:
1107 case OP_DUP_QUESTION:
1108 case OP_OPEN_SUBEXP:
1109 case OP_CLOSE_SUBEXP:
1110 break;
1111 case SIMPLE_BRACKET:
1112 /* Just double check. */
1113 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1114 if (dfa->nodes[node].opr.sbcset[i])
1115 return;
1116 break;
1117 default:
1118 return;
1121 if (mb_chars || has_period)
1122 for (node = 0; node < dfa->nodes_len; ++node)
1124 if (dfa->nodes[node].type == CHARACTER
1125 && dfa->nodes[node].opr.c >= 0x80)
1126 dfa->nodes[node].mb_partial = 0;
1127 else if (dfa->nodes[node].type == OP_PERIOD)
1128 dfa->nodes[node].type = OP_UTF8_PERIOD;
1131 /* The search can be in single byte locale. */
1132 dfa->mb_cur_max = 1;
1133 dfa->is_utf8 = 0;
1134 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136 #endif
1138 static bin_tree_t *
1139 optimize_subexps (so, node, sidx, depth)
1140 struct subexp_optimize *so;
1141 bin_tree_t *node;
1142 int sidx, depth;
1144 int idx, new_depth, new_sidx;
1145 bin_tree_t *ret;
1146 if (node == NULL)
1147 return NULL;
1149 new_depth = 0;
1150 new_sidx = sidx;
1151 if ((depth & 1) && node->type == CONCAT
1152 && node->right && node->right->type == 0
1153 && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1155 new_depth = depth + 1;
1156 if (new_depth == 2
1157 || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1158 && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
1159 new_sidx = so->nodes[idx].opr.idx;
1161 node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
1162 new_depth = (depth & 1) == 0 && node->type == CONCAT
1163 && node->left && node->left->type == 0
1164 && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
1165 ? depth + 1 : 0;
1166 node->right = optimize_subexps (so, node->right, sidx, new_depth);
1168 if (node->type != CONCAT)
1169 return node;
1170 if ((depth & 1) == 0
1171 && node->left
1172 && node->left->type == 0
1173 && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
1174 ret = node->right;
1175 else if ((depth & 1)
1176 && node->right
1177 && node->right->type == 0
1178 && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1179 ret = node->left;
1180 else
1181 return node;
1183 if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1184 && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
1185 return node;
1187 if (!so->no_sub)
1189 int i;
1191 if (depth < 2)
1192 return node;
1194 if (so->dfa->subexp_map == NULL)
1196 so->dfa->subexp_map = re_malloc (int, so->re_nsub);
1197 if (so->dfa->subexp_map == NULL)
1198 return node;
1200 for (i = 0; i < so->re_nsub; i++)
1201 so->dfa->subexp_map[i] = i;
1204 i = so->nodes[idx].opr.idx;
1205 assert (sidx < i);
1206 so->dfa->subexp_map[i] = sidx;
1209 so->nodes[idx].type = OP_DELETED_SUBEXP;
1210 ret->parent = node->parent;
1211 return ret;
1214 /* Analyze the structure tree, and calculate "first", "next", "edest",
1215 "eclosure", and "inveclosure". */
1217 static reg_errcode_t
1218 analyze (dfa)
1219 re_dfa_t *dfa;
1221 int i;
1222 reg_errcode_t ret;
1224 /* Allocate arrays. */
1225 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1226 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1227 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1228 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1229 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1230 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1231 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1232 return REG_ESPACE;
1233 /* Initialize them. */
1234 for (i = 0; i < dfa->nodes_len; ++i)
1236 dfa->nexts[i] = -1;
1237 re_node_set_init_empty (dfa->edests + i);
1238 re_node_set_init_empty (dfa->eclosures + i);
1239 re_node_set_init_empty (dfa->inveclosures + i);
1242 ret = analyze_tree (dfa, dfa->str_tree);
1243 if (BE (ret == REG_NOERROR, 1))
1245 ret = calc_eclosure (dfa);
1246 if (ret == REG_NOERROR)
1247 calc_inveclosure (dfa);
1249 return ret;
1252 /* Helper functions for analyze.
1253 This function calculate "first", "next", and "edest" for the subtree
1254 whose root is NODE. */
1256 static reg_errcode_t
1257 analyze_tree (dfa, node)
1258 re_dfa_t *dfa;
1259 bin_tree_t *node;
1261 reg_errcode_t ret;
1262 if (node->first == -1)
1263 calc_first (dfa, node);
1264 if (node->next == -1)
1265 calc_next (dfa, node);
1266 calc_epsdest (dfa, node);
1268 /* Calculate "first" etc. for the left child. */
1269 if (node->left != NULL)
1271 ret = analyze_tree (dfa, node->left);
1272 if (BE (ret != REG_NOERROR, 0))
1273 return ret;
1275 /* Calculate "first" etc. for the right child. */
1276 if (node->right != NULL)
1278 ret = analyze_tree (dfa, node->right);
1279 if (BE (ret != REG_NOERROR, 0))
1280 return ret;
1282 return REG_NOERROR;
1285 /* Calculate "first" for the node NODE. */
1286 static void
1287 calc_first (dfa, node)
1288 re_dfa_t *dfa;
1289 bin_tree_t *node;
1291 int idx, type;
1292 idx = node->node_idx;
1293 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1295 switch (type)
1297 #ifdef DEBUG
1298 case OP_OPEN_BRACKET:
1299 case OP_CLOSE_BRACKET:
1300 case OP_OPEN_DUP_NUM:
1301 case OP_CLOSE_DUP_NUM:
1302 case OP_DUP_PLUS:
1303 case OP_NON_MATCH_LIST:
1304 case OP_OPEN_COLL_ELEM:
1305 case OP_CLOSE_COLL_ELEM:
1306 case OP_OPEN_EQUIV_CLASS:
1307 case OP_CLOSE_EQUIV_CLASS:
1308 case OP_OPEN_CHAR_CLASS:
1309 case OP_CLOSE_CHAR_CLASS:
1310 /* These must not appear here. */
1311 assert (0);
1312 #endif
1313 case END_OF_RE:
1314 case CHARACTER:
1315 case OP_PERIOD:
1316 case OP_DUP_ASTERISK:
1317 case OP_DUP_QUESTION:
1318 #ifdef RE_ENABLE_I18N
1319 case OP_UTF8_PERIOD:
1320 case COMPLEX_BRACKET:
1321 #endif /* RE_ENABLE_I18N */
1322 case SIMPLE_BRACKET:
1323 case OP_BACK_REF:
1324 case ANCHOR:
1325 case OP_OPEN_SUBEXP:
1326 case OP_CLOSE_SUBEXP:
1327 node->first = idx;
1328 break;
1329 case OP_ALT:
1330 node->first = idx;
1331 break;
1332 /* else fall through */
1333 default:
1334 #ifdef DEBUG
1335 assert (node->left != NULL);
1336 #endif
1337 if (node->left->first == -1)
1338 calc_first (dfa, node->left);
1339 node->first = node->left->first;
1340 break;
1344 /* Calculate "next" for the node NODE. */
1346 static void
1347 calc_next (dfa, node)
1348 re_dfa_t *dfa;
1349 bin_tree_t *node;
1351 int idx, type;
1352 bin_tree_t *parent = node->parent;
1353 if (parent == NULL)
1355 node->next = -1;
1356 idx = node->node_idx;
1357 if (node->type == 0)
1358 dfa->nexts[idx] = node->next;
1359 return;
1362 idx = parent->node_idx;
1363 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1365 switch (type)
1367 case OP_DUP_ASTERISK:
1368 node->next = idx;
1369 break;
1370 case CONCAT:
1371 if (parent->left == node)
1373 if (parent->right->first == -1)
1374 calc_first (dfa, parent->right);
1375 node->next = parent->right->first;
1376 break;
1378 /* else fall through */
1379 default:
1380 if (parent->next == -1)
1381 calc_next (dfa, parent);
1382 node->next = parent->next;
1383 break;
1385 idx = node->node_idx;
1386 if (node->type == 0)
1387 dfa->nexts[idx] = node->next;
1390 /* Calculate "edest" for the node NODE. */
1392 static void
1393 calc_epsdest (dfa, node)
1394 re_dfa_t *dfa;
1395 bin_tree_t *node;
1397 int idx;
1398 idx = node->node_idx;
1399 if (node->type == 0)
1401 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1402 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1404 if (node->left->first == -1)
1405 calc_first (dfa, node->left);
1406 if (node->next == -1)
1407 calc_next (dfa, node);
1408 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1409 node->next);
1411 else if (dfa->nodes[idx].type == OP_ALT)
1413 int left, right;
1414 if (node->left != NULL)
1416 if (node->left->first == -1)
1417 calc_first (dfa, node->left);
1418 left = node->left->first;
1420 else
1422 if (node->next == -1)
1423 calc_next (dfa, node);
1424 left = node->next;
1426 if (node->right != NULL)
1428 if (node->right->first == -1)
1429 calc_first (dfa, node->right);
1430 right = node->right->first;
1432 else
1434 if (node->next == -1)
1435 calc_next (dfa, node);
1436 right = node->next;
1438 re_node_set_init_2 (dfa->edests + idx, left, right);
1440 else if (dfa->nodes[idx].type == ANCHOR
1441 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1442 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1443 || dfa->nodes[idx].type == OP_BACK_REF)
1444 re_node_set_init_1 (dfa->edests + idx, node->next);
1445 else
1446 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1450 /* Duplicate the epsilon closure of the node ROOT_NODE.
1451 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1452 to their own constraint. */
1454 static reg_errcode_t
1455 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1456 init_constraint)
1457 re_dfa_t *dfa;
1458 int top_org_node, top_clone_node, root_node;
1459 unsigned int init_constraint;
1461 reg_errcode_t err;
1462 int org_node, clone_node, ret;
1463 unsigned int constraint = init_constraint;
1464 for (org_node = top_org_node, clone_node = top_clone_node;;)
1466 int org_dest, clone_dest;
1467 if (dfa->nodes[org_node].type == OP_BACK_REF)
1469 /* If the back reference epsilon-transit, its destination must
1470 also have the constraint. Then duplicate the epsilon closure
1471 of the destination of the back reference, and store it in
1472 edests of the back reference. */
1473 org_dest = dfa->nexts[org_node];
1474 re_node_set_empty (dfa->edests + clone_node);
1475 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1476 if (BE (err != REG_NOERROR, 0))
1477 return err;
1478 dfa->nexts[clone_node] = dfa->nexts[org_node];
1479 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1480 if (BE (ret < 0, 0))
1481 return REG_ESPACE;
1483 else if (dfa->edests[org_node].nelem == 0)
1485 /* In case of the node can't epsilon-transit, don't duplicate the
1486 destination and store the original destination as the
1487 destination of the node. */
1488 dfa->nexts[clone_node] = dfa->nexts[org_node];
1489 break;
1491 else if (dfa->edests[org_node].nelem == 1)
1493 /* In case of the node can epsilon-transit, and it has only one
1494 destination. */
1495 org_dest = dfa->edests[org_node].elems[0];
1496 re_node_set_empty (dfa->edests + clone_node);
1497 if (dfa->nodes[org_node].type == ANCHOR)
1499 /* In case of the node has another constraint, append it. */
1500 if (org_node == root_node && clone_node != org_node)
1502 /* ...but if the node is root_node itself, it means the
1503 epsilon closure have a loop, then tie it to the
1504 destination of the root_node. */
1505 ret = re_node_set_insert (dfa->edests + clone_node,
1506 org_dest);
1507 if (BE (ret < 0, 0))
1508 return REG_ESPACE;
1509 break;
1511 constraint |= dfa->nodes[org_node].opr.ctx_type;
1513 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1514 if (BE (err != REG_NOERROR, 0))
1515 return err;
1516 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (ret < 0, 0))
1518 return REG_ESPACE;
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. E.g. '|', '*', '+', '?'. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528 if (clone_dest == -1)
1530 /* There are no such a duplicated node, create a new one. */
1531 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1532 if (BE (err != REG_NOERROR, 0))
1533 return err;
1534 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 if (BE (ret < 0, 0))
1536 return REG_ESPACE;
1537 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1538 root_node, constraint);
1539 if (BE (err != REG_NOERROR, 0))
1540 return err;
1542 else
1544 /* There are a duplicated node which satisfy the constraint,
1545 use it to avoid infinite loop. */
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1548 return REG_ESPACE;
1551 org_dest = dfa->edests[org_node].elems[1];
1552 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1553 if (BE (err != REG_NOERROR, 0))
1554 return err;
1555 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 if (BE (ret < 0, 0))
1557 return REG_ESPACE;
1559 org_node = org_dest;
1560 clone_node = clone_dest;
1562 return REG_NOERROR;
1565 /* Search for a node which is duplicated from the node ORG_NODE, and
1566 satisfies the constraint CONSTRAINT. */
1568 static int
1569 search_duplicated_node (dfa, org_node, constraint)
1570 re_dfa_t *dfa;
1571 int org_node;
1572 unsigned int constraint;
1574 int idx;
1575 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1577 if (org_node == dfa->org_indices[idx]
1578 && constraint == dfa->nodes[idx].constraint)
1579 return idx; /* Found. */
1581 return -1; /* Not found. */
1584 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1585 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1586 otherwise return the error code. */
1588 static reg_errcode_t
1589 duplicate_node (new_idx, dfa, org_idx, constraint)
1590 re_dfa_t *dfa;
1591 int *new_idx, org_idx;
1592 unsigned int constraint;
1594 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1595 if (BE (dup_idx == -1, 0))
1596 return REG_ESPACE;
1597 dfa->nodes[dup_idx].constraint = constraint;
1598 if (dfa->nodes[org_idx].type == ANCHOR)
1599 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1600 dfa->nodes[dup_idx].duplicated = 1;
1601 re_node_set_init_empty (dfa->edests + dup_idx);
1602 re_node_set_init_empty (dfa->eclosures + dup_idx);
1603 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1605 /* Store the index of the original node. */
1606 dfa->org_indices[dup_idx] = org_idx;
1607 *new_idx = dup_idx;
1608 return REG_NOERROR;
1611 static void
1612 calc_inveclosure (dfa)
1613 re_dfa_t *dfa;
1615 int src, idx, dest;
1616 for (src = 0; src < dfa->nodes_len; ++src)
1618 if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
1619 continue;
1620 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622 dest = dfa->eclosures[src].elems[idx];
1623 re_node_set_insert_last (dfa->inveclosures + dest, src);
1628 /* Calculate "eclosure" for all the node in DFA. */
1630 static reg_errcode_t
1631 calc_eclosure (dfa)
1632 re_dfa_t *dfa;
1634 int node_idx, incomplete;
1635 #ifdef DEBUG
1636 assert (dfa->nodes_len > 0);
1637 #endif
1638 incomplete = 0;
1639 /* For each nodes, calculate epsilon closure. */
1640 for (node_idx = 0; ; ++node_idx)
1642 reg_errcode_t err;
1643 re_node_set eclosure_elem;
1644 if (node_idx == dfa->nodes_len)
1646 if (!incomplete)
1647 break;
1648 incomplete = 0;
1649 node_idx = 0;
1652 #ifdef DEBUG
1653 assert (dfa->eclosures[node_idx].nelem != -1);
1654 #endif
1655 if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
1656 continue;
1658 /* If we have already calculated, skip it. */
1659 if (dfa->eclosures[node_idx].nelem != 0)
1660 continue;
1661 /* Calculate epsilon closure of `node_idx'. */
1662 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1663 if (BE (err != REG_NOERROR, 0))
1664 return err;
1666 if (dfa->eclosures[node_idx].nelem == 0)
1668 incomplete = 1;
1669 re_node_set_free (&eclosure_elem);
1672 return REG_NOERROR;
1675 /* Calculate epsilon closure of NODE. */
1677 static reg_errcode_t
1678 calc_eclosure_iter (new_set, dfa, node, root)
1679 re_node_set *new_set;
1680 re_dfa_t *dfa;
1681 int node, root;
1683 reg_errcode_t err;
1684 unsigned int constraint;
1685 int i, incomplete;
1686 re_node_set eclosure;
1687 incomplete = 0;
1688 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689 if (BE (err != REG_NOERROR, 0))
1690 return err;
1692 /* This indicates that we are calculating this node now.
1693 We reference this value to avoid infinite loop. */
1694 dfa->eclosures[node].nelem = -1;
1696 constraint = ((dfa->nodes[node].type == ANCHOR)
1697 ? dfa->nodes[node].opr.ctx_type : 0);
1698 /* If the current node has constraints, duplicate all nodes.
1699 Since they must inherit the constraints. */
1700 if (constraint
1701 && dfa->edests[node].nelem
1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1704 int org_node, cur_node;
1705 org_node = cur_node = node;
1706 err = duplicate_node_closure (dfa, node, node, node, constraint);
1707 if (BE (err != REG_NOERROR, 0))
1708 return err;
1711 /* Expand each epsilon destination nodes. */
1712 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713 for (i = 0; i < dfa->edests[node].nelem; ++i)
1715 re_node_set eclosure_elem;
1716 int edest = dfa->edests[node].elems[i];
1717 /* If calculating the epsilon closure of `edest' is in progress,
1718 return intermediate result. */
1719 if (dfa->eclosures[edest].nelem == -1)
1721 incomplete = 1;
1722 continue;
1724 /* If we haven't calculated the epsilon closure of `edest' yet,
1725 calculate now. Otherwise use calculated epsilon closure. */
1726 if (dfa->eclosures[edest].nelem == 0)
1728 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1732 else
1733 eclosure_elem = dfa->eclosures[edest];
1734 /* Merge the epsilon closure of `edest'. */
1735 re_node_set_merge (&eclosure, &eclosure_elem);
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1740 incomplete = 1;
1741 re_node_set_free (&eclosure_elem);
1745 /* Epsilon closures include itself. */
1746 re_node_set_insert (&eclosure, node);
1747 if (incomplete && !root)
1748 dfa->eclosures[node].nelem = 0;
1749 else
1750 dfa->eclosures[node] = eclosure;
1751 *new_set = eclosure;
1752 return REG_NOERROR;
1755 /* Functions for token which are used in the parser. */
1757 /* Fetch a token from INPUT.
1758 We must not use this function inside bracket expressions. */
1760 static void
1761 fetch_token (result, input, syntax)
1762 re_token_t *result;
1763 re_string_t *input;
1764 reg_syntax_t syntax;
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1772 static int
1773 peek_token (token, input, syntax)
1774 re_token_t *token;
1775 re_string_t *input;
1776 reg_syntax_t syntax;
1778 unsigned char c;
1780 if (re_string_eoi (input))
1782 token->type = END_OF_RE;
1783 return 0;
1786 c = re_string_peek_byte (input, 0);
1787 token->opr.c = c;
1789 token->word_char = 0;
1790 #ifdef RE_ENABLE_I18N
1791 token->mb_partial = 0;
1792 if (input->mb_cur_max > 1 &&
1793 !re_string_first_byte (input, re_string_cur_idx (input)))
1795 token->type = CHARACTER;
1796 token->mb_partial = 1;
1797 return 1;
1799 #endif
1800 if (c == '\\')
1802 unsigned char c2;
1803 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1805 token->type = BACK_SLASH;
1806 return 1;
1809 c2 = re_string_peek_byte_case (input, 1);
1810 token->opr.c = c2;
1811 token->type = CHARACTER;
1812 #ifdef RE_ENABLE_I18N
1813 if (input->mb_cur_max > 1)
1815 wint_t wc = re_string_wchar_at (input,
1816 re_string_cur_idx (input) + 1);
1817 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 else
1820 #endif
1821 token->word_char = IS_WORD_CHAR (c2) != 0;
1823 switch (c2)
1825 case '|':
1826 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1827 token->type = OP_ALT;
1828 break;
1829 case '1': case '2': case '3': case '4': case '5':
1830 case '6': case '7': case '8': case '9':
1831 if (!(syntax & RE_NO_BK_REFS))
1833 token->type = OP_BACK_REF;
1834 token->opr.idx = c2 - '1';
1836 break;
1837 case '<':
1838 if (!(syntax & RE_NO_GNU_OPS))
1840 token->type = ANCHOR;
1841 token->opr.ctx_type = WORD_FIRST;
1843 break;
1844 case '>':
1845 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = WORD_LAST;
1850 break;
1851 case 'b':
1852 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = WORD_DELIM;
1857 break;
1858 case 'B':
1859 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = INSIDE_WORD;
1864 break;
1865 case 'w':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_WORD;
1868 break;
1869 case 'W':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = OP_NOTWORD;
1872 break;
1873 case 's':
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = OP_SPACE;
1876 break;
1877 case 'S':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = OP_NOTSPACE;
1880 break;
1881 case '`':
1882 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = BUF_FIRST;
1887 break;
1888 case '\'':
1889 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = ANCHOR;
1892 token->opr.ctx_type = BUF_LAST;
1894 break;
1895 case '(':
1896 if (!(syntax & RE_NO_BK_PARENS))
1897 token->type = OP_OPEN_SUBEXP;
1898 break;
1899 case ')':
1900 if (!(syntax & RE_NO_BK_PARENS))
1901 token->type = OP_CLOSE_SUBEXP;
1902 break;
1903 case '+':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905 token->type = OP_DUP_PLUS;
1906 break;
1907 case '?':
1908 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1909 token->type = OP_DUP_QUESTION;
1910 break;
1911 case '{':
1912 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913 token->type = OP_OPEN_DUP_NUM;
1914 break;
1915 case '}':
1916 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1917 token->type = OP_CLOSE_DUP_NUM;
1918 break;
1919 default:
1920 break;
1922 return 2;
1925 token->type = CHARACTER;
1926 #ifdef RE_ENABLE_I18N
1927 if (input->mb_cur_max > 1)
1929 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1930 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 else
1933 #endif
1934 token->word_char = IS_WORD_CHAR (token->opr.c);
1936 switch (c)
1938 case '\n':
1939 if (syntax & RE_NEWLINE_ALT)
1940 token->type = OP_ALT;
1941 break;
1942 case '|':
1943 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1944 token->type = OP_ALT;
1945 break;
1946 case '*':
1947 token->type = OP_DUP_ASTERISK;
1948 break;
1949 case '+':
1950 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951 token->type = OP_DUP_PLUS;
1952 break;
1953 case '?':
1954 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1955 token->type = OP_DUP_QUESTION;
1956 break;
1957 case '{':
1958 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959 token->type = OP_OPEN_DUP_NUM;
1960 break;
1961 case '}':
1962 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1963 token->type = OP_CLOSE_DUP_NUM;
1964 break;
1965 case '(':
1966 if (syntax & RE_NO_BK_PARENS)
1967 token->type = OP_OPEN_SUBEXP;
1968 break;
1969 case ')':
1970 if (syntax & RE_NO_BK_PARENS)
1971 token->type = OP_CLOSE_SUBEXP;
1972 break;
1973 case '[':
1974 token->type = OP_OPEN_BRACKET;
1975 break;
1976 case '.':
1977 token->type = OP_PERIOD;
1978 break;
1979 case '^':
1980 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1981 re_string_cur_idx (input) != 0)
1983 char prev = re_string_peek_byte (input, -1);
1984 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 break;
1987 token->type = ANCHOR;
1988 token->opr.ctx_type = LINE_FIRST;
1989 break;
1990 case '$':
1991 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1992 re_string_cur_idx (input) + 1 != re_string_length (input))
1994 re_token_t next;
1995 re_string_skip_bytes (input, 1);
1996 peek_token (&next, input, syntax);
1997 re_string_skip_bytes (input, -1);
1998 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 break;
2001 token->type = ANCHOR;
2002 token->opr.ctx_type = LINE_LAST;
2003 break;
2004 default:
2005 break;
2007 return 1;
2010 /* Peek a token from INPUT, and return the length of the token.
2011 We must not use this function out of bracket expressions. */
2013 static int
2014 peek_token_bracket (token, input, syntax)
2015 re_token_t *token;
2016 re_string_t *input;
2017 reg_syntax_t syntax;
2019 unsigned char c;
2020 if (re_string_eoi (input))
2022 token->type = END_OF_RE;
2023 return 0;
2025 c = re_string_peek_byte (input, 0);
2026 token->opr.c = c;
2028 #ifdef RE_ENABLE_I18N
2029 if (input->mb_cur_max > 1 &&
2030 !re_string_first_byte (input, re_string_cur_idx (input)))
2032 token->type = CHARACTER;
2033 return 1;
2035 #endif /* RE_ENABLE_I18N */
2037 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2038 && re_string_cur_idx (input) + 1 < re_string_length (input))
2040 /* In this case, '\' escape a character. */
2041 unsigned char c2;
2042 re_string_skip_bytes (input, 1);
2043 c2 = re_string_peek_byte (input, 0);
2044 token->opr.c = c2;
2045 token->type = CHARACTER;
2046 return 1;
2048 if (c == '[') /* '[' is a special char in a bracket exps. */
2050 unsigned char c2;
2051 int token_len;
2052 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2053 c2 = re_string_peek_byte (input, 1);
2054 else
2055 c2 = 0;
2056 token->opr.c = c2;
2057 token_len = 2;
2058 switch (c2)
2060 case '.':
2061 token->type = OP_OPEN_COLL_ELEM;
2062 break;
2063 case '=':
2064 token->type = OP_OPEN_EQUIV_CLASS;
2065 break;
2066 case ':':
2067 if (syntax & RE_CHAR_CLASSES)
2069 token->type = OP_OPEN_CHAR_CLASS;
2070 break;
2072 /* else fall through. */
2073 default:
2074 token->type = CHARACTER;
2075 token->opr.c = c;
2076 token_len = 1;
2077 break;
2079 return token_len;
2081 switch (c)
2083 case '-':
2084 token->type = OP_CHARSET_RANGE;
2085 break;
2086 case ']':
2087 token->type = OP_CLOSE_BRACKET;
2088 break;
2089 case '^':
2090 token->type = OP_NON_MATCH_LIST;
2091 break;
2092 default:
2093 token->type = CHARACTER;
2095 return 1;
2098 /* Functions for parser. */
2100 /* Entry point of the parser.
2101 Parse the regular expression REGEXP and return the structure tree.
2102 If an error is occured, ERR is set by error code, and return NULL.
2103 This function build the following tree, from regular expression <reg_exp>:
2107 <reg_exp> EOR
2109 CAT means concatenation.
2110 EOR means end of regular expression. */
2112 static bin_tree_t *
2113 parse (regexp, preg, syntax, err)
2114 re_string_t *regexp;
2115 regex_t *preg;
2116 reg_syntax_t syntax;
2117 reg_errcode_t *err;
2119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120 bin_tree_t *tree, *eor, *root;
2121 re_token_t current_token;
2122 dfa->syntax = syntax;
2123 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 return NULL;
2127 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2128 if (tree != NULL)
2129 root = create_tree (dfa, tree, eor, CONCAT, 0);
2130 else
2131 root = eor;
2132 if (BE (eor == NULL || root == NULL, 0))
2134 *err = REG_ESPACE;
2135 return NULL;
2137 return root;
2140 /* This function build the following tree, from regular expression
2141 <branch1>|<branch2>:
2145 <branch1> <branch2>
2147 ALT means alternative, which represents the operator `|'. */
2149 static bin_tree_t *
2150 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2151 re_string_t *regexp;
2152 regex_t *preg;
2153 re_token_t *token;
2154 reg_syntax_t syntax;
2155 int nest;
2156 reg_errcode_t *err;
2158 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2159 bin_tree_t *tree, *branch = NULL;
2160 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162 return NULL;
2164 while (token->type == OP_ALT)
2166 re_token_t alt_token = *token;
2167 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2168 if (token->type != OP_ALT && token->type != END_OF_RE
2169 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2171 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2173 return NULL;
2175 else
2176 branch = NULL;
2177 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2178 if (BE (tree == NULL, 0))
2180 *err = REG_ESPACE;
2181 return NULL;
2183 dfa->has_plural_match = 1;
2185 return tree;
2188 /* This function build the following tree, from regular expression
2189 <exp1><exp2>:
2193 <exp1> <exp2>
2195 CAT means concatenation. */
2197 static bin_tree_t *
2198 parse_branch (regexp, preg, token, syntax, nest, err)
2199 re_string_t *regexp;
2200 regex_t *preg;
2201 re_token_t *token;
2202 reg_syntax_t syntax;
2203 int nest;
2204 reg_errcode_t *err;
2206 bin_tree_t *tree, *exp;
2207 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2208 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2209 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 return NULL;
2212 while (token->type != OP_ALT && token->type != END_OF_RE
2213 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2215 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2216 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2218 return NULL;
2220 if (tree != NULL && exp != NULL)
2222 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2223 if (tree == NULL)
2225 *err = REG_ESPACE;
2226 return NULL;
2229 else if (tree == NULL)
2230 tree = exp;
2231 /* Otherwise exp == NULL, we don't need to create new tree. */
2233 return tree;
2236 /* This function build the following tree, from regular expression a*:
2242 static bin_tree_t *
2243 parse_expression (regexp, preg, token, syntax, nest, err)
2244 re_string_t *regexp;
2245 regex_t *preg;
2246 re_token_t *token;
2247 reg_syntax_t syntax;
2248 int nest;
2249 reg_errcode_t *err;
2251 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2252 bin_tree_t *tree;
2253 switch (token->type)
2255 case CHARACTER:
2256 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2257 if (BE (tree == NULL, 0))
2259 *err = REG_ESPACE;
2260 return NULL;
2262 #ifdef RE_ENABLE_I18N
2263 if (dfa->mb_cur_max > 1)
2265 while (!re_string_eoi (regexp)
2266 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2268 bin_tree_t *mbc_remain;
2269 fetch_token (token, regexp, syntax);
2270 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2271 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2272 if (BE (mbc_remain == NULL || tree == NULL, 0))
2274 *err = REG_ESPACE;
2275 return NULL;
2279 #endif
2280 break;
2281 case OP_OPEN_SUBEXP:
2282 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2284 return NULL;
2285 break;
2286 case OP_OPEN_BRACKET:
2287 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289 return NULL;
2290 break;
2291 case OP_BACK_REF:
2292 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2294 *err = REG_ESUBREG;
2295 return NULL;
2297 dfa->used_bkref_map |= 1 << token->opr.idx;
2298 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2299 if (BE (tree == NULL, 0))
2301 *err = REG_ESPACE;
2302 return NULL;
2304 ++dfa->nbackref;
2305 dfa->has_mb_node = 1;
2306 break;
2307 case OP_OPEN_DUP_NUM:
2308 if (syntax & RE_CONTEXT_INVALID_DUP)
2310 *err = REG_BADRPT;
2311 return NULL;
2313 /* FALLTHROUGH */
2314 case OP_DUP_ASTERISK:
2315 case OP_DUP_PLUS:
2316 case OP_DUP_QUESTION:
2317 if (syntax & RE_CONTEXT_INVALID_OPS)
2319 *err = REG_BADRPT;
2320 return NULL;
2322 else if (syntax & RE_CONTEXT_INDEP_OPS)
2324 fetch_token (token, regexp, syntax);
2325 return parse_expression (regexp, preg, token, syntax, nest, err);
2327 /* else fall through */
2328 case OP_CLOSE_SUBEXP:
2329 if ((token->type == OP_CLOSE_SUBEXP) &&
2330 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332 *err = REG_ERPAREN;
2333 return NULL;
2335 /* else fall through */
2336 case OP_CLOSE_DUP_NUM:
2337 /* We treat it as a normal character. */
2339 /* Then we can these characters as normal characters. */
2340 token->type = CHARACTER;
2341 /* mb_partial and word_char bits should be initialized already
2342 by peek_token. */
2343 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2344 if (BE (tree == NULL, 0))
2346 *err = REG_ESPACE;
2347 return NULL;
2349 break;
2350 case ANCHOR:
2351 if ((token->opr.ctx_type
2352 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2353 && dfa->word_ops_used == 0)
2354 init_word_char (dfa);
2355 if (token->opr.ctx_type == WORD_DELIM)
2357 bin_tree_t *tree_first, *tree_last;
2358 token->opr.ctx_type = WORD_FIRST;
2359 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2360 token->opr.ctx_type = WORD_LAST;
2361 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2362 token->type = OP_ALT;
2363 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2364 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2366 *err = REG_ESPACE;
2367 return NULL;
2370 else
2372 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2373 if (BE (tree == NULL, 0))
2375 *err = REG_ESPACE;
2376 return NULL;
2379 /* We must return here, since ANCHORs can't be followed
2380 by repetition operators.
2381 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2382 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2383 fetch_token (token, regexp, syntax);
2384 return tree;
2385 case OP_PERIOD:
2386 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2387 if (BE (tree == NULL, 0))
2389 *err = REG_ESPACE;
2390 return NULL;
2392 if (dfa->mb_cur_max > 1)
2393 dfa->has_mb_node = 1;
2394 break;
2395 case OP_WORD:
2396 case OP_NOTWORD:
2397 tree = build_charclass_op (dfa, regexp->trans,
2398 (const unsigned char *) "alnum",
2399 (const unsigned char *) "_",
2400 token->type == OP_NOTWORD, err);
2401 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2402 return NULL;
2403 break;
2404 case OP_SPACE:
2405 case OP_NOTSPACE:
2406 tree = build_charclass_op (dfa, regexp->trans,
2407 (const unsigned char *) "space",
2408 (const unsigned char *) "",
2409 token->type == OP_NOTSPACE, err);
2410 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411 return NULL;
2412 break;
2413 case OP_ALT:
2414 case END_OF_RE:
2415 return NULL;
2416 case BACK_SLASH:
2417 *err = REG_EESCAPE;
2418 return NULL;
2419 default:
2420 /* Must not happen? */
2421 #ifdef DEBUG
2422 assert (0);
2423 #endif
2424 return NULL;
2426 fetch_token (token, regexp, syntax);
2428 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2429 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2431 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2432 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2433 return NULL;
2434 /* In BRE consecutive duplications are not allowed. */
2435 if ((syntax & RE_CONTEXT_INVALID_DUP)
2436 && (token->type == OP_DUP_ASTERISK
2437 || token->type == OP_OPEN_DUP_NUM))
2439 *err = REG_BADRPT;
2440 return NULL;
2442 dfa->has_plural_match = 1;
2445 return tree;
2448 /* This function build the following tree, from regular expression
2449 (<reg_exp>):
2450 SUBEXP
2452 <reg_exp>
2455 static bin_tree_t *
2456 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2457 re_string_t *regexp;
2458 regex_t *preg;
2459 re_token_t *token;
2460 reg_syntax_t syntax;
2461 int nest;
2462 reg_errcode_t *err;
2464 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2465 bin_tree_t *tree, *left_par, *right_par;
2466 size_t cur_nsub;
2467 cur_nsub = preg->re_nsub++;
2469 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2470 if (BE (left_par == NULL, 0))
2472 *err = REG_ESPACE;
2473 return NULL;
2475 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2476 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2478 /* The subexpression may be a null string. */
2479 if (token->type == OP_CLOSE_SUBEXP)
2480 tree = NULL;
2481 else
2483 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2484 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2485 return NULL;
2487 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2489 *err = REG_EPAREN;
2490 return NULL;
2492 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2493 dfa->completed_bkref_map |= 1 << cur_nsub;
2494 tree = ((tree == NULL) ? right_par
2495 : create_tree (dfa, tree, right_par, CONCAT, 0));
2496 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2497 if (BE (right_par == NULL || tree == NULL, 0))
2499 *err = REG_ESPACE;
2500 return NULL;
2502 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2504 return tree;
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2509 static bin_tree_t *
2510 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2511 bin_tree_t *elem;
2512 re_string_t *regexp;
2513 re_dfa_t *dfa;
2514 re_token_t *token;
2515 reg_syntax_t syntax;
2516 reg_errcode_t *err;
2518 re_token_t dup_token;
2519 bin_tree_t *tree = NULL, *old_tree = NULL;
2520 int i, start, end, start_idx = re_string_cur_idx (regexp);
2521 re_token_t start_token = *token;
2523 if (token->type == OP_OPEN_DUP_NUM)
2525 end = 0;
2526 start = fetch_number (regexp, token, syntax);
2527 if (start == -1)
2529 if (token->type == CHARACTER && token->opr.c == ',')
2530 start = 0; /* We treat "{,m}" as "{0,m}". */
2531 else
2533 *err = REG_BADBR; /* <re>{} is invalid. */
2534 return NULL;
2537 if (BE (start != -2, 1))
2539 /* We treat "{n}" as "{n,n}". */
2540 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2541 : ((token->type == CHARACTER && token->opr.c == ',')
2542 ? fetch_number (regexp, token, syntax) : -2));
2544 if (BE (start == -2 || end == -2, 0))
2546 /* Invalid sequence. */
2547 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549 if (token->type == END_OF_RE)
2550 *err = REG_EBRACE;
2551 else
2552 *err = REG_BADBR;
2554 return NULL;
2557 /* If the syntax bit is set, rollback. */
2558 re_string_set_index (regexp, start_idx);
2559 *token = start_token;
2560 token->type = CHARACTER;
2561 /* mb_partial and word_char bits should be already initialized by
2562 peek_token. */
2563 return elem;
2566 if (BE (end != -1 && start > end, 0))
2568 /* First number greater than second. */
2569 *err = REG_BADBR;
2570 return NULL;
2573 else
2575 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2576 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2579 fetch_token (token, regexp, syntax);
2581 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2582 if (BE (elem == NULL || (start == 0 && end == 0), 0))
2583 return NULL;
2585 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2586 if (BE (start > 0, 0))
2588 tree = elem;
2589 for (i = 2; i <= start; ++i)
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2597 if (start == end)
2598 return tree;
2600 /* Duplicate ELEM before it is marked optional. */
2601 elem = duplicate_tree (elem, dfa);
2602 old_tree = tree;
2604 else
2605 old_tree = NULL;
2607 mark_opt_subexp (elem, dfa);
2608 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2609 tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2610 if (BE (tree == NULL, 0))
2611 goto parse_dup_op_espace;
2613 /* This loop is actually executed only when end != -1,
2614 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2615 already created the start+1-th copy. */
2616 for (i = start + 2; i <= end; ++i)
2618 elem = duplicate_tree (elem, dfa);
2619 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2620 if (BE (elem == NULL || tree == NULL, 0))
2621 goto parse_dup_op_espace;
2623 tree = re_dfa_add_tree_node (dfa, tree, NULL, &dup_token);
2624 if (BE (tree == NULL, 0))
2625 goto parse_dup_op_espace;
2628 if (old_tree)
2629 tree = create_tree (dfa, old_tree, tree, CONCAT, 0);
2631 return tree;
2633 parse_dup_op_espace:
2634 *err = REG_ESPACE;
2635 return NULL;
2638 /* Size of the names for collating symbol/equivalence_class/character_class.
2639 I'm not sure, but maybe enough. */
2640 #define BRACKET_NAME_BUF_SIZE 32
2642 #ifndef _LIBC
2643 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2644 Build the range expression which starts from START_ELEM, and ends
2645 at END_ELEM. The result are written to MBCSET and SBCSET.
2646 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2647 mbcset->range_ends, is a pointer argument sinse we may
2648 update it. */
2650 static reg_errcode_t
2651 # ifdef RE_ENABLE_I18N
2652 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2653 re_charset_t *mbcset;
2654 int *range_alloc;
2655 # else /* not RE_ENABLE_I18N */
2656 build_range_exp (sbcset, start_elem, end_elem)
2657 # endif /* not RE_ENABLE_I18N */
2658 re_bitset_ptr_t sbcset;
2659 bracket_elem_t *start_elem, *end_elem;
2661 unsigned int start_ch, end_ch;
2662 /* Equivalence Classes and Character Classes can't be a range start/end. */
2663 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2664 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2666 return REG_ERANGE;
2668 /* We can handle no multi character collating elements without libc
2669 support. */
2670 if (BE ((start_elem->type == COLL_SYM
2671 && strlen ((char *) start_elem->opr.name) > 1)
2672 || (end_elem->type == COLL_SYM
2673 && strlen ((char *) end_elem->opr.name) > 1), 0))
2674 return REG_ECOLLATE;
2676 # ifdef RE_ENABLE_I18N
2678 wchar_t wc, start_wc, end_wc;
2679 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2681 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2682 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2683 : 0));
2684 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2685 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2686 : 0));
2687 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2688 ? __btowc (start_ch) : start_elem->opr.wch);
2689 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2690 ? __btowc (end_ch) : end_elem->opr.wch);
2691 if (start_wc == WEOF || end_wc == WEOF)
2692 return REG_ECOLLATE;
2693 cmp_buf[0] = start_wc;
2694 cmp_buf[4] = end_wc;
2695 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2696 return REG_ERANGE;
2698 /* Got valid collation sequence values, add them as a new entry.
2699 However, for !_LIBC we have no collation elements: if the
2700 character set is single byte, the single byte character set
2701 that we build below suffices. parse_bracket_exp passes
2702 no MBCSET if dfa->mb_cur_max == 1. */
2703 if (mbcset)
2705 /* Check the space of the arrays. */
2706 if (BE (*range_alloc == mbcset->nranges, 0))
2708 /* There is not enough space, need realloc. */
2709 wchar_t *new_array_start, *new_array_end;
2710 int new_nranges;
2712 /* +1 in case of mbcset->nranges is 0. */
2713 new_nranges = 2 * mbcset->nranges + 1;
2714 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2715 are NULL if *range_alloc == 0. */
2716 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2717 new_nranges);
2718 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2719 new_nranges);
2721 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2722 return REG_ESPACE;
2724 mbcset->range_starts = new_array_start;
2725 mbcset->range_ends = new_array_end;
2726 *range_alloc = new_nranges;
2729 mbcset->range_starts[mbcset->nranges] = start_wc;
2730 mbcset->range_ends[mbcset->nranges++] = end_wc;
2733 /* Build the table for single byte characters. */
2734 for (wc = 0; wc < SBC_MAX; ++wc)
2736 cmp_buf[2] = wc;
2737 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2738 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2739 bitset_set (sbcset, wc);
2742 # else /* not RE_ENABLE_I18N */
2744 unsigned int ch;
2745 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2746 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2747 : 0));
2748 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2749 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2750 : 0));
2751 if (start_ch > end_ch)
2752 return REG_ERANGE;
2753 /* Build the table for single byte characters. */
2754 for (ch = 0; ch < SBC_MAX; ++ch)
2755 if (start_ch <= ch && ch <= end_ch)
2756 bitset_set (sbcset, ch);
2758 # endif /* not RE_ENABLE_I18N */
2759 return REG_NOERROR;
2761 #endif /* not _LIBC */
2763 #ifndef _LIBC
2764 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2765 Build the collating element which is represented by NAME.
2766 The result are written to MBCSET and SBCSET.
2767 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2768 pointer argument since we may update it. */
2770 static reg_errcode_t
2771 # ifdef RE_ENABLE_I18N
2772 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2773 re_charset_t *mbcset;
2774 int *coll_sym_alloc;
2775 # else /* not RE_ENABLE_I18N */
2776 build_collating_symbol (sbcset, name)
2777 # endif /* not RE_ENABLE_I18N */
2778 re_bitset_ptr_t sbcset;
2779 const unsigned char *name;
2781 size_t name_len = strlen ((const char *) name);
2782 if (BE (name_len != 1, 0))
2783 return REG_ECOLLATE;
2784 else
2786 bitset_set (sbcset, name[0]);
2787 return REG_NOERROR;
2790 #endif /* not _LIBC */
2792 /* This function parse bracket expression like "[abc]", "[a-c]",
2793 "[[.a-a.]]" etc. */
2795 static bin_tree_t *
2796 parse_bracket_exp (regexp, dfa, token, syntax, err)
2797 re_string_t *regexp;
2798 re_dfa_t *dfa;
2799 re_token_t *token;
2800 reg_syntax_t syntax;
2801 reg_errcode_t *err;
2803 #ifdef _LIBC
2804 const unsigned char *collseqmb;
2805 const char *collseqwc;
2806 uint32_t nrules;
2807 int32_t table_size;
2808 const int32_t *symb_table;
2809 const unsigned char *extra;
2811 /* Local function for parse_bracket_exp used in _LIBC environement.
2812 Seek the collating symbol entry correspondings to NAME.
2813 Return the index of the symbol in the SYMB_TABLE. */
2815 auto inline int32_t
2816 __attribute ((always_inline))
2817 seek_collating_symbol_entry (name, name_len)
2818 const unsigned char *name;
2819 size_t name_len;
2821 int32_t hash = elem_hash ((const char *) name, name_len);
2822 int32_t elem = hash % table_size;
2823 int32_t second = hash % (table_size - 2);
2824 while (symb_table[2 * elem] != 0)
2826 /* First compare the hashing value. */
2827 if (symb_table[2 * elem] == hash
2828 /* Compare the length of the name. */
2829 && name_len == extra[symb_table[2 * elem + 1]]
2830 /* Compare the name. */
2831 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2832 name_len) == 0)
2834 /* Yep, this is the entry. */
2835 break;
2838 /* Next entry. */
2839 elem += second;
2841 return elem;
2844 /* Local function for parse_bracket_exp used in _LIBC environement.
2845 Look up the collation sequence value of BR_ELEM.
2846 Return the value if succeeded, UINT_MAX otherwise. */
2848 auto inline unsigned int
2849 __attribute ((always_inline))
2850 lookup_collation_sequence_value (br_elem)
2851 bracket_elem_t *br_elem;
2853 if (br_elem->type == SB_CHAR)
2856 if (MB_CUR_MAX == 1)
2858 if (nrules == 0)
2859 return collseqmb[br_elem->opr.ch];
2860 else
2862 wint_t wc = __btowc (br_elem->opr.ch);
2863 return __collseq_table_lookup (collseqwc, wc);
2866 else if (br_elem->type == MB_CHAR)
2868 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2870 else if (br_elem->type == COLL_SYM)
2872 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2873 if (nrules != 0)
2875 int32_t elem, idx;
2876 elem = seek_collating_symbol_entry (br_elem->opr.name,
2877 sym_name_len);
2878 if (symb_table[2 * elem] != 0)
2880 /* We found the entry. */
2881 idx = symb_table[2 * elem + 1];
2882 /* Skip the name of collating element name. */
2883 idx += 1 + extra[idx];
2884 /* Skip the byte sequence of the collating element. */
2885 idx += 1 + extra[idx];
2886 /* Adjust for the alignment. */
2887 idx = (idx + 3) & ~3;
2888 /* Skip the multibyte collation sequence value. */
2889 idx += sizeof (unsigned int);
2890 /* Skip the wide char sequence of the collating element. */
2891 idx += sizeof (unsigned int) *
2892 (1 + *(unsigned int *) (extra + idx));
2893 /* Return the collation sequence value. */
2894 return *(unsigned int *) (extra + idx);
2896 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2898 /* No valid character. Match it as a single byte
2899 character. */
2900 return collseqmb[br_elem->opr.name[0]];
2903 else if (sym_name_len == 1)
2904 return collseqmb[br_elem->opr.name[0]];
2906 return UINT_MAX;
2909 /* Local function for parse_bracket_exp used in _LIBC environement.
2910 Build the range expression which starts from START_ELEM, and ends
2911 at END_ELEM. The result are written to MBCSET and SBCSET.
2912 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2913 mbcset->range_ends, is a pointer argument sinse we may
2914 update it. */
2916 auto inline reg_errcode_t
2917 __attribute ((always_inline))
2918 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2919 re_charset_t *mbcset;
2920 int *range_alloc;
2921 re_bitset_ptr_t sbcset;
2922 bracket_elem_t *start_elem, *end_elem;
2924 unsigned int ch;
2925 uint32_t start_collseq;
2926 uint32_t end_collseq;
2928 /* Equivalence Classes and Character Classes can't be a range
2929 start/end. */
2930 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2931 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2933 return REG_ERANGE;
2935 start_collseq = lookup_collation_sequence_value (start_elem);
2936 end_collseq = lookup_collation_sequence_value (end_elem);
2937 /* Check start/end collation sequence values. */
2938 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2939 return REG_ECOLLATE;
2940 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2941 return REG_ERANGE;
2943 /* Got valid collation sequence values, add them as a new entry.
2944 However, if we have no collation elements, and the character set
2945 is single byte, the single byte character set that we
2946 build below suffices. */
2947 if (nrules > 0 || dfa->mb_cur_max > 1)
2949 /* Check the space of the arrays. */
2950 if (BE (*range_alloc == mbcset->nranges, 0))
2952 /* There is not enough space, need realloc. */
2953 uint32_t *new_array_start;
2954 uint32_t *new_array_end;
2955 int new_nranges;
2957 /* +1 in case of mbcset->nranges is 0. */
2958 new_nranges = 2 * mbcset->nranges + 1;
2959 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2960 new_nranges);
2961 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2962 new_nranges);
2964 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2965 return REG_ESPACE;
2967 mbcset->range_starts = new_array_start;
2968 mbcset->range_ends = new_array_end;
2969 *range_alloc = new_nranges;
2972 mbcset->range_starts[mbcset->nranges] = start_collseq;
2973 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2976 /* Build the table for single byte characters. */
2977 for (ch = 0; ch < SBC_MAX; ch++)
2979 uint32_t ch_collseq;
2981 if (MB_CUR_MAX == 1)
2983 if (nrules == 0)
2984 ch_collseq = collseqmb[ch];
2985 else
2986 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2987 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2988 bitset_set (sbcset, ch);
2990 return REG_NOERROR;
2993 /* Local function for parse_bracket_exp used in _LIBC environement.
2994 Build the collating element which is represented by NAME.
2995 The result are written to MBCSET and SBCSET.
2996 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2997 pointer argument sinse we may update it. */
2999 auto inline reg_errcode_t
3000 __attribute ((always_inline))
3001 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3002 re_charset_t *mbcset;
3003 int *coll_sym_alloc;
3004 re_bitset_ptr_t sbcset;
3005 const unsigned char *name;
3007 int32_t elem, idx;
3008 size_t name_len = strlen ((const char *) name);
3009 if (nrules != 0)
3011 elem = seek_collating_symbol_entry (name, name_len);
3012 if (symb_table[2 * elem] != 0)
3014 /* We found the entry. */
3015 idx = symb_table[2 * elem + 1];
3016 /* Skip the name of collating element name. */
3017 idx += 1 + extra[idx];
3019 else if (symb_table[2 * elem] == 0 && name_len == 1)
3021 /* No valid character, treat it as a normal
3022 character. */
3023 bitset_set (sbcset, name[0]);
3024 return REG_NOERROR;
3026 else
3027 return REG_ECOLLATE;
3029 /* Got valid collation sequence, add it as a new entry. */
3030 /* Check the space of the arrays. */
3031 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3033 /* Not enough, realloc it. */
3034 /* +1 in case of mbcset->ncoll_syms is 0. */
3035 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3036 /* Use realloc since mbcset->coll_syms is NULL
3037 if *alloc == 0. */
3038 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3039 new_coll_sym_alloc);
3040 if (BE (new_coll_syms == NULL, 0))
3041 return REG_ESPACE;
3042 mbcset->coll_syms = new_coll_syms;
3043 *coll_sym_alloc = new_coll_sym_alloc;
3045 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3046 return REG_NOERROR;
3048 else
3050 if (BE (name_len != 1, 0))
3051 return REG_ECOLLATE;
3052 else
3054 bitset_set (sbcset, name[0]);
3055 return REG_NOERROR;
3059 #endif
3061 re_token_t br_token;
3062 re_bitset_ptr_t sbcset;
3063 #ifdef RE_ENABLE_I18N
3064 re_charset_t *mbcset;
3065 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3066 int equiv_class_alloc = 0, char_class_alloc = 0;
3067 #endif /* not RE_ENABLE_I18N */
3068 int non_match = 0;
3069 bin_tree_t *work_tree;
3070 int token_len;
3071 int first_round = 1;
3072 #ifdef _LIBC
3073 collseqmb = (const unsigned char *)
3074 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3075 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3076 if (nrules)
3079 if (MB_CUR_MAX > 1)
3081 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3082 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3083 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3084 _NL_COLLATE_SYMB_TABLEMB);
3085 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3086 _NL_COLLATE_SYMB_EXTRAMB);
3088 #endif
3089 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3090 #ifdef RE_ENABLE_I18N
3091 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3092 #endif /* RE_ENABLE_I18N */
3093 #ifdef RE_ENABLE_I18N
3094 if (BE (sbcset == NULL || mbcset == NULL, 0))
3095 #else
3096 if (BE (sbcset == NULL, 0))
3097 #endif /* RE_ENABLE_I18N */
3099 *err = REG_ESPACE;
3100 return NULL;
3103 token_len = peek_token_bracket (token, regexp, syntax);
3104 if (BE (token->type == END_OF_RE, 0))
3106 *err = REG_BADPAT;
3107 goto parse_bracket_exp_free_return;
3109 if (token->type == OP_NON_MATCH_LIST)
3111 #ifdef RE_ENABLE_I18N
3112 mbcset->non_match = 1;
3113 #endif /* not RE_ENABLE_I18N */
3114 non_match = 1;
3115 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3116 bitset_set (sbcset, '\0');
3117 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3118 token_len = peek_token_bracket (token, regexp, syntax);
3119 if (BE (token->type == END_OF_RE, 0))
3121 *err = REG_BADPAT;
3122 goto parse_bracket_exp_free_return;
3126 /* We treat the first ']' as a normal character. */
3127 if (token->type == OP_CLOSE_BRACKET)
3128 token->type = CHARACTER;
3130 while (1)
3132 bracket_elem_t start_elem, end_elem;
3133 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3134 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3135 reg_errcode_t ret;
3136 int token_len2 = 0, is_range_exp = 0;
3137 re_token_t token2;
3139 start_elem.opr.name = start_name_buf;
3140 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3141 syntax, first_round);
3142 if (BE (ret != REG_NOERROR, 0))
3144 *err = ret;
3145 goto parse_bracket_exp_free_return;
3147 first_round = 0;
3149 /* Get information about the next token. We need it in any case. */
3150 token_len = peek_token_bracket (token, regexp, syntax);
3152 /* Do not check for ranges if we know they are not allowed. */
3153 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3155 if (BE (token->type == END_OF_RE, 0))
3157 *err = REG_EBRACK;
3158 goto parse_bracket_exp_free_return;
3160 if (token->type == OP_CHARSET_RANGE)
3162 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3163 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3164 if (BE (token2.type == END_OF_RE, 0))
3166 *err = REG_EBRACK;
3167 goto parse_bracket_exp_free_return;
3169 if (token2.type == OP_CLOSE_BRACKET)
3171 /* We treat the last '-' as a normal character. */
3172 re_string_skip_bytes (regexp, -token_len);
3173 token->type = CHARACTER;
3175 else
3176 is_range_exp = 1;
3180 if (is_range_exp == 1)
3182 end_elem.opr.name = end_name_buf;
3183 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3184 dfa, syntax, 1);
3185 if (BE (ret != REG_NOERROR, 0))
3187 *err = ret;
3188 goto parse_bracket_exp_free_return;
3191 token_len = peek_token_bracket (token, regexp, syntax);
3193 #ifdef _LIBC
3194 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3195 &start_elem, &end_elem);
3196 #else
3197 # ifdef RE_ENABLE_I18N
3198 *err = build_range_exp (sbcset,
3199 dfa->mb_cur_max > 1 ? mbcset : NULL,
3200 &range_alloc, &start_elem, &end_elem);
3201 # else
3202 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3203 # endif
3204 #endif /* RE_ENABLE_I18N */
3205 if (BE (*err != REG_NOERROR, 0))
3206 goto parse_bracket_exp_free_return;
3208 else
3210 switch (start_elem.type)
3212 case SB_CHAR:
3213 bitset_set (sbcset, start_elem.opr.ch);
3214 break;
3215 #ifdef RE_ENABLE_I18N
3216 case MB_CHAR:
3217 /* Check whether the array has enough space. */
3218 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3220 wchar_t *new_mbchars;
3221 /* Not enough, realloc it. */
3222 /* +1 in case of mbcset->nmbchars is 0. */
3223 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3224 /* Use realloc since array is NULL if *alloc == 0. */
3225 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3226 mbchar_alloc);
3227 if (BE (new_mbchars == NULL, 0))
3228 goto parse_bracket_exp_espace;
3229 mbcset->mbchars = new_mbchars;
3231 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3232 break;
3233 #endif /* RE_ENABLE_I18N */
3234 case EQUIV_CLASS:
3235 *err = build_equiv_class (sbcset,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset, &equiv_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3242 break;
3243 case COLL_SYM:
3244 *err = build_collating_symbol (sbcset,
3245 #ifdef RE_ENABLE_I18N
3246 mbcset, &coll_sym_alloc,
3247 #endif /* RE_ENABLE_I18N */
3248 start_elem.opr.name);
3249 if (BE (*err != REG_NOERROR, 0))
3250 goto parse_bracket_exp_free_return;
3251 break;
3252 case CHAR_CLASS:
3253 *err = build_charclass (regexp->trans, sbcset,
3254 #ifdef RE_ENABLE_I18N
3255 mbcset, &char_class_alloc,
3256 #endif /* RE_ENABLE_I18N */
3257 start_elem.opr.name, syntax);
3258 if (BE (*err != REG_NOERROR, 0))
3259 goto parse_bracket_exp_free_return;
3260 break;
3261 default:
3262 assert (0);
3263 break;
3266 if (BE (token->type == END_OF_RE, 0))
3268 *err = REG_EBRACK;
3269 goto parse_bracket_exp_free_return;
3271 if (token->type == OP_CLOSE_BRACKET)
3272 break;
3275 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3277 /* If it is non-matching list. */
3278 if (non_match)
3279 bitset_not (sbcset);
3281 #ifdef RE_ENABLE_I18N
3282 /* Ensure only single byte characters are set. */
3283 if (dfa->mb_cur_max > 1)
3284 bitset_mask (sbcset, dfa->sb_char);
3285 #endif /* RE_ENABLE_I18N */
3287 /* Build a tree for simple bracket. */
3288 br_token.type = SIMPLE_BRACKET;
3289 br_token.opr.sbcset = sbcset;
3290 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3291 if (BE (work_tree == NULL, 0))
3292 goto parse_bracket_exp_espace;
3294 #ifdef RE_ENABLE_I18N
3295 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3296 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3297 || mbcset->non_match)))
3299 re_token_t alt_token;
3300 bin_tree_t *mbc_tree;
3301 int sbc_idx;
3302 /* Build a tree for complex bracket. */
3303 dfa->has_mb_node = 1;
3304 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3305 if (sbcset[sbc_idx])
3306 break;
3307 /* If there are no bits set in sbcset, there is no point
3308 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3309 if (sbc_idx == BITSET_UINTS)
3311 re_free (sbcset);
3312 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3313 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3314 return work_tree;
3316 br_token.type = COMPLEX_BRACKET;
3317 br_token.opr.mbcset = mbcset;
3318 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3319 if (BE (mbc_tree == NULL, 0))
3320 goto parse_bracket_exp_espace;
3321 /* Then join them by ALT node. */
3322 alt_token.type = OP_ALT;
3323 dfa->has_plural_match = 1;
3324 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3325 if (BE (mbc_tree != NULL, 1))
3326 return work_tree;
3328 else
3330 free_charset (mbcset);
3331 return work_tree;
3333 #else /* not RE_ENABLE_I18N */
3334 return work_tree;
3335 #endif /* not RE_ENABLE_I18N */
3337 parse_bracket_exp_espace:
3338 *err = REG_ESPACE;
3339 parse_bracket_exp_free_return:
3340 re_free (sbcset);
3341 #ifdef RE_ENABLE_I18N
3342 free_charset (mbcset);
3343 #endif /* RE_ENABLE_I18N */
3344 return NULL;
3347 /* Parse an element in the bracket expression. */
3349 static reg_errcode_t
3350 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3351 accept_hyphen)
3352 bracket_elem_t *elem;
3353 re_string_t *regexp;
3354 re_token_t *token;
3355 int token_len;
3356 re_dfa_t *dfa;
3357 reg_syntax_t syntax;
3358 int accept_hyphen;
3360 #ifdef RE_ENABLE_I18N
3361 int cur_char_size;
3362 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3363 if (cur_char_size > 1)
3365 elem->type = MB_CHAR;
3366 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3367 re_string_skip_bytes (regexp, cur_char_size);
3368 return REG_NOERROR;
3370 #endif /* RE_ENABLE_I18N */
3371 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3372 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3373 || token->type == OP_OPEN_EQUIV_CLASS)
3374 return parse_bracket_symbol (elem, regexp, token);
3375 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3377 /* A '-' must only appear as anything but a range indicator before
3378 the closing bracket. Everything else is an error. */
3379 re_token_t token2;
3380 (void) peek_token_bracket (&token2, regexp, syntax);
3381 if (token2.type != OP_CLOSE_BRACKET)
3382 /* The actual error value is not standardized since this whole
3383 case is undefined. But ERANGE makes good sense. */
3384 return REG_ERANGE;
3386 elem->type = SB_CHAR;
3387 elem->opr.ch = token->opr.c;
3388 return REG_NOERROR;
3391 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3392 such as [:<character_class>:], [.<collating_element>.], and
3393 [=<equivalent_class>=]. */
3395 static reg_errcode_t
3396 parse_bracket_symbol (elem, regexp, token)
3397 bracket_elem_t *elem;
3398 re_string_t *regexp;
3399 re_token_t *token;
3401 unsigned char ch, delim = token->opr.c;
3402 int i = 0;
3403 if (re_string_eoi(regexp))
3404 return REG_EBRACK;
3405 for (;; ++i)
3407 if (i >= BRACKET_NAME_BUF_SIZE)
3408 return REG_EBRACK;
3409 if (token->type == OP_OPEN_CHAR_CLASS)
3410 ch = re_string_fetch_byte_case (regexp);
3411 else
3412 ch = re_string_fetch_byte (regexp);
3413 if (re_string_eoi(regexp))
3414 return REG_EBRACK;
3415 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3416 break;
3417 elem->opr.name[i] = ch;
3419 re_string_skip_bytes (regexp, 1);
3420 elem->opr.name[i] = '\0';
3421 switch (token->type)
3423 case OP_OPEN_COLL_ELEM:
3424 elem->type = COLL_SYM;
3425 break;
3426 case OP_OPEN_EQUIV_CLASS:
3427 elem->type = EQUIV_CLASS;
3428 break;
3429 case OP_OPEN_CHAR_CLASS:
3430 elem->type = CHAR_CLASS;
3431 break;
3432 default:
3433 break;
3435 return REG_NOERROR;
3438 /* Helper function for parse_bracket_exp.
3439 Build the equivalence class which is represented by NAME.
3440 The result are written to MBCSET and SBCSET.
3441 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3442 is a pointer argument sinse we may update it. */
3444 static reg_errcode_t
3445 #ifdef RE_ENABLE_I18N
3446 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3447 re_charset_t *mbcset;
3448 int *equiv_class_alloc;
3449 #else /* not RE_ENABLE_I18N */
3450 build_equiv_class (sbcset, name)
3451 #endif /* not RE_ENABLE_I18N */
3452 re_bitset_ptr_t sbcset;
3453 const unsigned char *name;
3455 #if defined _LIBC
3456 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3457 if (nrules != 0)
3459 const int32_t *table, *indirect;
3460 const unsigned char *weights, *extra, *cp;
3461 unsigned char char_buf[2];
3462 int32_t idx1, idx2;
3463 unsigned int ch;
3464 size_t len;
3465 /* This #include defines a local function! */
3466 # include <locale/weight.h>
3467 /* Calculate the index for equivalence class. */
3468 cp = name;
3469 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3470 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3471 _NL_COLLATE_WEIGHTMB);
3472 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3473 _NL_COLLATE_EXTRAMB);
3474 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3475 _NL_COLLATE_INDIRECTMB);
3476 idx1 = findidx (&cp);
3477 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3478 /* This isn't a valid character. */
3479 return REG_ECOLLATE;
3481 /* Build single byte matcing table for this equivalence class. */
3482 char_buf[1] = (unsigned char) '\0';
3483 len = weights[idx1];
3484 for (ch = 0; ch < SBC_MAX; ++ch)
3486 char_buf[0] = ch;
3487 cp = char_buf;
3488 idx2 = findidx (&cp);
3490 idx2 = table[ch];
3492 if (idx2 == 0)
3493 /* This isn't a valid character. */
3494 continue;
3495 if (len == weights[idx2])
3497 int cnt = 0;
3498 while (cnt <= len &&
3499 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3500 ++cnt;
3502 if (cnt > len)
3503 bitset_set (sbcset, ch);
3506 /* Check whether the array has enough space. */
3507 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3509 /* Not enough, realloc it. */
3510 /* +1 in case of mbcset->nequiv_classes is 0. */
3511 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3512 /* Use realloc since the array is NULL if *alloc == 0. */
3513 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3514 int32_t,
3515 new_equiv_class_alloc);
3516 if (BE (new_equiv_classes == NULL, 0))
3517 return REG_ESPACE;
3518 mbcset->equiv_classes = new_equiv_classes;
3519 *equiv_class_alloc = new_equiv_class_alloc;
3521 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3523 else
3524 #endif /* _LIBC */
3526 if (BE (strlen ((const char *) name) != 1, 0))
3527 return REG_ECOLLATE;
3528 bitset_set (sbcset, *name);
3530 return REG_NOERROR;
3533 /* Helper function for parse_bracket_exp.
3534 Build the character class which is represented by NAME.
3535 The result are written to MBCSET and SBCSET.
3536 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3537 is a pointer argument sinse we may update it. */
3539 static reg_errcode_t
3540 #ifdef RE_ENABLE_I18N
3541 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3542 re_charset_t *mbcset;
3543 int *char_class_alloc;
3544 #else /* not RE_ENABLE_I18N */
3545 build_charclass (trans, sbcset, class_name, syntax)
3546 #endif /* not RE_ENABLE_I18N */
3547 unsigned RE_TRANSLATE_TYPE trans;
3548 re_bitset_ptr_t sbcset;
3549 const unsigned char *class_name;
3550 reg_syntax_t syntax;
3552 int i;
3553 const char *name = (const char *) class_name;
3555 /* In case of REG_ICASE "upper" and "lower" match the both of
3556 upper and lower cases. */
3557 if ((syntax & RE_ICASE)
3558 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3559 name = "alpha";
3561 #ifdef RE_ENABLE_I18N
3562 /* Check the space of the arrays. */
3563 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3565 /* Not enough, realloc it. */
3566 /* +1 in case of mbcset->nchar_classes is 0. */
3567 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3568 /* Use realloc since array is NULL if *alloc == 0. */
3569 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3570 new_char_class_alloc);
3571 if (BE (new_char_classes == NULL, 0))
3572 return REG_ESPACE;
3573 mbcset->char_classes = new_char_classes;
3574 *char_class_alloc = new_char_class_alloc;
3576 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3577 #endif /* RE_ENABLE_I18N */
3579 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3580 for (i = 0; i < SBC_MAX; ++i) \
3582 if (ctype_func (i)) \
3584 int ch = trans ? trans[i] : i; \
3585 bitset_set (sbcset, ch); \
3589 if (strcmp (name, "alnum") == 0)
3590 BUILD_CHARCLASS_LOOP (isalnum)
3591 else if (strcmp (name, "cntrl") == 0)
3592 BUILD_CHARCLASS_LOOP (iscntrl)
3593 else if (strcmp (name, "lower") == 0)
3594 BUILD_CHARCLASS_LOOP (islower)
3595 else if (strcmp (name, "space") == 0)
3596 BUILD_CHARCLASS_LOOP (isspace)
3597 else if (strcmp (name, "alpha") == 0)
3598 BUILD_CHARCLASS_LOOP (isalpha)
3599 else if (strcmp (name, "digit") == 0)
3600 BUILD_CHARCLASS_LOOP (isdigit)
3601 else if (strcmp (name, "print") == 0)
3602 BUILD_CHARCLASS_LOOP (isprint)
3603 else if (strcmp (name, "upper") == 0)
3604 BUILD_CHARCLASS_LOOP (isupper)
3605 else if (strcmp (name, "blank") == 0)
3606 BUILD_CHARCLASS_LOOP (isblank)
3607 else if (strcmp (name, "graph") == 0)
3608 BUILD_CHARCLASS_LOOP (isgraph)
3609 else if (strcmp (name, "punct") == 0)
3610 BUILD_CHARCLASS_LOOP (ispunct)
3611 else if (strcmp (name, "xdigit") == 0)
3612 BUILD_CHARCLASS_LOOP (isxdigit)
3613 else
3614 return REG_ECTYPE;
3616 return REG_NOERROR;
3619 static bin_tree_t *
3620 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3621 re_dfa_t *dfa;
3622 unsigned RE_TRANSLATE_TYPE trans;
3623 const unsigned char *class_name;
3624 const unsigned char *extra;
3625 int non_match;
3626 reg_errcode_t *err;
3628 re_bitset_ptr_t sbcset;
3629 #ifdef RE_ENABLE_I18N
3630 re_charset_t *mbcset;
3631 int alloc = 0;
3632 #endif /* not RE_ENABLE_I18N */
3633 reg_errcode_t ret;
3634 re_token_t br_token;
3635 bin_tree_t *tree;
3637 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3638 #ifdef RE_ENABLE_I18N
3639 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3640 #endif /* RE_ENABLE_I18N */
3642 #ifdef RE_ENABLE_I18N
3643 if (BE (sbcset == NULL || mbcset == NULL, 0))
3644 #else /* not RE_ENABLE_I18N */
3645 if (BE (sbcset == NULL, 0))
3646 #endif /* not RE_ENABLE_I18N */
3648 *err = REG_ESPACE;
3649 return NULL;
3652 if (non_match)
3654 #ifdef RE_ENABLE_I18N
3656 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3657 bitset_set(cset->sbcset, '\0');
3659 mbcset->non_match = 1;
3660 #endif /* not RE_ENABLE_I18N */
3663 /* We don't care the syntax in this case. */
3664 ret = build_charclass (trans, sbcset,
3665 #ifdef RE_ENABLE_I18N
3666 mbcset, &alloc,
3667 #endif /* RE_ENABLE_I18N */
3668 class_name, 0);
3670 if (BE (ret != REG_NOERROR, 0))
3672 re_free (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674 free_charset (mbcset);
3675 #endif /* RE_ENABLE_I18N */
3676 *err = ret;
3677 return NULL;
3679 /* \w match '_' also. */
3680 for (; *extra; extra++)
3681 bitset_set (sbcset, *extra);
3683 /* If it is non-matching list. */
3684 if (non_match)
3685 bitset_not (sbcset);
3687 #ifdef RE_ENABLE_I18N
3688 /* Ensure only single byte characters are set. */
3689 if (dfa->mb_cur_max > 1)
3690 bitset_mask (sbcset, dfa->sb_char);
3691 #endif
3693 /* Build a tree for simple bracket. */
3694 br_token.type = SIMPLE_BRACKET;
3695 br_token.opr.sbcset = sbcset;
3696 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3697 if (BE (tree == NULL, 0))
3698 goto build_word_op_espace;
3700 #ifdef RE_ENABLE_I18N
3701 if (dfa->mb_cur_max > 1)
3703 re_token_t alt_token;
3704 bin_tree_t *mbc_tree;
3705 /* Build a tree for complex bracket. */
3706 br_token.type = COMPLEX_BRACKET;
3707 br_token.opr.mbcset = mbcset;
3708 dfa->has_mb_node = 1;
3709 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3710 if (BE (mbc_tree == NULL, 0))
3711 goto build_word_op_espace;
3712 /* Then join them by ALT node. */
3713 alt_token.type = OP_ALT;
3714 dfa->has_plural_match = 1;
3715 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3716 if (BE (mbc_tree != NULL, 1))
3717 return tree;
3719 else
3721 free_charset (mbcset);
3722 return tree;
3724 #else /* not RE_ENABLE_I18N */
3725 return tree;
3726 #endif /* not RE_ENABLE_I18N */
3728 build_word_op_espace:
3729 re_free (sbcset);
3730 #ifdef RE_ENABLE_I18N
3731 free_charset (mbcset);
3732 #endif /* RE_ENABLE_I18N */
3733 *err = REG_ESPACE;
3734 return NULL;
3737 /* This is intended for the expressions like "a{1,3}".
3738 Fetch a number from `input', and return the number.
3739 Return -1, if the number field is empty like "{,1}".
3740 Return -2, If an error is occured. */
3742 static int
3743 fetch_number (input, token, syntax)
3744 re_string_t *input;
3745 re_token_t *token;
3746 reg_syntax_t syntax;
3748 int num = -1;
3749 unsigned char c;
3750 while (1)
3752 fetch_token (token, input, syntax);
3753 c = token->opr.c;
3754 if (BE (token->type == END_OF_RE, 0))
3755 return -2;
3756 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3757 break;
3758 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3759 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3760 num = (num > RE_DUP_MAX) ? -2 : num;
3762 return num;
3765 #ifdef RE_ENABLE_I18N
3766 static void
3767 free_charset (re_charset_t *cset)
3769 re_free (cset->mbchars);
3770 # ifdef _LIBC
3771 re_free (cset->coll_syms);
3772 re_free (cset->equiv_classes);
3773 re_free (cset->range_starts);
3774 re_free (cset->range_ends);
3775 # endif
3776 re_free (cset->char_classes);
3777 re_free (cset);
3779 #endif /* RE_ENABLE_I18N */
3781 /* Functions for binary tree operation. */
3783 /* Create a tree node. */
3785 static bin_tree_t *
3786 create_tree (dfa, left, right, type, index)
3787 re_dfa_t *dfa;
3788 bin_tree_t *left;
3789 bin_tree_t *right;
3790 re_token_type_t type;
3791 int index;
3793 bin_tree_t *tree;
3794 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3796 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3798 if (storage == NULL)
3799 return NULL;
3800 storage->next = dfa->str_tree_storage;
3801 dfa->str_tree_storage = storage;
3802 dfa->str_tree_storage_idx = 0;
3804 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3806 tree->parent = NULL;
3807 tree->left = left;
3808 tree->right = right;
3809 tree->type = type;
3810 tree->node_idx = index;
3811 tree->first = -1;
3812 tree->next = -1;
3813 re_node_set_init_empty (&tree->eclosure);
3815 if (left != NULL)
3816 left->parent = tree;
3817 if (right != NULL)
3818 right->parent = tree;
3819 return tree;
3822 /* Create both a DFA node and a tree for it. */
3824 static bin_tree_t *
3825 re_dfa_add_tree_node (dfa, left, right, token)
3826 re_dfa_t *dfa;
3827 bin_tree_t *left;
3828 bin_tree_t *right;
3829 const re_token_t *token;
3831 int new_idx = re_dfa_add_node (dfa, *token, 0);
3833 if (new_idx == -1)
3834 return NULL;
3836 return create_tree (dfa, left, right, 0, new_idx);
3839 /* Mark the tree SRC as an optional subexpression. */
3841 static void
3842 mark_opt_subexp (src, dfa)
3843 const bin_tree_t *src;
3844 re_dfa_t *dfa;
3846 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3847 a subexpression. */
3848 if (src->type == CONCAT
3849 && src->left->type == NON_TYPE
3850 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3851 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3855 /* Recursive tree walker for mark_opt_subexp. */
3857 static void
3858 mark_opt_subexp_iter (src, dfa, idx)
3859 const bin_tree_t *src;
3860 re_dfa_t *dfa;
3861 int idx;
3863 int node_idx;
3865 if (src->type == NON_TYPE)
3867 node_idx = src->node_idx;
3868 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3869 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3870 && dfa->nodes[node_idx].opr.idx == idx)
3871 dfa->nodes[node_idx].opt_subexp = 1;
3874 if (src->left != NULL)
3875 mark_opt_subexp_iter (src->left, dfa, idx);
3877 if (src->right != NULL)
3878 mark_opt_subexp_iter (src->right, dfa, idx);
3882 /* Duplicate the node SRC, and return new node. */
3884 static bin_tree_t *
3885 duplicate_tree (src, dfa)
3886 const bin_tree_t *src;
3887 re_dfa_t *dfa;
3889 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3890 int new_node_idx;
3891 /* Since node indies must be according to Post-order of the tree,
3892 we must duplicate the left at first. */
3893 if (src->left != NULL)
3895 left = duplicate_tree (src->left, dfa);
3896 if (left == NULL)
3897 return NULL;
3900 /* Secondaly, duplicate the right. */
3901 if (src->right != NULL)
3903 right = duplicate_tree (src->right, dfa);
3904 if (right == NULL)
3905 return NULL;
3908 /* At last, duplicate itself. */
3909 if (src->type == NON_TYPE)
3911 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3912 dfa->nodes[new_node_idx].duplicated = 1;
3913 if (BE (new_node_idx == -1, 0))
3914 return NULL;
3916 else
3917 new_node_idx = src->type;
3919 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3920 return new_tree;