Add new function __libc_message which performs the printing and simple format string...
[glibc.git] / posix / regcomp.c
blobba7a1cc5d47a69718ced643cbba6b466eb5ecb2d
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
50 int node, int root);
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
53 reg_syntax_t syntax);
54 static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax);
58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax);
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifndef _LIBC
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 const unsigned char *name);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
120 const unsigned char *name);
121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
127 unsigned RE_TRANSLATE_TYPE trans,
128 const unsigned char *class_name,
129 const unsigned char *extra,
130 int non_match, reg_errcode_t *err);
131 static bin_tree_t *create_tree (re_dfa_t *dfa,
132 bin_tree_t *left, bin_tree_t *right,
133 re_token_type_t type, int index);
134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
135 bin_tree_t *left, bin_tree_t *right,
136 const re_token_t *token)
137 __attribute ((noinline));
138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
142 /* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
147 const char __re_error_msgid[] attribute_hidden =
149 #define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
151 "\0"
152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
154 "\0"
155 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
157 "\0"
158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
160 "\0"
161 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
163 "\0"
164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
166 "\0"
167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
169 "\0"
170 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
172 "\0"
173 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
175 "\0"
176 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
178 "\0"
179 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
181 "\0"
182 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
184 "\0"
185 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
187 "\0"
188 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
190 "\0"
191 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
193 "\0"
194 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
196 "\0"
197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
201 const size_t __re_error_msgid_idx[] attribute_hidden =
203 REG_NOERROR_IDX,
204 REG_NOMATCH_IDX,
205 REG_BADPAT_IDX,
206 REG_ECOLLATE_IDX,
207 REG_ECTYPE_IDX,
208 REG_EESCAPE_IDX,
209 REG_ESUBREG_IDX,
210 REG_EBRACK_IDX,
211 REG_EPAREN_IDX,
212 REG_EBRACE_IDX,
213 REG_BADBR_IDX,
214 REG_ERANGE_IDX,
215 REG_ESPACE_IDX,
216 REG_BADRPT_IDX,
217 REG_EEND_IDX,
218 REG_ESIZE_IDX,
219 REG_ERPAREN_IDX
222 /* Entry points for GNU code. */
224 /* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
231 const char *
232 re_compile_pattern (pattern, length, bufp)
233 const char *pattern;
234 size_t length;
235 struct re_pattern_buffer *bufp;
237 reg_errcode_t ret;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub. */
242 bufp->no_sub = 0;
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
249 if (!ret)
250 return NULL;
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253 #ifdef _LIBC
254 weak_alias (__re_compile_pattern, re_compile_pattern)
255 #endif
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
272 reg_syntax_t
273 re_set_syntax (syntax)
274 reg_syntax_t syntax;
276 reg_syntax_t ret = re_syntax_options;
278 re_syntax_options = syntax;
279 return ret;
281 #ifdef _LIBC
282 weak_alias (__re_set_syntax, re_set_syntax)
283 #endif
286 re_compile_fastmap (bufp)
287 struct re_pattern_buffer *bufp;
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
301 return 0;
303 #ifdef _LIBC
304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
305 #endif
307 static inline void
308 __attribute ((always_inline))
309 re_set_fastmap (char *fastmap, int icase, int ch)
311 fastmap[ch] = 1;
312 if (icase)
313 fastmap[tolower (ch)] = 1;
316 /* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
319 static void
320 re_compile_fastmap_iter (bufp, init_state, fastmap)
321 regex_t *bufp;
322 const re_dfastate_t *init_state;
323 char *fastmap;
325 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
326 int node_cnt;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
333 if (type == CHARACTER)
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
339 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
340 wchar_t wc;
341 mbstate_t state;
343 p = buf;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, 0, sizeof (state));
350 if (mbrtowc (&wc, (const char *) buf, p - buf,
351 &state) == p - buf
352 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
353 re_set_fastmap (fastmap, 0, buf[0]);
355 #endif
357 else if (type == SIMPLE_BRACKET)
359 int i, j, ch;
360 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
361 for (j = 0; j < UINT_BITS; ++j, ++ch)
362 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
363 re_set_fastmap (fastmap, icase, ch);
365 #ifdef RE_ENABLE_I18N
366 else if (type == COMPLEX_BRACKET)
368 int i;
369 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
370 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
371 || cset->nranges || cset->nchar_classes)
373 # ifdef _LIBC
374 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
382 int j, ch;
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
386 for (j = 0; j < UINT_BITS; ++j, ++ch)
387 if (table[ch] < 0)
388 re_set_fastmap (fastmap, icase, ch);
390 # else
391 if (dfa->mb_cur_max > 1)
392 for (i = 0; i < SBC_MAX; ++i)
393 if (__btowc (i) == WEOF)
394 re_set_fastmap (fastmap, icase, i);
395 # endif /* not _LIBC */
397 for (i = 0; i < cset->nmbchars; ++i)
399 char buf[256];
400 mbstate_t state;
401 memset (&state, '\0', sizeof (state));
402 __wcrtomb (buf, cset->mbchars[i], &state);
403 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
404 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
407 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
411 #endif /* RE_ENABLE_I18N */
412 else if (type == OP_PERIOD
413 #ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415 #endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
421 return;
426 /* Entry point for POSIX code. */
427 /* regcomp takes a regular expression as a string and compiles it.
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
442 PATTERN is the address of the pattern string.
444 CFLAGS is a series of bits which affect compilation.
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
457 registers.
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
463 regcomp (preg, pattern, cflags)
464 regex_t *__restrict preg;
465 const char *__restrict pattern;
466 int cflags;
468 reg_errcode_t ret;
469 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC);
472 preg->buffer = NULL;
473 preg->allocated = 0;
474 preg->used = 0;
476 /* Try to allocate space for the fastmap. */
477 preg->fastmap = re_malloc (char, SBC_MAX);
478 if (BE (preg->fastmap == NULL, 0))
479 return REG_ESPACE;
481 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags & REG_NEWLINE)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax &= ~RE_DOT_NEWLINE;
487 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
488 /* It also changes the matching behavior. */
489 preg->newline_anchor = 1;
491 else
492 preg->newline_anchor = 0;
493 preg->no_sub = !!(cflags & REG_NOSUB);
494 preg->translate = NULL;
496 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret == REG_ERPAREN)
501 ret = REG_EPAREN;
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret == REG_NOERROR, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg);
508 else
510 /* Some error occurred while compiling the expression. */
511 re_free (preg->fastmap);
512 preg->fastmap = NULL;
515 return (int) ret;
517 #ifdef _LIBC
518 weak_alias (__regcomp, regcomp)
519 #endif
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
524 size_t
525 regerror (errcode, preg, errbuf, errbuf_size)
526 int errcode;
527 const regex_t *preg;
528 char *errbuf;
529 size_t errbuf_size;
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
569 #ifdef RE_ENABLE_I18N
570 /* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574 static const bitset utf8_sb_map =
576 /* Set the first 128 bits. */
577 # if UINT_MAX == 0xffffffff
578 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
579 # else
580 # error "Add case for new unsigned int size"
581 # endif
583 #endif
586 static void
587 free_dfa_content (re_dfa_t *dfa)
589 int i, j;
591 re_free (dfa->subexps);
593 if (dfa->nodes)
594 for (i = 0; i < dfa->nodes_len; ++i)
596 re_token_t *node = dfa->nodes + i;
597 #ifdef RE_ENABLE_I18N
598 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
599 free_charset (node->opr.mbcset);
600 else
601 #endif /* RE_ENABLE_I18N */
602 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
603 re_free (node->opr.sbcset);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
626 re_dfastate_t *state = entry->array[j];
627 free_state (state);
629 re_free (entry->array);
631 re_free (dfa->state_table);
632 #ifdef RE_ENABLE_I18N
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
635 #endif
636 #ifdef DEBUG
637 re_free (dfa->re_str);
638 #endif
640 re_free (dfa);
644 /* Free dynamically allocated space used by PREG. */
646 void
647 regfree (preg)
648 regex_t *preg;
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
659 re_free (preg->translate);
660 preg->translate = NULL;
662 #ifdef _LIBC
663 weak_alias (__regfree, regfree)
664 #endif
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (s)
682 const char *s;
684 reg_errcode_t ret;
685 char *fastmap;
687 if (!s)
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
694 if (re_comp_buf.buffer)
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
703 if (re_comp_buf.fastmap == NULL)
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
719 if (!ret)
720 return NULL;
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
726 #ifdef _LIBC
727 libc_freeres_fn (free_mem)
729 __regfree (&re_comp_buf);
731 #endif
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
739 static reg_errcode_t
740 re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 int length;
744 reg_syntax_t syntax;
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
773 preg->used = sizeof (re_dfa_t);
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
783 #ifdef DEBUG
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
786 #endif
788 err = re_string_construct (&regexp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (&regexp);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
807 #ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
809 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
810 optimize_utf8 (dfa);
811 #endif
813 /* Analyze the tree and collect information which is necessary to
814 create the dfa. */
815 err = analyze (dfa);
816 if (BE (err != REG_NOERROR, 0))
817 goto re_compile_internal_free_return;
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (&regexp);
826 if (BE (err != REG_NOERROR, 0))
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
833 return err;
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
839 static reg_errcode_t
840 init_dfa (dfa, pat_len)
841 re_dfa_t *dfa;
842 int pat_len;
844 int table_size;
845 #ifndef _LIBC
846 char *codeset_name;
847 #endif
849 memset (dfa, '\0', sizeof (re_dfa_t));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854 dfa->nodes_alloc = pat_len + 1;
855 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857 dfa->states_alloc = pat_len + 1;
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; table_size > 0; table_size <<= 1)
861 if (table_size > pat_len)
862 break;
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->subexps_alloc = 1;
868 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
870 dfa->mb_cur_max = MB_CUR_MAX;
871 #ifdef _LIBC
872 if (dfa->mb_cur_max == 6
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->is_utf8 = 1;
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 != 0);
877 #else
878 # ifdef HAVE_LANGINFO_CODESET
879 codeset_name = nl_langinfo (CODESET);
880 # else
881 codeset_name = getenv ("LC_ALL");
882 if (codeset_name == NULL || codeset[0] == '\0')
883 codeset_name = getenv ("LC_CTYPE");
884 if (codeset_name == NULL || codeset[0] == '\0')
885 codeset_name = getenv ("LANG");
886 if (codeset_name == NULL)
887 codeset_name = "";
888 else if (strchr (codeset_name, '.') != NULL)
889 codeset_name = strchr (codeset_name, '.') + 1;
890 # endif
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
894 dfa->is_utf8 = 1;
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
899 #endif
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
904 if (dfa->is_utf8)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 else
908 int i, j, ch;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
914 /* Clear all bits by, then set those corresponding to single
915 byte chars. */
916 bitset_empty (dfa->sb_char);
918 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
919 for (j = 0; j < UINT_BITS; ++j, ++ch)
921 wchar_t wch = __btowc (ch);
922 if (wch != WEOF)
923 dfa->sb_char[i] |= 1 << j;
924 # ifndef _LIBC
925 if (isascii (ch) && wch != (wchar_t) ch)
926 dfa->map_notascii = 1;
927 # endif
931 #endif
933 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
934 || dfa->subexps == NULL, 0))
935 return REG_ESPACE;
936 return REG_NOERROR;
939 /* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
943 static void
944 init_word_char (dfa)
945 re_dfa_t *dfa;
947 int i, j, ch;
948 dfa->word_ops_used = 1;
949 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
950 for (j = 0; j < UINT_BITS; ++j, ++ch)
951 if (isalnum (ch) || ch == '_')
952 dfa->word_char[i] |= 1 << j;
955 /* Free the work area which are only used while compiling. */
957 static void
958 free_workarea_compile (preg)
959 regex_t *preg;
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962 bin_tree_storage_t *storage, *next;
963 for (storage = dfa->str_tree_storage; storage; storage = next)
965 next = storage->next;
966 re_free (storage);
968 dfa->str_tree_storage = NULL;
969 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
970 dfa->str_tree = NULL;
971 re_free (dfa->org_indices);
972 dfa->org_indices = NULL;
975 /* Create initial states for all contexts. */
977 static reg_errcode_t
978 create_initial_state (dfa)
979 re_dfa_t *dfa;
981 int first, i;
982 reg_errcode_t err;
983 re_node_set init_nodes;
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first = dfa->str_tree->first;
988 dfa->init_node = first;
989 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
990 if (BE (err != REG_NOERROR, 0))
991 return err;
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa->nbackref > 0)
998 for (i = 0; i < init_nodes.nelem; ++i)
1000 int node_idx = init_nodes.elems[i];
1001 re_token_type_t type = dfa->nodes[node_idx].type;
1003 int clexp_idx;
1004 if (type != OP_BACK_REF)
1005 continue;
1006 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1008 re_token_t *clexp_node;
1009 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010 if (clexp_node->type == OP_CLOSE_SUBEXP
1011 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
1012 break;
1014 if (clexp_idx == init_nodes.nelem)
1015 continue;
1017 if (type == OP_BACK_REF)
1019 int dest_idx = dfa->edests[node_idx].elems[0];
1020 if (!re_node_set_contains (&init_nodes, dest_idx))
1022 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023 i = 0;
1028 /* It must be the first time to invoke acquire_state. */
1029 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
1031 if (BE (dfa->init_state == NULL, 0))
1032 return err;
1033 if (dfa->init_state->has_constraint)
1035 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036 CONTEXT_WORD);
1037 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038 CONTEXT_NEWLINE);
1039 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040 &init_nodes,
1041 CONTEXT_NEWLINE
1042 | CONTEXT_BEGBUF);
1043 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044 || dfa->init_state_begbuf == NULL, 0))
1045 return err;
1047 else
1048 dfa->init_state_word = dfa->init_state_nl
1049 = dfa->init_state_begbuf = dfa->init_state;
1051 re_node_set_free (&init_nodes);
1052 return REG_NOERROR;
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1060 static void
1061 optimize_utf8 (dfa)
1062 re_dfa_t *dfa;
1064 int node, i, mb_chars = 0, has_period = 0;
1066 for (node = 0; node < dfa->nodes_len; ++node)
1067 switch (dfa->nodes[node].type)
1069 case CHARACTER:
1070 if (dfa->nodes[node].opr.c >= 0x80)
1071 mb_chars = 1;
1072 break;
1073 case ANCHOR:
1074 switch (dfa->nodes[node].opr.idx)
1076 case LINE_FIRST:
1077 case LINE_LAST:
1078 case BUF_FIRST:
1079 case BUF_LAST:
1080 break;
1081 default:
1082 /* Word anchors etc. cannot be handled. */
1083 return;
1085 break;
1086 case OP_PERIOD:
1087 has_period = 1;
1088 break;
1089 case OP_BACK_REF:
1090 case OP_ALT:
1091 case END_OF_RE:
1092 case OP_DUP_ASTERISK:
1093 case OP_DUP_QUESTION:
1094 case OP_OPEN_SUBEXP:
1095 case OP_CLOSE_SUBEXP:
1096 break;
1097 case SIMPLE_BRACKET:
1098 /* Just double check. */
1099 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100 if (dfa->nodes[node].opr.sbcset[i])
1101 return;
1102 break;
1103 default:
1104 return;
1107 if (mb_chars || has_period)
1108 for (node = 0; node < dfa->nodes_len; ++node)
1110 if (dfa->nodes[node].type == CHARACTER
1111 && dfa->nodes[node].opr.c >= 0x80)
1112 dfa->nodes[node].mb_partial = 0;
1113 else if (dfa->nodes[node].type == OP_PERIOD)
1114 dfa->nodes[node].type = OP_UTF8_PERIOD;
1117 /* The search can be in single byte locale. */
1118 dfa->mb_cur_max = 1;
1119 dfa->is_utf8 = 0;
1120 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1122 #endif
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125 "eclosure", and "inveclosure". */
1127 static reg_errcode_t
1128 analyze (dfa)
1129 re_dfa_t *dfa;
1131 int i;
1132 reg_errcode_t ret;
1134 /* Allocate arrays. */
1135 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1136 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1137 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1142 return REG_ESPACE;
1143 /* Initialize them. */
1144 for (i = 0; i < dfa->nodes_len; ++i)
1146 dfa->nexts[i] = -1;
1147 re_node_set_init_empty (dfa->edests + i);
1148 re_node_set_init_empty (dfa->eclosures + i);
1149 re_node_set_init_empty (dfa->inveclosures + i);
1152 ret = analyze_tree (dfa, dfa->str_tree);
1153 if (BE (ret == REG_NOERROR, 1))
1155 ret = calc_eclosure (dfa);
1156 if (ret == REG_NOERROR)
1157 calc_inveclosure (dfa);
1159 return ret;
1162 /* Helper functions for analyze.
1163 This function calculate "first", "next", and "edest" for the subtree
1164 whose root is NODE. */
1166 static reg_errcode_t
1167 analyze_tree (dfa, node)
1168 re_dfa_t *dfa;
1169 bin_tree_t *node;
1171 reg_errcode_t ret;
1172 if (node->first == -1)
1173 calc_first (dfa, node);
1174 if (node->next == -1)
1175 calc_next (dfa, node);
1176 if (node->eclosure.nelem == 0)
1177 calc_epsdest (dfa, node);
1178 /* Calculate "first" etc. for the left child. */
1179 if (node->left != NULL)
1181 ret = analyze_tree (dfa, node->left);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1185 /* Calculate "first" etc. for the right child. */
1186 if (node->right != NULL)
1188 ret = analyze_tree (dfa, node->right);
1189 if (BE (ret != REG_NOERROR, 0))
1190 return ret;
1192 return REG_NOERROR;
1195 /* Calculate "first" for the node NODE. */
1196 static void
1197 calc_first (dfa, node)
1198 re_dfa_t *dfa;
1199 bin_tree_t *node;
1201 int idx, type;
1202 idx = node->node_idx;
1203 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1205 switch (type)
1207 #ifdef DEBUG
1208 case OP_OPEN_BRACKET:
1209 case OP_CLOSE_BRACKET:
1210 case OP_OPEN_DUP_NUM:
1211 case OP_CLOSE_DUP_NUM:
1212 case OP_DUP_PLUS:
1213 case OP_NON_MATCH_LIST:
1214 case OP_OPEN_COLL_ELEM:
1215 case OP_CLOSE_COLL_ELEM:
1216 case OP_OPEN_EQUIV_CLASS:
1217 case OP_CLOSE_EQUIV_CLASS:
1218 case OP_OPEN_CHAR_CLASS:
1219 case OP_CLOSE_CHAR_CLASS:
1220 /* These must not appear here. */
1221 assert (0);
1222 #endif
1223 case END_OF_RE:
1224 case CHARACTER:
1225 case OP_PERIOD:
1226 case OP_DUP_ASTERISK:
1227 case OP_DUP_QUESTION:
1228 #ifdef RE_ENABLE_I18N
1229 case OP_UTF8_PERIOD:
1230 case COMPLEX_BRACKET:
1231 #endif /* RE_ENABLE_I18N */
1232 case SIMPLE_BRACKET:
1233 case OP_BACK_REF:
1234 case ANCHOR:
1235 case OP_OPEN_SUBEXP:
1236 case OP_CLOSE_SUBEXP:
1237 node->first = idx;
1238 break;
1239 case OP_ALT:
1240 node->first = idx;
1241 break;
1242 /* else fall through */
1243 default:
1244 #ifdef DEBUG
1245 assert (node->left != NULL);
1246 #endif
1247 if (node->left->first == -1)
1248 calc_first (dfa, node->left);
1249 node->first = node->left->first;
1250 break;
1254 /* Calculate "next" for the node NODE. */
1256 static void
1257 calc_next (dfa, node)
1258 re_dfa_t *dfa;
1259 bin_tree_t *node;
1261 int idx, type;
1262 bin_tree_t *parent = node->parent;
1263 if (parent == NULL)
1265 node->next = -1;
1266 idx = node->node_idx;
1267 if (node->type == 0)
1268 dfa->nexts[idx] = node->next;
1269 return;
1272 idx = parent->node_idx;
1273 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1275 switch (type)
1277 case OP_DUP_ASTERISK:
1278 node->next = idx;
1279 break;
1280 case CONCAT:
1281 if (parent->left == node)
1283 if (parent->right->first == -1)
1284 calc_first (dfa, parent->right);
1285 node->next = parent->right->first;
1286 break;
1288 /* else fall through */
1289 default:
1290 if (parent->next == -1)
1291 calc_next (dfa, parent);
1292 node->next = parent->next;
1293 break;
1295 idx = node->node_idx;
1296 if (node->type == 0)
1297 dfa->nexts[idx] = node->next;
1300 /* Calculate "edest" for the node NODE. */
1302 static void
1303 calc_epsdest (dfa, node)
1304 re_dfa_t *dfa;
1305 bin_tree_t *node;
1307 int idx;
1308 idx = node->node_idx;
1309 if (node->type == 0)
1311 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1312 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1314 if (node->left->first == -1)
1315 calc_first (dfa, node->left);
1316 if (node->next == -1)
1317 calc_next (dfa, node);
1318 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1319 node->next);
1321 else if (dfa->nodes[idx].type == OP_ALT)
1323 int left, right;
1324 if (node->left != NULL)
1326 if (node->left->first == -1)
1327 calc_first (dfa, node->left);
1328 left = node->left->first;
1330 else
1332 if (node->next == -1)
1333 calc_next (dfa, node);
1334 left = node->next;
1336 if (node->right != NULL)
1338 if (node->right->first == -1)
1339 calc_first (dfa, node->right);
1340 right = node->right->first;
1342 else
1344 if (node->next == -1)
1345 calc_next (dfa, node);
1346 right = node->next;
1348 re_node_set_init_2 (dfa->edests + idx, left, right);
1350 else if (dfa->nodes[idx].type == ANCHOR
1351 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1352 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1353 || dfa->nodes[idx].type == OP_BACK_REF)
1354 re_node_set_init_1 (dfa->edests + idx, node->next);
1355 else
1356 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1360 /* Duplicate the epsilon closure of the node ROOT_NODE.
1361 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362 to their own constraint. */
1364 static reg_errcode_t
1365 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1366 init_constraint)
1367 re_dfa_t *dfa;
1368 int top_org_node, top_clone_node, root_node;
1369 unsigned int init_constraint;
1371 reg_errcode_t err;
1372 int org_node, clone_node, ret;
1373 unsigned int constraint = init_constraint;
1374 for (org_node = top_org_node, clone_node = top_clone_node;;)
1376 int org_dest, clone_dest;
1377 if (dfa->nodes[org_node].type == OP_BACK_REF)
1379 /* If the back reference epsilon-transit, its destination must
1380 also have the constraint. Then duplicate the epsilon closure
1381 of the destination of the back reference, and store it in
1382 edests of the back reference. */
1383 org_dest = dfa->nexts[org_node];
1384 re_node_set_empty (dfa->edests + clone_node);
1385 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1386 if (BE (err != REG_NOERROR, 0))
1387 return err;
1388 dfa->nexts[clone_node] = dfa->nexts[org_node];
1389 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1390 if (BE (ret < 0, 0))
1391 return REG_ESPACE;
1393 else if (dfa->edests[org_node].nelem == 0)
1395 /* In case of the node can't epsilon-transit, don't duplicate the
1396 destination and store the original destination as the
1397 destination of the node. */
1398 dfa->nexts[clone_node] = dfa->nexts[org_node];
1399 break;
1401 else if (dfa->edests[org_node].nelem == 1)
1403 /* In case of the node can epsilon-transit, and it has only one
1404 destination. */
1405 org_dest = dfa->edests[org_node].elems[0];
1406 re_node_set_empty (dfa->edests + clone_node);
1407 if (dfa->nodes[org_node].type == ANCHOR)
1409 /* In case of the node has another constraint, append it. */
1410 if (org_node == root_node && clone_node != org_node)
1412 /* ...but if the node is root_node itself, it means the
1413 epsilon closure have a loop, then tie it to the
1414 destination of the root_node. */
1415 ret = re_node_set_insert (dfa->edests + clone_node,
1416 org_dest);
1417 if (BE (ret < 0, 0))
1418 return REG_ESPACE;
1419 break;
1421 constraint |= dfa->nodes[org_node].opr.ctx_type;
1423 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1424 if (BE (err != REG_NOERROR, 0))
1425 return err;
1426 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1427 if (BE (ret < 0, 0))
1428 return REG_ESPACE;
1430 else /* dfa->edests[org_node].nelem == 2 */
1432 /* In case of the node can epsilon-transit, and it has two
1433 destinations. E.g. '|', '*', '+', '?'. */
1434 org_dest = dfa->edests[org_node].elems[0];
1435 re_node_set_empty (dfa->edests + clone_node);
1436 /* Search for a duplicated node which satisfies the constraint. */
1437 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1438 if (clone_dest == -1)
1440 /* There are no such a duplicated node, create a new one. */
1441 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1442 if (BE (err != REG_NOERROR, 0))
1443 return err;
1444 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1445 if (BE (ret < 0, 0))
1446 return REG_ESPACE;
1447 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1448 root_node, constraint);
1449 if (BE (err != REG_NOERROR, 0))
1450 return err;
1452 else
1454 /* There are a duplicated node which satisfy the constraint,
1455 use it to avoid infinite loop. */
1456 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457 if (BE (ret < 0, 0))
1458 return REG_ESPACE;
1461 org_dest = dfa->edests[org_node].elems[1];
1462 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1463 if (BE (err != REG_NOERROR, 0))
1464 return err;
1465 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466 if (BE (ret < 0, 0))
1467 return REG_ESPACE;
1469 org_node = org_dest;
1470 clone_node = clone_dest;
1472 return REG_NOERROR;
1475 /* Search for a node which is duplicated from the node ORG_NODE, and
1476 satisfies the constraint CONSTRAINT. */
1478 static int
1479 search_duplicated_node (dfa, org_node, constraint)
1480 re_dfa_t *dfa;
1481 int org_node;
1482 unsigned int constraint;
1484 int idx;
1485 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1487 if (org_node == dfa->org_indices[idx]
1488 && constraint == dfa->nodes[idx].constraint)
1489 return idx; /* Found. */
1491 return -1; /* Not found. */
1494 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496 otherwise return the error code. */
1498 static reg_errcode_t
1499 duplicate_node (new_idx, dfa, org_idx, constraint)
1500 re_dfa_t *dfa;
1501 int *new_idx, org_idx;
1502 unsigned int constraint;
1504 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1505 if (BE (dup_idx == -1, 0))
1506 return REG_ESPACE;
1507 dfa->nodes[dup_idx].constraint = constraint;
1508 if (dfa->nodes[org_idx].type == ANCHOR)
1509 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1510 dfa->nodes[dup_idx].duplicated = 1;
1511 re_node_set_init_empty (dfa->edests + dup_idx);
1512 re_node_set_init_empty (dfa->eclosures + dup_idx);
1513 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1515 /* Store the index of the original node. */
1516 dfa->org_indices[dup_idx] = org_idx;
1517 *new_idx = dup_idx;
1518 return REG_NOERROR;
1521 static void
1522 calc_inveclosure (dfa)
1523 re_dfa_t *dfa;
1525 int src, idx, dest;
1526 for (src = 0; src < dfa->nodes_len; ++src)
1528 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1530 dest = dfa->eclosures[src].elems[idx];
1531 re_node_set_insert (dfa->inveclosures + dest, src);
1536 /* Calculate "eclosure" for all the node in DFA. */
1538 static reg_errcode_t
1539 calc_eclosure (dfa)
1540 re_dfa_t *dfa;
1542 int node_idx, incomplete;
1543 #ifdef DEBUG
1544 assert (dfa->nodes_len > 0);
1545 #endif
1546 incomplete = 0;
1547 /* For each nodes, calculate epsilon closure. */
1548 for (node_idx = 0; ; ++node_idx)
1550 reg_errcode_t err;
1551 re_node_set eclosure_elem;
1552 if (node_idx == dfa->nodes_len)
1554 if (!incomplete)
1555 break;
1556 incomplete = 0;
1557 node_idx = 0;
1560 #ifdef DEBUG
1561 assert (dfa->eclosures[node_idx].nelem != -1);
1562 #endif
1563 /* If we have already calculated, skip it. */
1564 if (dfa->eclosures[node_idx].nelem != 0)
1565 continue;
1566 /* Calculate epsilon closure of `node_idx'. */
1567 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1568 if (BE (err != REG_NOERROR, 0))
1569 return err;
1571 if (dfa->eclosures[node_idx].nelem == 0)
1573 incomplete = 1;
1574 re_node_set_free (&eclosure_elem);
1577 return REG_NOERROR;
1580 /* Calculate epsilon closure of NODE. */
1582 static reg_errcode_t
1583 calc_eclosure_iter (new_set, dfa, node, root)
1584 re_node_set *new_set;
1585 re_dfa_t *dfa;
1586 int node, root;
1588 reg_errcode_t err;
1589 unsigned int constraint;
1590 int i, incomplete;
1591 re_node_set eclosure;
1592 incomplete = 0;
1593 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1594 if (BE (err != REG_NOERROR, 0))
1595 return err;
1597 /* This indicates that we are calculating this node now.
1598 We reference this value to avoid infinite loop. */
1599 dfa->eclosures[node].nelem = -1;
1601 constraint = ((dfa->nodes[node].type == ANCHOR)
1602 ? dfa->nodes[node].opr.ctx_type : 0);
1603 /* If the current node has constraints, duplicate all nodes.
1604 Since they must inherit the constraints. */
1605 if (constraint
1606 && dfa->edests[node].nelem
1607 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1609 int org_node, cur_node;
1610 org_node = cur_node = node;
1611 err = duplicate_node_closure (dfa, node, node, node, constraint);
1612 if (BE (err != REG_NOERROR, 0))
1613 return err;
1616 /* Expand each epsilon destination nodes. */
1617 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1618 for (i = 0; i < dfa->edests[node].nelem; ++i)
1620 re_node_set eclosure_elem;
1621 int edest = dfa->edests[node].elems[i];
1622 /* If calculating the epsilon closure of `edest' is in progress,
1623 return intermediate result. */
1624 if (dfa->eclosures[edest].nelem == -1)
1626 incomplete = 1;
1627 continue;
1629 /* If we haven't calculated the epsilon closure of `edest' yet,
1630 calculate now. Otherwise use calculated epsilon closure. */
1631 if (dfa->eclosures[edest].nelem == 0)
1633 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1634 if (BE (err != REG_NOERROR, 0))
1635 return err;
1637 else
1638 eclosure_elem = dfa->eclosures[edest];
1639 /* Merge the epsilon closure of `edest'. */
1640 re_node_set_merge (&eclosure, &eclosure_elem);
1641 /* If the epsilon closure of `edest' is incomplete,
1642 the epsilon closure of this node is also incomplete. */
1643 if (dfa->eclosures[edest].nelem == 0)
1645 incomplete = 1;
1646 re_node_set_free (&eclosure_elem);
1650 /* Epsilon closures include itself. */
1651 re_node_set_insert (&eclosure, node);
1652 if (incomplete && !root)
1653 dfa->eclosures[node].nelem = 0;
1654 else
1655 dfa->eclosures[node] = eclosure;
1656 *new_set = eclosure;
1657 return REG_NOERROR;
1660 /* Functions for token which are used in the parser. */
1662 /* Fetch a token from INPUT.
1663 We must not use this function inside bracket expressions. */
1665 static void
1666 fetch_token (result, input, syntax)
1667 re_token_t *result;
1668 re_string_t *input;
1669 reg_syntax_t syntax;
1671 re_string_skip_bytes (input, peek_token (result, input, syntax));
1674 /* Peek a token from INPUT, and return the length of the token.
1675 We must not use this function inside bracket expressions. */
1677 static int
1678 peek_token (token, input, syntax)
1679 re_token_t *token;
1680 re_string_t *input;
1681 reg_syntax_t syntax;
1683 unsigned char c;
1685 if (re_string_eoi (input))
1687 token->type = END_OF_RE;
1688 return 0;
1691 c = re_string_peek_byte (input, 0);
1692 token->opr.c = c;
1694 token->word_char = 0;
1695 #ifdef RE_ENABLE_I18N
1696 token->mb_partial = 0;
1697 if (input->mb_cur_max > 1 &&
1698 !re_string_first_byte (input, re_string_cur_idx (input)))
1700 token->type = CHARACTER;
1701 token->mb_partial = 1;
1702 return 1;
1704 #endif
1705 if (c == '\\')
1707 unsigned char c2;
1708 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1710 token->type = BACK_SLASH;
1711 return 1;
1714 c2 = re_string_peek_byte_case (input, 1);
1715 token->opr.c = c2;
1716 token->type = CHARACTER;
1717 #ifdef RE_ENABLE_I18N
1718 if (input->mb_cur_max > 1)
1720 wint_t wc = re_string_wchar_at (input,
1721 re_string_cur_idx (input) + 1);
1722 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1724 else
1725 #endif
1726 token->word_char = IS_WORD_CHAR (c2) != 0;
1728 switch (c2)
1730 case '|':
1731 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1732 token->type = OP_ALT;
1733 break;
1734 case '1': case '2': case '3': case '4': case '5':
1735 case '6': case '7': case '8': case '9':
1736 if (!(syntax & RE_NO_BK_REFS))
1738 token->type = OP_BACK_REF;
1739 token->opr.idx = c2 - '0';
1741 break;
1742 case '<':
1743 if (!(syntax & RE_NO_GNU_OPS))
1745 token->type = ANCHOR;
1746 token->opr.ctx_type = WORD_FIRST;
1748 break;
1749 case '>':
1750 if (!(syntax & RE_NO_GNU_OPS))
1752 token->type = ANCHOR;
1753 token->opr.ctx_type = WORD_LAST;
1755 break;
1756 case 'b':
1757 if (!(syntax & RE_NO_GNU_OPS))
1759 token->type = ANCHOR;
1760 token->opr.ctx_type = WORD_DELIM;
1762 break;
1763 case 'B':
1764 if (!(syntax & RE_NO_GNU_OPS))
1766 token->type = ANCHOR;
1767 token->opr.ctx_type = INSIDE_WORD;
1769 break;
1770 case 'w':
1771 if (!(syntax & RE_NO_GNU_OPS))
1772 token->type = OP_WORD;
1773 break;
1774 case 'W':
1775 if (!(syntax & RE_NO_GNU_OPS))
1776 token->type = OP_NOTWORD;
1777 break;
1778 case 's':
1779 if (!(syntax & RE_NO_GNU_OPS))
1780 token->type = OP_SPACE;
1781 break;
1782 case 'S':
1783 if (!(syntax & RE_NO_GNU_OPS))
1784 token->type = OP_NOTSPACE;
1785 break;
1786 case '`':
1787 if (!(syntax & RE_NO_GNU_OPS))
1789 token->type = ANCHOR;
1790 token->opr.ctx_type = BUF_FIRST;
1792 break;
1793 case '\'':
1794 if (!(syntax & RE_NO_GNU_OPS))
1796 token->type = ANCHOR;
1797 token->opr.ctx_type = BUF_LAST;
1799 break;
1800 case '(':
1801 if (!(syntax & RE_NO_BK_PARENS))
1802 token->type = OP_OPEN_SUBEXP;
1803 break;
1804 case ')':
1805 if (!(syntax & RE_NO_BK_PARENS))
1806 token->type = OP_CLOSE_SUBEXP;
1807 break;
1808 case '+':
1809 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1810 token->type = OP_DUP_PLUS;
1811 break;
1812 case '?':
1813 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1814 token->type = OP_DUP_QUESTION;
1815 break;
1816 case '{':
1817 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1818 token->type = OP_OPEN_DUP_NUM;
1819 break;
1820 case '}':
1821 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1822 token->type = OP_CLOSE_DUP_NUM;
1823 break;
1824 default:
1825 break;
1827 return 2;
1830 token->type = CHARACTER;
1831 #ifdef RE_ENABLE_I18N
1832 if (input->mb_cur_max > 1)
1834 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1835 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1837 else
1838 #endif
1839 token->word_char = IS_WORD_CHAR (token->opr.c);
1841 switch (c)
1843 case '\n':
1844 if (syntax & RE_NEWLINE_ALT)
1845 token->type = OP_ALT;
1846 break;
1847 case '|':
1848 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1849 token->type = OP_ALT;
1850 break;
1851 case '*':
1852 token->type = OP_DUP_ASTERISK;
1853 break;
1854 case '+':
1855 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1856 token->type = OP_DUP_PLUS;
1857 break;
1858 case '?':
1859 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1860 token->type = OP_DUP_QUESTION;
1861 break;
1862 case '{':
1863 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1864 token->type = OP_OPEN_DUP_NUM;
1865 break;
1866 case '}':
1867 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1868 token->type = OP_CLOSE_DUP_NUM;
1869 break;
1870 case '(':
1871 if (syntax & RE_NO_BK_PARENS)
1872 token->type = OP_OPEN_SUBEXP;
1873 break;
1874 case ')':
1875 if (syntax & RE_NO_BK_PARENS)
1876 token->type = OP_CLOSE_SUBEXP;
1877 break;
1878 case '[':
1879 token->type = OP_OPEN_BRACKET;
1880 break;
1881 case '.':
1882 token->type = OP_PERIOD;
1883 break;
1884 case '^':
1885 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1886 re_string_cur_idx (input) != 0)
1888 char prev = re_string_peek_byte (input, -1);
1889 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1890 break;
1892 token->type = ANCHOR;
1893 token->opr.ctx_type = LINE_FIRST;
1894 break;
1895 case '$':
1896 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1897 re_string_cur_idx (input) + 1 != re_string_length (input))
1899 re_token_t next;
1900 re_string_skip_bytes (input, 1);
1901 peek_token (&next, input, syntax);
1902 re_string_skip_bytes (input, -1);
1903 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1904 break;
1906 token->type = ANCHOR;
1907 token->opr.ctx_type = LINE_LAST;
1908 break;
1909 default:
1910 break;
1912 return 1;
1915 /* Peek a token from INPUT, and return the length of the token.
1916 We must not use this function out of bracket expressions. */
1918 static int
1919 peek_token_bracket (token, input, syntax)
1920 re_token_t *token;
1921 re_string_t *input;
1922 reg_syntax_t syntax;
1924 unsigned char c;
1925 if (re_string_eoi (input))
1927 token->type = END_OF_RE;
1928 return 0;
1930 c = re_string_peek_byte (input, 0);
1931 token->opr.c = c;
1933 #ifdef RE_ENABLE_I18N
1934 if (input->mb_cur_max > 1 &&
1935 !re_string_first_byte (input, re_string_cur_idx (input)))
1937 token->type = CHARACTER;
1938 return 1;
1940 #endif /* RE_ENABLE_I18N */
1942 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1943 && re_string_cur_idx (input) + 1 < re_string_length (input))
1945 /* In this case, '\' escape a character. */
1946 unsigned char c2;
1947 re_string_skip_bytes (input, 1);
1948 c2 = re_string_peek_byte (input, 0);
1949 token->opr.c = c2;
1950 token->type = CHARACTER;
1951 return 1;
1953 if (c == '[') /* '[' is a special char in a bracket exps. */
1955 unsigned char c2;
1956 int token_len;
1957 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1958 c2 = re_string_peek_byte (input, 1);
1959 else
1960 c2 = 0;
1961 token->opr.c = c2;
1962 token_len = 2;
1963 switch (c2)
1965 case '.':
1966 token->type = OP_OPEN_COLL_ELEM;
1967 break;
1968 case '=':
1969 token->type = OP_OPEN_EQUIV_CLASS;
1970 break;
1971 case ':':
1972 if (syntax & RE_CHAR_CLASSES)
1974 token->type = OP_OPEN_CHAR_CLASS;
1975 break;
1977 /* else fall through. */
1978 default:
1979 token->type = CHARACTER;
1980 token->opr.c = c;
1981 token_len = 1;
1982 break;
1984 return token_len;
1986 switch (c)
1988 case '-':
1989 token->type = OP_CHARSET_RANGE;
1990 break;
1991 case ']':
1992 token->type = OP_CLOSE_BRACKET;
1993 break;
1994 case '^':
1995 token->type = OP_NON_MATCH_LIST;
1996 break;
1997 default:
1998 token->type = CHARACTER;
2000 return 1;
2003 /* Functions for parser. */
2005 /* Entry point of the parser.
2006 Parse the regular expression REGEXP and return the structure tree.
2007 If an error is occured, ERR is set by error code, and return NULL.
2008 This function build the following tree, from regular expression <reg_exp>:
2012 <reg_exp> EOR
2014 CAT means concatenation.
2015 EOR means end of regular expression. */
2017 static bin_tree_t *
2018 parse (regexp, preg, syntax, err)
2019 re_string_t *regexp;
2020 regex_t *preg;
2021 reg_syntax_t syntax;
2022 reg_errcode_t *err;
2024 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2025 bin_tree_t *tree, *eor, *root;
2026 re_token_t current_token;
2027 dfa->syntax = syntax;
2028 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2029 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2030 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2031 return NULL;
2032 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2033 if (tree != NULL)
2034 root = create_tree (dfa, tree, eor, CONCAT, 0);
2035 else
2036 root = eor;
2037 if (BE (eor == NULL || root == NULL, 0))
2039 *err = REG_ESPACE;
2040 return NULL;
2042 return root;
2045 /* This function build the following tree, from regular expression
2046 <branch1>|<branch2>:
2050 <branch1> <branch2>
2052 ALT means alternative, which represents the operator `|'. */
2054 static bin_tree_t *
2055 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2056 re_string_t *regexp;
2057 regex_t *preg;
2058 re_token_t *token;
2059 reg_syntax_t syntax;
2060 int nest;
2061 reg_errcode_t *err;
2063 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2064 bin_tree_t *tree, *branch = NULL;
2065 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2066 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067 return NULL;
2069 while (token->type == OP_ALT)
2071 re_token_t alt_token = *token;
2072 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073 if (token->type != OP_ALT && token->type != END_OF_RE
2074 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2076 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2077 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2078 return NULL;
2080 else
2081 branch = NULL;
2082 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2083 if (BE (tree == NULL, 0))
2085 *err = REG_ESPACE;
2086 return NULL;
2088 dfa->has_plural_match = 1;
2090 return tree;
2093 /* This function build the following tree, from regular expression
2094 <exp1><exp2>:
2098 <exp1> <exp2>
2100 CAT means concatenation. */
2102 static bin_tree_t *
2103 parse_branch (regexp, preg, token, syntax, nest, err)
2104 re_string_t *regexp;
2105 regex_t *preg;
2106 re_token_t *token;
2107 reg_syntax_t syntax;
2108 int nest;
2109 reg_errcode_t *err;
2111 bin_tree_t *tree, *exp;
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2114 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115 return NULL;
2117 while (token->type != OP_ALT && token->type != END_OF_RE
2118 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2121 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 return NULL;
2125 if (tree != NULL && exp != NULL)
2127 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2128 if (tree == NULL)
2130 *err = REG_ESPACE;
2131 return NULL;
2134 else if (tree == NULL)
2135 tree = exp;
2136 /* Otherwise exp == NULL, we don't need to create new tree. */
2138 return tree;
2141 /* This function build the following tree, from regular expression a*:
2147 static bin_tree_t *
2148 parse_expression (regexp, preg, token, syntax, nest, err)
2149 re_string_t *regexp;
2150 regex_t *preg;
2151 re_token_t *token;
2152 reg_syntax_t syntax;
2153 int nest;
2154 reg_errcode_t *err;
2156 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2157 bin_tree_t *tree;
2158 switch (token->type)
2160 case CHARACTER:
2161 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2162 if (BE (tree == NULL, 0))
2164 *err = REG_ESPACE;
2165 return NULL;
2167 #ifdef RE_ENABLE_I18N
2168 if (dfa->mb_cur_max > 1)
2170 while (!re_string_eoi (regexp)
2171 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2173 bin_tree_t *mbc_remain;
2174 fetch_token (token, regexp, syntax);
2175 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2176 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2177 if (BE (mbc_remain == NULL || tree == NULL, 0))
2179 *err = REG_ESPACE;
2180 return NULL;
2184 #endif
2185 break;
2186 case OP_OPEN_SUBEXP:
2187 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 return NULL;
2190 break;
2191 case OP_OPEN_BRACKET:
2192 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2193 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2194 return NULL;
2195 break;
2196 case OP_BACK_REF:
2197 if (BE (preg->re_nsub < token->opr.idx
2198 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2200 *err = REG_ESUBREG;
2201 return NULL;
2203 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2204 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2205 if (BE (tree == NULL, 0))
2207 *err = REG_ESPACE;
2208 return NULL;
2210 ++dfa->nbackref;
2211 dfa->has_mb_node = 1;
2212 break;
2213 case OP_OPEN_DUP_NUM:
2214 if (syntax & RE_CONTEXT_INVALID_DUP)
2216 *err = REG_BADRPT;
2217 return NULL;
2219 /* FALLTHROUGH */
2220 case OP_DUP_ASTERISK:
2221 case OP_DUP_PLUS:
2222 case OP_DUP_QUESTION:
2223 if (syntax & RE_CONTEXT_INVALID_OPS)
2225 *err = REG_BADRPT;
2226 return NULL;
2228 else if (syntax & RE_CONTEXT_INDEP_OPS)
2230 fetch_token (token, regexp, syntax);
2231 return parse_expression (regexp, preg, token, syntax, nest, err);
2233 /* else fall through */
2234 case OP_CLOSE_SUBEXP:
2235 if ((token->type == OP_CLOSE_SUBEXP) &&
2236 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2238 *err = REG_ERPAREN;
2239 return NULL;
2241 /* else fall through */
2242 case OP_CLOSE_DUP_NUM:
2243 /* We treat it as a normal character. */
2245 /* Then we can these characters as normal characters. */
2246 token->type = CHARACTER;
2247 /* mb_partial and word_char bits should be initialized already
2248 by peek_token. */
2249 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2250 if (BE (tree == NULL, 0))
2252 *err = REG_ESPACE;
2253 return NULL;
2255 break;
2256 case ANCHOR:
2257 if ((token->opr.ctx_type
2258 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2259 && dfa->word_ops_used == 0)
2260 init_word_char (dfa);
2261 if (token->opr.ctx_type == WORD_DELIM)
2263 bin_tree_t *tree_first, *tree_last;
2264 token->opr.ctx_type = WORD_FIRST;
2265 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2266 token->opr.ctx_type = WORD_LAST;
2267 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2268 token->type = OP_ALT;
2269 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2270 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2272 *err = REG_ESPACE;
2273 return NULL;
2276 else
2278 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2279 if (BE (tree == NULL, 0))
2281 *err = REG_ESPACE;
2282 return NULL;
2285 /* We must return here, since ANCHORs can't be followed
2286 by repetition operators.
2287 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2288 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2289 fetch_token (token, regexp, syntax);
2290 return tree;
2291 case OP_PERIOD:
2292 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2295 *err = REG_ESPACE;
2296 return NULL;
2298 if (dfa->mb_cur_max > 1)
2299 dfa->has_mb_node = 1;
2300 break;
2301 case OP_WORD:
2302 case OP_NOTWORD:
2303 tree = build_charclass_op (dfa, regexp->trans,
2304 (const unsigned char *) "alnum",
2305 (const unsigned char *) "_",
2306 token->type == OP_NOTWORD, err);
2307 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2308 return NULL;
2309 break;
2310 case OP_SPACE:
2311 case OP_NOTSPACE:
2312 tree = build_charclass_op (dfa, regexp->trans,
2313 (const unsigned char *) "space",
2314 (const unsigned char *) "",
2315 token->type == OP_NOTSPACE, err);
2316 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2317 return NULL;
2318 break;
2319 case OP_ALT:
2320 case END_OF_RE:
2321 return NULL;
2322 case BACK_SLASH:
2323 *err = REG_EESCAPE;
2324 return NULL;
2325 default:
2326 /* Must not happen? */
2327 #ifdef DEBUG
2328 assert (0);
2329 #endif
2330 return NULL;
2332 fetch_token (token, regexp, syntax);
2334 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2335 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2337 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2338 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2339 return NULL;
2340 /* In BRE consecutive duplications are not allowed. */
2341 if ((syntax & RE_CONTEXT_INVALID_DUP)
2342 && (token->type == OP_DUP_ASTERISK
2343 || token->type == OP_OPEN_DUP_NUM))
2345 *err = REG_BADRPT;
2346 return NULL;
2348 dfa->has_plural_match = 1;
2351 return tree;
2354 /* This function build the following tree, from regular expression
2355 (<reg_exp>):
2356 SUBEXP
2358 <reg_exp>
2361 static bin_tree_t *
2362 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2363 re_string_t *regexp;
2364 regex_t *preg;
2365 re_token_t *token;
2366 reg_syntax_t syntax;
2367 int nest;
2368 reg_errcode_t *err;
2370 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2371 bin_tree_t *tree, *left_par, *right_par;
2372 size_t cur_nsub;
2373 cur_nsub = preg->re_nsub++;
2374 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2376 re_subexp_t *new_array;
2377 dfa->subexps_alloc *= 2;
2378 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2379 if (BE (new_array == NULL, 0))
2381 dfa->subexps_alloc /= 2;
2382 *err = REG_ESPACE;
2383 return NULL;
2385 dfa->subexps = new_array;
2387 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2388 dfa->subexps[cur_nsub].end = -1;
2390 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2391 if (BE (left_par == NULL, 0))
2393 *err = REG_ESPACE;
2394 return NULL;
2396 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2397 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2399 /* The subexpression may be a null string. */
2400 if (token->type == OP_CLOSE_SUBEXP)
2401 tree = NULL;
2402 else
2404 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2405 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2406 return NULL;
2408 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2410 *err = REG_EPAREN;
2411 return NULL;
2413 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2414 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2415 tree = ((tree == NULL) ? right_par
2416 : create_tree (dfa, tree, right_par, CONCAT, 0));
2417 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2418 if (BE (right_par == NULL || tree == NULL, 0))
2420 *err = REG_ESPACE;
2421 return NULL;
2423 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2425 return tree;
2428 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2430 static bin_tree_t *
2431 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2432 bin_tree_t *elem;
2433 re_string_t *regexp;
2434 re_dfa_t *dfa;
2435 re_token_t *token;
2436 reg_syntax_t syntax;
2437 reg_errcode_t *err;
2439 re_token_t dup_token;
2440 bin_tree_t *tree = NULL;
2441 int i, start, end, start_idx = re_string_cur_idx (regexp);
2442 re_token_t start_token = *token;
2444 if (token->type == OP_OPEN_DUP_NUM)
2446 end = 0;
2447 start = fetch_number (regexp, token, syntax);
2448 if (start == -1)
2450 if (token->type == CHARACTER && token->opr.c == ',')
2451 start = 0; /* We treat "{,m}" as "{0,m}". */
2452 else
2454 *err = REG_BADBR; /* <re>{} is invalid. */
2455 return NULL;
2458 if (BE (start != -2, 1))
2460 /* We treat "{n}" as "{n,n}". */
2461 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2462 : ((token->type == CHARACTER && token->opr.c == ',')
2463 ? fetch_number (regexp, token, syntax) : -2));
2465 if (BE (start == -2 || end == -2, 0))
2467 /* Invalid sequence. */
2468 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2470 if (token->type == END_OF_RE)
2471 *err = REG_EBRACE;
2472 else
2473 *err = REG_BADBR;
2475 return NULL;
2478 /* If the syntax bit is set, rollback. */
2479 re_string_set_index (regexp, start_idx);
2480 *token = start_token;
2481 token->type = CHARACTER;
2482 /* mb_partial and word_char bits should be already initialized by
2483 peek_token. */
2484 return elem;
2487 if (BE (end != -1 && start > end, 0))
2489 /* First number greater than second. */
2490 *err = REG_BADBR;
2491 return NULL;
2494 else
2496 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2497 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2500 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2501 if (BE (elem == NULL, 0))
2502 start = end = 0;
2504 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2505 else if (BE (start > 0, 0))
2507 tree = elem;
2508 for (i = 2; i <= start; ++i)
2510 elem = duplicate_tree (elem, dfa);
2511 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2512 if (BE (elem == NULL || tree == NULL, 0))
2513 goto parse_dup_op_espace;
2517 if (BE (end != start, 1))
2519 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2520 if (BE (start > 0, 0))
2522 elem = duplicate_tree (elem, dfa);
2523 if (BE (elem == NULL, 0))
2524 goto parse_dup_op_espace;
2526 /* This subexpression will be marked as optional, so that
2527 empty matches do not touch the registers. */
2528 mark_opt_subexp (elem, dfa);
2530 /* Prepare the tree with the modifier. */
2531 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2532 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2534 else
2536 /* We do not need to duplicate the tree because we have not
2537 created it yet. */
2538 mark_opt_subexp (elem, dfa);
2539 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2542 if (BE (elem == NULL || tree == NULL, 0))
2543 goto parse_dup_op_espace;
2545 /* This loop is actually executed only when end != -1,
2546 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2547 already created the start+1-th copy. */
2548 for (i = start + 2; i <= end; ++i)
2550 elem = duplicate_tree (elem, dfa);
2551 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2552 if (BE (elem == NULL || tree == NULL, 0))
2554 *err = REG_ESPACE;
2555 return NULL;
2560 fetch_token (token, regexp, syntax);
2561 return tree;
2563 parse_dup_op_espace:
2564 *err = REG_ESPACE;
2565 return NULL;
2568 /* Size of the names for collating symbol/equivalence_class/character_class.
2569 I'm not sure, but maybe enough. */
2570 #define BRACKET_NAME_BUF_SIZE 32
2572 #ifndef _LIBC
2573 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2574 Build the range expression which starts from START_ELEM, and ends
2575 at END_ELEM. The result are written to MBCSET and SBCSET.
2576 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2577 mbcset->range_ends, is a pointer argument sinse we may
2578 update it. */
2580 static reg_errcode_t
2581 # ifdef RE_ENABLE_I18N
2582 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2583 re_charset_t *mbcset;
2584 int *range_alloc;
2585 # else /* not RE_ENABLE_I18N */
2586 build_range_exp (sbcset, start_elem, end_elem)
2587 # endif /* not RE_ENABLE_I18N */
2588 re_bitset_ptr_t sbcset;
2589 bracket_elem_t *start_elem, *end_elem;
2591 unsigned int start_ch, end_ch;
2592 /* Equivalence Classes and Character Classes can't be a range start/end. */
2593 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2594 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2596 return REG_ERANGE;
2598 /* We can handle no multi character collating elements without libc
2599 support. */
2600 if (BE ((start_elem->type == COLL_SYM
2601 && strlen ((char *) start_elem->opr.name) > 1)
2602 || (end_elem->type == COLL_SYM
2603 && strlen ((char *) end_elem->opr.name) > 1), 0))
2604 return REG_ECOLLATE;
2606 # ifdef RE_ENABLE_I18N
2608 wchar_t wc, start_wc, end_wc;
2609 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2611 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2612 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2613 : 0));
2614 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2615 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2616 : 0));
2617 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2618 ? __btowc (start_ch) : start_elem->opr.wch);
2619 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2620 ? __btowc (end_ch) : end_elem->opr.wch);
2621 if (start_wc == WEOF || end_wc == WEOF)
2622 return REG_ECOLLATE;
2623 cmp_buf[0] = start_wc;
2624 cmp_buf[4] = end_wc;
2625 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2626 return REG_ERANGE;
2628 /* Got valid collation sequence values, add them as a new entry.
2629 However, for !_LIBC we have no collation elements: if the
2630 character set is single byte, the single byte character set
2631 that we build below suffices. parse_bracket_exp passes
2632 no MBCSET if dfa->mb_cur_max == 1. */
2633 if (mbcset)
2635 /* Check the space of the arrays. */
2636 if (BE (*range_alloc == mbcset->nranges, 0))
2638 /* There is not enough space, need realloc. */
2639 wchar_t *new_array_start, *new_array_end;
2640 int new_nranges;
2642 /* +1 in case of mbcset->nranges is 0. */
2643 new_nranges = 2 * mbcset->nranges + 1;
2644 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2645 are NULL if *range_alloc == 0. */
2646 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2647 new_nranges);
2648 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2649 new_nranges);
2651 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2652 return REG_ESPACE;
2654 mbcset->range_starts = new_array_start;
2655 mbcset->range_ends = new_array_end;
2656 *range_alloc = new_nranges;
2659 mbcset->range_starts[mbcset->nranges] = start_wc;
2660 mbcset->range_ends[mbcset->nranges++] = end_wc;
2663 /* Build the table for single byte characters. */
2664 for (wc = 0; wc < SBC_MAX; ++wc)
2666 cmp_buf[2] = wc;
2667 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2668 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2669 bitset_set (sbcset, wc);
2672 # else /* not RE_ENABLE_I18N */
2674 unsigned int ch;
2675 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2676 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2677 : 0));
2678 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2679 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2680 : 0));
2681 if (start_ch > end_ch)
2682 return REG_ERANGE;
2683 /* Build the table for single byte characters. */
2684 for (ch = 0; ch < SBC_MAX; ++ch)
2685 if (start_ch <= ch && ch <= end_ch)
2686 bitset_set (sbcset, ch);
2688 # endif /* not RE_ENABLE_I18N */
2689 return REG_NOERROR;
2691 #endif /* not _LIBC */
2693 #ifndef _LIBC
2694 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2695 Build the collating element which is represented by NAME.
2696 The result are written to MBCSET and SBCSET.
2697 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698 pointer argument since we may update it. */
2700 static reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2703 re_charset_t *mbcset;
2704 int *coll_sym_alloc;
2705 # else /* not RE_ENABLE_I18N */
2706 build_collating_symbol (sbcset, name)
2707 # endif /* not RE_ENABLE_I18N */
2708 re_bitset_ptr_t sbcset;
2709 const unsigned char *name;
2711 size_t name_len = strlen ((const char *) name);
2712 if (BE (name_len != 1, 0))
2713 return REG_ECOLLATE;
2714 else
2716 bitset_set (sbcset, name[0]);
2717 return REG_NOERROR;
2720 #endif /* not _LIBC */
2722 /* This function parse bracket expression like "[abc]", "[a-c]",
2723 "[[.a-a.]]" etc. */
2725 static bin_tree_t *
2726 parse_bracket_exp (regexp, dfa, token, syntax, err)
2727 re_string_t *regexp;
2728 re_dfa_t *dfa;
2729 re_token_t *token;
2730 reg_syntax_t syntax;
2731 reg_errcode_t *err;
2733 #ifdef _LIBC
2734 const unsigned char *collseqmb;
2735 const char *collseqwc;
2736 uint32_t nrules;
2737 int32_t table_size;
2738 const int32_t *symb_table;
2739 const unsigned char *extra;
2741 /* Local function for parse_bracket_exp used in _LIBC environement.
2742 Seek the collating symbol entry correspondings to NAME.
2743 Return the index of the symbol in the SYMB_TABLE. */
2745 auto inline int32_t
2746 __attribute ((always_inline))
2747 seek_collating_symbol_entry (name, name_len)
2748 const unsigned char *name;
2749 size_t name_len;
2751 int32_t hash = elem_hash ((const char *) name, name_len);
2752 int32_t elem = hash % table_size;
2753 int32_t second = hash % (table_size - 2);
2754 while (symb_table[2 * elem] != 0)
2756 /* First compare the hashing value. */
2757 if (symb_table[2 * elem] == hash
2758 /* Compare the length of the name. */
2759 && name_len == extra[symb_table[2 * elem + 1]]
2760 /* Compare the name. */
2761 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2762 name_len) == 0)
2764 /* Yep, this is the entry. */
2765 break;
2768 /* Next entry. */
2769 elem += second;
2771 return elem;
2774 /* Local function for parse_bracket_exp used in _LIBC environement.
2775 Look up the collation sequence value of BR_ELEM.
2776 Return the value if succeeded, UINT_MAX otherwise. */
2778 auto inline unsigned int
2779 __attribute ((always_inline))
2780 lookup_collation_sequence_value (br_elem)
2781 bracket_elem_t *br_elem;
2783 if (br_elem->type == SB_CHAR)
2786 if (MB_CUR_MAX == 1)
2788 if (nrules == 0)
2789 return collseqmb[br_elem->opr.ch];
2790 else
2792 wint_t wc = __btowc (br_elem->opr.ch);
2793 return __collseq_table_lookup (collseqwc, wc);
2796 else if (br_elem->type == MB_CHAR)
2798 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2800 else if (br_elem->type == COLL_SYM)
2802 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2803 if (nrules != 0)
2805 int32_t elem, idx;
2806 elem = seek_collating_symbol_entry (br_elem->opr.name,
2807 sym_name_len);
2808 if (symb_table[2 * elem] != 0)
2810 /* We found the entry. */
2811 idx = symb_table[2 * elem + 1];
2812 /* Skip the name of collating element name. */
2813 idx += 1 + extra[idx];
2814 /* Skip the byte sequence of the collating element. */
2815 idx += 1 + extra[idx];
2816 /* Adjust for the alignment. */
2817 idx = (idx + 3) & ~3;
2818 /* Skip the multibyte collation sequence value. */
2819 idx += sizeof (unsigned int);
2820 /* Skip the wide char sequence of the collating element. */
2821 idx += sizeof (unsigned int) *
2822 (1 + *(unsigned int *) (extra + idx));
2823 /* Return the collation sequence value. */
2824 return *(unsigned int *) (extra + idx);
2826 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2828 /* No valid character. Match it as a single byte
2829 character. */
2830 return collseqmb[br_elem->opr.name[0]];
2833 else if (sym_name_len == 1)
2834 return collseqmb[br_elem->opr.name[0]];
2836 return UINT_MAX;
2839 /* Local function for parse_bracket_exp used in _LIBC environement.
2840 Build the range expression which starts from START_ELEM, and ends
2841 at END_ELEM. The result are written to MBCSET and SBCSET.
2842 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2843 mbcset->range_ends, is a pointer argument sinse we may
2844 update it. */
2846 auto inline reg_errcode_t
2847 __attribute ((always_inline))
2848 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2849 re_charset_t *mbcset;
2850 int *range_alloc;
2851 re_bitset_ptr_t sbcset;
2852 bracket_elem_t *start_elem, *end_elem;
2854 unsigned int ch;
2855 uint32_t start_collseq;
2856 uint32_t end_collseq;
2858 /* Equivalence Classes and Character Classes can't be a range
2859 start/end. */
2860 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2861 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2863 return REG_ERANGE;
2865 start_collseq = lookup_collation_sequence_value (start_elem);
2866 end_collseq = lookup_collation_sequence_value (end_elem);
2867 /* Check start/end collation sequence values. */
2868 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2869 return REG_ECOLLATE;
2870 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2871 return REG_ERANGE;
2873 /* Got valid collation sequence values, add them as a new entry.
2874 However, if we have no collation elements, and the character set
2875 is single byte, the single byte character set that we
2876 build below suffices. */
2877 if (nrules > 0 || dfa->mb_cur_max > 1)
2879 /* Check the space of the arrays. */
2880 if (BE (*range_alloc == mbcset->nranges, 0))
2882 /* There is not enough space, need realloc. */
2883 uint32_t *new_array_start;
2884 uint32_t *new_array_end;
2885 int new_nranges;
2887 /* +1 in case of mbcset->nranges is 0. */
2888 new_nranges = 2 * mbcset->nranges + 1;
2889 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2890 new_nranges);
2891 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2892 new_nranges);
2894 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2895 return REG_ESPACE;
2897 mbcset->range_starts = new_array_start;
2898 mbcset->range_ends = new_array_end;
2899 *range_alloc = new_nranges;
2902 mbcset->range_starts[mbcset->nranges] = start_collseq;
2903 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2906 /* Build the table for single byte characters. */
2907 for (ch = 0; ch < SBC_MAX; ch++)
2909 uint32_t ch_collseq;
2911 if (MB_CUR_MAX == 1)
2913 if (nrules == 0)
2914 ch_collseq = collseqmb[ch];
2915 else
2916 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2917 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2918 bitset_set (sbcset, ch);
2920 return REG_NOERROR;
2923 /* Local function for parse_bracket_exp used in _LIBC environement.
2924 Build the collating element which is represented by NAME.
2925 The result are written to MBCSET and SBCSET.
2926 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2927 pointer argument sinse we may update it. */
2929 auto inline reg_errcode_t
2930 __attribute ((always_inline))
2931 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2932 re_charset_t *mbcset;
2933 int *coll_sym_alloc;
2934 re_bitset_ptr_t sbcset;
2935 const unsigned char *name;
2937 int32_t elem, idx;
2938 size_t name_len = strlen ((const char *) name);
2939 if (nrules != 0)
2941 elem = seek_collating_symbol_entry (name, name_len);
2942 if (symb_table[2 * elem] != 0)
2944 /* We found the entry. */
2945 idx = symb_table[2 * elem + 1];
2946 /* Skip the name of collating element name. */
2947 idx += 1 + extra[idx];
2949 else if (symb_table[2 * elem] == 0 && name_len == 1)
2951 /* No valid character, treat it as a normal
2952 character. */
2953 bitset_set (sbcset, name[0]);
2954 return REG_NOERROR;
2956 else
2957 return REG_ECOLLATE;
2959 /* Got valid collation sequence, add it as a new entry. */
2960 /* Check the space of the arrays. */
2961 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2963 /* Not enough, realloc it. */
2964 /* +1 in case of mbcset->ncoll_syms is 0. */
2965 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2966 /* Use realloc since mbcset->coll_syms is NULL
2967 if *alloc == 0. */
2968 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2969 new_coll_sym_alloc);
2970 if (BE (new_coll_syms == NULL, 0))
2971 return REG_ESPACE;
2972 mbcset->coll_syms = new_coll_syms;
2973 *coll_sym_alloc = new_coll_sym_alloc;
2975 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2976 return REG_NOERROR;
2978 else
2980 if (BE (name_len != 1, 0))
2981 return REG_ECOLLATE;
2982 else
2984 bitset_set (sbcset, name[0]);
2985 return REG_NOERROR;
2989 #endif
2991 re_token_t br_token;
2992 re_bitset_ptr_t sbcset;
2993 #ifdef RE_ENABLE_I18N
2994 re_charset_t *mbcset;
2995 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2996 int equiv_class_alloc = 0, char_class_alloc = 0;
2997 #endif /* not RE_ENABLE_I18N */
2998 int non_match = 0;
2999 bin_tree_t *work_tree;
3000 int token_len;
3001 int first_round = 1;
3002 #ifdef _LIBC
3003 collseqmb = (const unsigned char *)
3004 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3005 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3006 if (nrules)
3009 if (MB_CUR_MAX > 1)
3011 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3012 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3013 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3014 _NL_COLLATE_SYMB_TABLEMB);
3015 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3016 _NL_COLLATE_SYMB_EXTRAMB);
3018 #endif
3019 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3020 #ifdef RE_ENABLE_I18N
3021 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3022 #endif /* RE_ENABLE_I18N */
3023 #ifdef RE_ENABLE_I18N
3024 if (BE (sbcset == NULL || mbcset == NULL, 0))
3025 #else
3026 if (BE (sbcset == NULL, 0))
3027 #endif /* RE_ENABLE_I18N */
3029 *err = REG_ESPACE;
3030 return NULL;
3033 token_len = peek_token_bracket (token, regexp, syntax);
3034 if (BE (token->type == END_OF_RE, 0))
3036 *err = REG_BADPAT;
3037 goto parse_bracket_exp_free_return;
3039 if (token->type == OP_NON_MATCH_LIST)
3041 #ifdef RE_ENABLE_I18N
3042 mbcset->non_match = 1;
3043 #endif /* not RE_ENABLE_I18N */
3044 non_match = 1;
3045 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3046 bitset_set (sbcset, '\0');
3047 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3048 token_len = peek_token_bracket (token, regexp, syntax);
3049 if (BE (token->type == END_OF_RE, 0))
3051 *err = REG_BADPAT;
3052 goto parse_bracket_exp_free_return;
3056 /* We treat the first ']' as a normal character. */
3057 if (token->type == OP_CLOSE_BRACKET)
3058 token->type = CHARACTER;
3060 while (1)
3062 bracket_elem_t start_elem, end_elem;
3063 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3064 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3065 reg_errcode_t ret;
3066 int token_len2 = 0, is_range_exp = 0;
3067 re_token_t token2;
3069 start_elem.opr.name = start_name_buf;
3070 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3071 syntax, first_round);
3072 if (BE (ret != REG_NOERROR, 0))
3074 *err = ret;
3075 goto parse_bracket_exp_free_return;
3077 first_round = 0;
3079 /* Get information about the next token. We need it in any case. */
3080 token_len = peek_token_bracket (token, regexp, syntax);
3082 /* Do not check for ranges if we know they are not allowed. */
3083 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3085 if (BE (token->type == END_OF_RE, 0))
3087 *err = REG_EBRACK;
3088 goto parse_bracket_exp_free_return;
3090 if (token->type == OP_CHARSET_RANGE)
3092 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3093 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3094 if (BE (token2.type == END_OF_RE, 0))
3096 *err = REG_EBRACK;
3097 goto parse_bracket_exp_free_return;
3099 if (token2.type == OP_CLOSE_BRACKET)
3101 /* We treat the last '-' as a normal character. */
3102 re_string_skip_bytes (regexp, -token_len);
3103 token->type = CHARACTER;
3105 else
3106 is_range_exp = 1;
3110 if (is_range_exp == 1)
3112 end_elem.opr.name = end_name_buf;
3113 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3114 dfa, syntax, 1);
3115 if (BE (ret != REG_NOERROR, 0))
3117 *err = ret;
3118 goto parse_bracket_exp_free_return;
3121 token_len = peek_token_bracket (token, regexp, syntax);
3123 #ifdef _LIBC
3124 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3125 &start_elem, &end_elem);
3126 #else
3127 # ifdef RE_ENABLE_I18N
3128 *err = build_range_exp (sbcset,
3129 dfa->mb_cur_max > 1 ? mbcset : NULL,
3130 &range_alloc, &start_elem, &end_elem);
3131 # else
3132 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3133 # endif
3134 #endif /* RE_ENABLE_I18N */
3135 if (BE (*err != REG_NOERROR, 0))
3136 goto parse_bracket_exp_free_return;
3138 else
3140 switch (start_elem.type)
3142 case SB_CHAR:
3143 bitset_set (sbcset, start_elem.opr.ch);
3144 break;
3145 #ifdef RE_ENABLE_I18N
3146 case MB_CHAR:
3147 /* Check whether the array has enough space. */
3148 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3150 wchar_t *new_mbchars;
3151 /* Not enough, realloc it. */
3152 /* +1 in case of mbcset->nmbchars is 0. */
3153 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3154 /* Use realloc since array is NULL if *alloc == 0. */
3155 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3156 mbchar_alloc);
3157 if (BE (new_mbchars == NULL, 0))
3158 goto parse_bracket_exp_espace;
3159 mbcset->mbchars = new_mbchars;
3161 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3162 break;
3163 #endif /* RE_ENABLE_I18N */
3164 case EQUIV_CLASS:
3165 *err = build_equiv_class (sbcset,
3166 #ifdef RE_ENABLE_I18N
3167 mbcset, &equiv_class_alloc,
3168 #endif /* RE_ENABLE_I18N */
3169 start_elem.opr.name);
3170 if (BE (*err != REG_NOERROR, 0))
3171 goto parse_bracket_exp_free_return;
3172 break;
3173 case COLL_SYM:
3174 *err = build_collating_symbol (sbcset,
3175 #ifdef RE_ENABLE_I18N
3176 mbcset, &coll_sym_alloc,
3177 #endif /* RE_ENABLE_I18N */
3178 start_elem.opr.name);
3179 if (BE (*err != REG_NOERROR, 0))
3180 goto parse_bracket_exp_free_return;
3181 break;
3182 case CHAR_CLASS:
3183 *err = build_charclass (regexp->trans, sbcset,
3184 #ifdef RE_ENABLE_I18N
3185 mbcset, &char_class_alloc,
3186 #endif /* RE_ENABLE_I18N */
3187 start_elem.opr.name, syntax);
3188 if (BE (*err != REG_NOERROR, 0))
3189 goto parse_bracket_exp_free_return;
3190 break;
3191 default:
3192 assert (0);
3193 break;
3196 if (BE (token->type == END_OF_RE, 0))
3198 *err = REG_EBRACK;
3199 goto parse_bracket_exp_free_return;
3201 if (token->type == OP_CLOSE_BRACKET)
3202 break;
3205 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3207 /* If it is non-matching list. */
3208 if (non_match)
3209 bitset_not (sbcset);
3211 #ifdef RE_ENABLE_I18N
3212 /* Ensure only single byte characters are set. */
3213 if (dfa->mb_cur_max > 1)
3214 bitset_mask (sbcset, dfa->sb_char);
3215 #endif /* RE_ENABLE_I18N */
3217 /* Build a tree for simple bracket. */
3218 br_token.type = SIMPLE_BRACKET;
3219 br_token.opr.sbcset = sbcset;
3220 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3221 if (BE (work_tree == NULL, 0))
3222 goto parse_bracket_exp_espace;
3224 #ifdef RE_ENABLE_I18N
3225 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3226 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3227 || mbcset->non_match)))
3229 re_token_t alt_token;
3230 bin_tree_t *mbc_tree;
3231 int sbc_idx;
3232 /* Build a tree for complex bracket. */
3233 dfa->has_mb_node = 1;
3234 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3235 if (sbcset[sbc_idx])
3236 break;
3237 /* If there are no bits set in sbcset, there is no point
3238 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3239 if (sbc_idx == BITSET_UINTS)
3241 re_free (sbcset);
3242 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3243 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3244 return work_tree;
3246 br_token.type = COMPLEX_BRACKET;
3247 br_token.opr.mbcset = mbcset;
3248 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3249 if (BE (mbc_tree == NULL, 0))
3250 goto parse_bracket_exp_espace;
3251 /* Then join them by ALT node. */
3252 alt_token.type = OP_ALT;
3253 dfa->has_plural_match = 1;
3254 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3255 if (BE (mbc_tree != NULL, 1))
3256 return work_tree;
3258 else
3260 free_charset (mbcset);
3261 return work_tree;
3263 #else /* not RE_ENABLE_I18N */
3264 return work_tree;
3265 #endif /* not RE_ENABLE_I18N */
3267 parse_bracket_exp_espace:
3268 *err = REG_ESPACE;
3269 parse_bracket_exp_free_return:
3270 re_free (sbcset);
3271 #ifdef RE_ENABLE_I18N
3272 free_charset (mbcset);
3273 #endif /* RE_ENABLE_I18N */
3274 return NULL;
3277 /* Parse an element in the bracket expression. */
3279 static reg_errcode_t
3280 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3281 accept_hyphen)
3282 bracket_elem_t *elem;
3283 re_string_t *regexp;
3284 re_token_t *token;
3285 int token_len;
3286 re_dfa_t *dfa;
3287 reg_syntax_t syntax;
3288 int accept_hyphen;
3290 #ifdef RE_ENABLE_I18N
3291 int cur_char_size;
3292 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3293 if (cur_char_size > 1)
3295 elem->type = MB_CHAR;
3296 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3297 re_string_skip_bytes (regexp, cur_char_size);
3298 return REG_NOERROR;
3300 #endif /* RE_ENABLE_I18N */
3301 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3302 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3303 || token->type == OP_OPEN_EQUIV_CLASS)
3304 return parse_bracket_symbol (elem, regexp, token);
3305 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3307 /* A '-' must only appear as anything but a range indicator before
3308 the closing bracket. Everything else is an error. */
3309 re_token_t token2;
3310 (void) peek_token_bracket (&token2, regexp, syntax);
3311 if (token2.type != OP_CLOSE_BRACKET)
3312 /* The actual error value is not standardized since this whole
3313 case is undefined. But ERANGE makes good sense. */
3314 return REG_ERANGE;
3316 elem->type = SB_CHAR;
3317 elem->opr.ch = token->opr.c;
3318 return REG_NOERROR;
3321 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3322 such as [:<character_class>:], [.<collating_element>.], and
3323 [=<equivalent_class>=]. */
3325 static reg_errcode_t
3326 parse_bracket_symbol (elem, regexp, token)
3327 bracket_elem_t *elem;
3328 re_string_t *regexp;
3329 re_token_t *token;
3331 unsigned char ch, delim = token->opr.c;
3332 int i = 0;
3333 if (re_string_eoi(regexp))
3334 return REG_EBRACK;
3335 for (;; ++i)
3337 if (i >= BRACKET_NAME_BUF_SIZE)
3338 return REG_EBRACK;
3339 if (token->type == OP_OPEN_CHAR_CLASS)
3340 ch = re_string_fetch_byte_case (regexp);
3341 else
3342 ch = re_string_fetch_byte (regexp);
3343 if (re_string_eoi(regexp))
3344 return REG_EBRACK;
3345 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3346 break;
3347 elem->opr.name[i] = ch;
3349 re_string_skip_bytes (regexp, 1);
3350 elem->opr.name[i] = '\0';
3351 switch (token->type)
3353 case OP_OPEN_COLL_ELEM:
3354 elem->type = COLL_SYM;
3355 break;
3356 case OP_OPEN_EQUIV_CLASS:
3357 elem->type = EQUIV_CLASS;
3358 break;
3359 case OP_OPEN_CHAR_CLASS:
3360 elem->type = CHAR_CLASS;
3361 break;
3362 default:
3363 break;
3365 return REG_NOERROR;
3368 /* Helper function for parse_bracket_exp.
3369 Build the equivalence class which is represented by NAME.
3370 The result are written to MBCSET and SBCSET.
3371 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3372 is a pointer argument sinse we may update it. */
3374 static reg_errcode_t
3375 #ifdef RE_ENABLE_I18N
3376 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3377 re_charset_t *mbcset;
3378 int *equiv_class_alloc;
3379 #else /* not RE_ENABLE_I18N */
3380 build_equiv_class (sbcset, name)
3381 #endif /* not RE_ENABLE_I18N */
3382 re_bitset_ptr_t sbcset;
3383 const unsigned char *name;
3385 #if defined _LIBC
3386 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3387 if (nrules != 0)
3389 const int32_t *table, *indirect;
3390 const unsigned char *weights, *extra, *cp;
3391 unsigned char char_buf[2];
3392 int32_t idx1, idx2;
3393 unsigned int ch;
3394 size_t len;
3395 /* This #include defines a local function! */
3396 # include <locale/weight.h>
3397 /* Calculate the index for equivalence class. */
3398 cp = name;
3399 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3400 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3401 _NL_COLLATE_WEIGHTMB);
3402 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3403 _NL_COLLATE_EXTRAMB);
3404 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3405 _NL_COLLATE_INDIRECTMB);
3406 idx1 = findidx (&cp);
3407 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3408 /* This isn't a valid character. */
3409 return REG_ECOLLATE;
3411 /* Build single byte matcing table for this equivalence class. */
3412 char_buf[1] = (unsigned char) '\0';
3413 len = weights[idx1];
3414 for (ch = 0; ch < SBC_MAX; ++ch)
3416 char_buf[0] = ch;
3417 cp = char_buf;
3418 idx2 = findidx (&cp);
3420 idx2 = table[ch];
3422 if (idx2 == 0)
3423 /* This isn't a valid character. */
3424 continue;
3425 if (len == weights[idx2])
3427 int cnt = 0;
3428 while (cnt <= len &&
3429 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3430 ++cnt;
3432 if (cnt > len)
3433 bitset_set (sbcset, ch);
3436 /* Check whether the array has enough space. */
3437 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439 /* Not enough, realloc it. */
3440 /* +1 in case of mbcset->nequiv_classes is 0. */
3441 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3442 /* Use realloc since the array is NULL if *alloc == 0. */
3443 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3444 int32_t,
3445 new_equiv_class_alloc);
3446 if (BE (new_equiv_classes == NULL, 0))
3447 return REG_ESPACE;
3448 mbcset->equiv_classes = new_equiv_classes;
3449 *equiv_class_alloc = new_equiv_class_alloc;
3451 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453 else
3454 #endif /* _LIBC */
3456 if (BE (strlen ((const char *) name) != 1, 0))
3457 return REG_ECOLLATE;
3458 bitset_set (sbcset, *name);
3460 return REG_NOERROR;
3463 /* Helper function for parse_bracket_exp.
3464 Build the character class which is represented by NAME.
3465 The result are written to MBCSET and SBCSET.
3466 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3467 is a pointer argument sinse we may update it. */
3469 static reg_errcode_t
3470 #ifdef RE_ENABLE_I18N
3471 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3472 re_charset_t *mbcset;
3473 int *char_class_alloc;
3474 #else /* not RE_ENABLE_I18N */
3475 build_charclass (trans, sbcset, class_name, syntax)
3476 #endif /* not RE_ENABLE_I18N */
3477 unsigned RE_TRANSLATE_TYPE trans;
3478 re_bitset_ptr_t sbcset;
3479 const unsigned char *class_name;
3480 reg_syntax_t syntax;
3482 int i;
3483 const char *name = (const char *) class_name;
3485 /* In case of REG_ICASE "upper" and "lower" match the both of
3486 upper and lower cases. */
3487 if ((syntax & RE_ICASE)
3488 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3489 name = "alpha";
3491 #ifdef RE_ENABLE_I18N
3492 /* Check the space of the arrays. */
3493 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nchar_classes is 0. */
3497 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3498 /* Use realloc since array is NULL if *alloc == 0. */
3499 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3500 new_char_class_alloc);
3501 if (BE (new_char_classes == NULL, 0))
3502 return REG_ESPACE;
3503 mbcset->char_classes = new_char_classes;
3504 *char_class_alloc = new_char_class_alloc;
3506 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3507 #endif /* RE_ENABLE_I18N */
3509 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3510 for (i = 0; i < SBC_MAX; ++i) \
3512 if (ctype_func (i)) \
3514 int ch = trans ? trans[i] : i; \
3515 bitset_set (sbcset, ch); \
3519 if (strcmp (name, "alnum") == 0)
3520 BUILD_CHARCLASS_LOOP (isalnum)
3521 else if (strcmp (name, "cntrl") == 0)
3522 BUILD_CHARCLASS_LOOP (iscntrl)
3523 else if (strcmp (name, "lower") == 0)
3524 BUILD_CHARCLASS_LOOP (islower)
3525 else if (strcmp (name, "space") == 0)
3526 BUILD_CHARCLASS_LOOP (isspace)
3527 else if (strcmp (name, "alpha") == 0)
3528 BUILD_CHARCLASS_LOOP (isalpha)
3529 else if (strcmp (name, "digit") == 0)
3530 BUILD_CHARCLASS_LOOP (isdigit)
3531 else if (strcmp (name, "print") == 0)
3532 BUILD_CHARCLASS_LOOP (isprint)
3533 else if (strcmp (name, "upper") == 0)
3534 BUILD_CHARCLASS_LOOP (isupper)
3535 else if (strcmp (name, "blank") == 0)
3536 BUILD_CHARCLASS_LOOP (isblank)
3537 else if (strcmp (name, "graph") == 0)
3538 BUILD_CHARCLASS_LOOP (isgraph)
3539 else if (strcmp (name, "punct") == 0)
3540 BUILD_CHARCLASS_LOOP (ispunct)
3541 else if (strcmp (name, "xdigit") == 0)
3542 BUILD_CHARCLASS_LOOP (isxdigit)
3543 else
3544 return REG_ECTYPE;
3546 return REG_NOERROR;
3549 static bin_tree_t *
3550 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3551 re_dfa_t *dfa;
3552 unsigned RE_TRANSLATE_TYPE trans;
3553 const unsigned char *class_name;
3554 const unsigned char *extra;
3555 int non_match;
3556 reg_errcode_t *err;
3558 re_bitset_ptr_t sbcset;
3559 #ifdef RE_ENABLE_I18N
3560 re_charset_t *mbcset;
3561 int alloc = 0;
3562 #endif /* not RE_ENABLE_I18N */
3563 reg_errcode_t ret;
3564 re_token_t br_token;
3565 bin_tree_t *tree;
3567 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3568 #ifdef RE_ENABLE_I18N
3569 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3570 #endif /* RE_ENABLE_I18N */
3572 #ifdef RE_ENABLE_I18N
3573 if (BE (sbcset == NULL || mbcset == NULL, 0))
3574 #else /* not RE_ENABLE_I18N */
3575 if (BE (sbcset == NULL, 0))
3576 #endif /* not RE_ENABLE_I18N */
3578 *err = REG_ESPACE;
3579 return NULL;
3582 if (non_match)
3584 #ifdef RE_ENABLE_I18N
3586 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3587 bitset_set(cset->sbcset, '\0');
3589 mbcset->non_match = 1;
3590 #endif /* not RE_ENABLE_I18N */
3593 /* We don't care the syntax in this case. */
3594 ret = build_charclass (trans, sbcset,
3595 #ifdef RE_ENABLE_I18N
3596 mbcset, &alloc,
3597 #endif /* RE_ENABLE_I18N */
3598 class_name, 0);
3600 if (BE (ret != REG_NOERROR, 0))
3602 re_free (sbcset);
3603 #ifdef RE_ENABLE_I18N
3604 free_charset (mbcset);
3605 #endif /* RE_ENABLE_I18N */
3606 *err = ret;
3607 return NULL;
3609 /* \w match '_' also. */
3610 for (; *extra; extra++)
3611 bitset_set (sbcset, *extra);
3613 /* If it is non-matching list. */
3614 if (non_match)
3615 bitset_not (sbcset);
3617 #ifdef RE_ENABLE_I18N
3618 /* Ensure only single byte characters are set. */
3619 if (dfa->mb_cur_max > 1)
3620 bitset_mask (sbcset, dfa->sb_char);
3621 #endif
3623 /* Build a tree for simple bracket. */
3624 br_token.type = SIMPLE_BRACKET;
3625 br_token.opr.sbcset = sbcset;
3626 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3627 if (BE (tree == NULL, 0))
3628 goto build_word_op_espace;
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa->mb_cur_max > 1)
3633 re_token_t alt_token;
3634 bin_tree_t *mbc_tree;
3635 /* Build a tree for complex bracket. */
3636 br_token.type = COMPLEX_BRACKET;
3637 br_token.opr.mbcset = mbcset;
3638 dfa->has_mb_node = 1;
3639 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3640 if (BE (mbc_tree == NULL, 0))
3641 goto build_word_op_espace;
3642 /* Then join them by ALT node. */
3643 alt_token.type = OP_ALT;
3644 dfa->has_plural_match = 1;
3645 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3646 if (BE (mbc_tree != NULL, 1))
3647 return tree;
3649 else
3651 free_charset (mbcset);
3652 return tree;
3654 #else /* not RE_ENABLE_I18N */
3655 return tree;
3656 #endif /* not RE_ENABLE_I18N */
3658 build_word_op_espace:
3659 re_free (sbcset);
3660 #ifdef RE_ENABLE_I18N
3661 free_charset (mbcset);
3662 #endif /* RE_ENABLE_I18N */
3663 *err = REG_ESPACE;
3664 return NULL;
3667 /* This is intended for the expressions like "a{1,3}".
3668 Fetch a number from `input', and return the number.
3669 Return -1, if the number field is empty like "{,1}".
3670 Return -2, If an error is occured. */
3672 static int
3673 fetch_number (input, token, syntax)
3674 re_string_t *input;
3675 re_token_t *token;
3676 reg_syntax_t syntax;
3678 int num = -1;
3679 unsigned char c;
3680 while (1)
3682 fetch_token (token, input, syntax);
3683 c = token->opr.c;
3684 if (BE (token->type == END_OF_RE, 0))
3685 return -2;
3686 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3687 break;
3688 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3689 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3690 num = (num > RE_DUP_MAX) ? -2 : num;
3692 return num;
3695 #ifdef RE_ENABLE_I18N
3696 static void
3697 free_charset (re_charset_t *cset)
3699 re_free (cset->mbchars);
3700 # ifdef _LIBC
3701 re_free (cset->coll_syms);
3702 re_free (cset->equiv_classes);
3703 re_free (cset->range_starts);
3704 re_free (cset->range_ends);
3705 # endif
3706 re_free (cset->char_classes);
3707 re_free (cset);
3709 #endif /* RE_ENABLE_I18N */
3711 /* Functions for binary tree operation. */
3713 /* Create a tree node. */
3715 static bin_tree_t *
3716 create_tree (dfa, left, right, type, index)
3717 re_dfa_t *dfa;
3718 bin_tree_t *left;
3719 bin_tree_t *right;
3720 re_token_type_t type;
3721 int index;
3723 bin_tree_t *tree;
3724 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3726 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3728 if (storage == NULL)
3729 return NULL;
3730 storage->next = dfa->str_tree_storage;
3731 dfa->str_tree_storage = storage;
3732 dfa->str_tree_storage_idx = 0;
3734 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3736 tree->parent = NULL;
3737 tree->left = left;
3738 tree->right = right;
3739 tree->type = type;
3740 tree->node_idx = index;
3741 tree->first = -1;
3742 tree->next = -1;
3743 re_node_set_init_empty (&tree->eclosure);
3745 if (left != NULL)
3746 left->parent = tree;
3747 if (right != NULL)
3748 right->parent = tree;
3749 return tree;
3752 /* Create both a DFA node and a tree for it. */
3754 static bin_tree_t *
3755 re_dfa_add_tree_node (dfa, left, right, token)
3756 re_dfa_t *dfa;
3757 bin_tree_t *left;
3758 bin_tree_t *right;
3759 const re_token_t *token;
3761 int new_idx = re_dfa_add_node (dfa, *token, 0);
3763 if (new_idx == -1)
3764 return NULL;
3766 return create_tree (dfa, left, right, 0, new_idx);
3769 /* Mark the tree SRC as an optional subexpression. */
3771 static void
3772 mark_opt_subexp (src, dfa)
3773 const bin_tree_t *src;
3774 re_dfa_t *dfa;
3776 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3777 a subexpression. */
3778 if (src->type == CONCAT
3779 && src->left->type == NON_TYPE
3780 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3781 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3785 /* Recursive tree walker for mark_opt_subexp. */
3787 static void
3788 mark_opt_subexp_iter (src, dfa, idx)
3789 const bin_tree_t *src;
3790 re_dfa_t *dfa;
3791 int idx;
3793 int node_idx;
3795 if (src->type == NON_TYPE)
3797 node_idx = src->node_idx;
3798 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3799 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3800 && dfa->nodes[node_idx].opr.idx == idx)
3801 dfa->nodes[node_idx].opt_subexp = 1;
3804 if (src->left != NULL)
3805 mark_opt_subexp_iter (src->left, dfa, idx);
3807 if (src->right != NULL)
3808 mark_opt_subexp_iter (src->right, dfa, idx);
3812 /* Duplicate the node SRC, and return new node. */
3814 static bin_tree_t *
3815 duplicate_tree (src, dfa)
3816 const bin_tree_t *src;
3817 re_dfa_t *dfa;
3819 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3820 int new_node_idx;
3821 /* Since node indies must be according to Post-order of the tree,
3822 we must duplicate the left at first. */
3823 if (src->left != NULL)
3825 left = duplicate_tree (src->left, dfa);
3826 if (left == NULL)
3827 return NULL;
3830 /* Secondaly, duplicate the right. */
3831 if (src->right != NULL)
3833 right = duplicate_tree (src->right, dfa);
3834 if (right == NULL)
3835 return NULL;
3838 /* At last, duplicate itself. */
3839 if (src->type == NON_TYPE)
3841 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3842 dfa->nodes[new_node_idx].duplicated = 1;
3843 if (BE (new_node_idx == -1, 0))
3844 return NULL;
3846 else
3847 new_node_idx = src->type;
3849 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3850 return new_tree;