1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
21 size_t length
, reg_syntax_t syntax
);
22 static void re_compile_fastmap_iter (regex_t
*bufp
,
23 const re_dfastate_t
*init_state
,
25 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void free_charset (re_charset_t
*cset
);
29 static void free_workarea_compile (regex_t
*preg
);
30 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
32 static void optimize_utf8 (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze (regex_t
*preg
);
35 static reg_errcode_t
preorder (bin_tree_t
*root
,
36 reg_errcode_t (fn (void *, bin_tree_t
*)),
38 static reg_errcode_t
postorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
42 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
43 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
45 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
48 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
49 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
50 unsigned int constraint
);
51 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
52 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
54 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
55 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
57 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 reg_syntax_t syntax
) internal_function
;
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 int nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 int nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
89 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
91 int *equiv_class_alloc
,
92 const unsigned char *name
);
93 static reg_errcode_t
build_charclass (__RE_TRANSLATE_TYPE trans
,
96 int *char_class_alloc
,
97 const unsigned char *class_name
,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
101 const unsigned char *name
);
102 static reg_errcode_t
build_charclass (__RE_TRANSLATE_TYPE trans
,
104 const unsigned char *class_name
,
105 reg_syntax_t syntax
);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
108 __RE_TRANSLATE_TYPE trans
,
109 const unsigned char *class_name
,
110 const unsigned char *extra
,
111 int non_match
, reg_errcode_t
*err
);
112 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
113 bin_tree_t
*left
, bin_tree_t
*right
,
114 re_token_type_t type
);
115 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
116 bin_tree_t
*left
, bin_tree_t
*right
,
117 const re_token_t
*token
);
118 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
119 static void free_token (re_token_t
*node
);
120 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
121 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid
[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const uint16_t __re_error_msgid_idx
[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern
,
215 struct re_pattern_buffer
*bufp
)
219 /* And GNU code determines whether or not to get register information
220 by passing null for the REGS argument to re_match, etc., not by
221 setting no_sub, unless RE_NO_SUB is set. */
222 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
224 /* Match anchors at newline. */
225 bufp
->newline_anchor
= 1;
227 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
231 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
234 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
235 also be assigned to arbitrarily: each pattern buffer stores its own
236 syntax, so it can be changed between regex compilations. */
237 /* This has no initializer because initialized variables in Emacs
238 become read-only after dumping. */
239 reg_syntax_t re_syntax_options
;
242 /* Specify the precise syntax of regexps for compilation. This provides
243 for compatibility for various utilities which historically have
244 different, incompatible syntaxes.
246 The argument SYNTAX is a bit mask comprised of the various bits
247 defined in regex.h. We return the old syntax. */
250 re_set_syntax (reg_syntax_t syntax
)
252 reg_syntax_t ret
= re_syntax_options
;
254 re_syntax_options
= syntax
;
259 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
261 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
262 char *fastmap
= bufp
->fastmap
;
264 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
265 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
266 if (dfa
->init_state
!= dfa
->init_state_word
)
267 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
268 if (dfa
->init_state
!= dfa
->init_state_nl
)
269 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
270 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
271 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
272 bufp
->fastmap_accurate
= 1;
275 libc_hidden_def(re_compile_fastmap
)
277 static __inline__
void
278 __attribute ((always_inline
))
279 re_set_fastmap (char *fastmap
, int icase
, int ch
)
283 fastmap
[tolower (ch
)] = 1;
286 /* Helper function for re_compile_fastmap.
287 Compile fastmap for the initial_state INIT_STATE. */
290 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
293 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
295 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
296 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
298 int node
= init_state
->nodes
.elems
[node_cnt
];
299 re_token_type_t type
= dfa
->nodes
[node
].type
;
301 if (type
== CHARACTER
)
303 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
304 #ifdef RE_ENABLE_I18N
305 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
307 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
312 *p
++ = dfa
->nodes
[node
].opr
.c
;
313 while (++node
< dfa
->nodes_len
314 && dfa
->nodes
[node
].type
== CHARACTER
315 && dfa
->nodes
[node
].mb_partial
)
316 *p
++ = dfa
->nodes
[node
].opr
.c
;
317 memset (&state
, '\0', sizeof (state
));
318 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
320 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
322 re_set_fastmap (fastmap
, 0, buf
[0]);
326 else if (type
== SIMPLE_BRACKET
)
329 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
332 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
333 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
334 if (w
& ((bitset_word_t
) 1 << j
))
335 re_set_fastmap (fastmap
, icase
, ch
);
338 #ifdef RE_ENABLE_I18N
339 else if (type
== COMPLEX_BRACKET
)
342 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
343 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
344 || cset
->nranges
|| cset
->nchar_classes
)
347 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
349 /* In this case we want to catch the bytes which are
350 the first byte of any collation elements.
351 e.g. In da_DK, we want to catch 'a' since "aa"
352 is a valid collation element, and don't catch
353 'b' since 'b' is the only collation element
354 which starts from 'b'. */
355 const int32_t *table
= (const int32_t *)
356 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
357 for (i
= 0; i
< SBC_MAX
; ++i
)
359 re_set_fastmap (fastmap
, icase
, i
);
362 if (dfa
->mb_cur_max
> 1)
363 for (i
= 0; i
< SBC_MAX
; ++i
)
364 if (__btowc (i
) == WEOF
)
365 re_set_fastmap (fastmap
, icase
, i
);
368 for (i
= 0; i
< cset
->nmbchars
; ++i
)
372 memset (&state
, '\0', sizeof (state
));
373 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
374 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
375 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
377 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
379 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
383 #endif /* RE_ENABLE_I18N */
384 else if (type
== OP_PERIOD
385 #ifdef RE_ENABLE_I18N
386 || type
== OP_UTF8_PERIOD
388 || type
== END_OF_RE
)
390 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
391 if (type
== END_OF_RE
)
392 bufp
->can_be_null
= 1;
398 /* Entry point for POSIX code. */
399 /* regcomp takes a regular expression as a string and compiles it.
401 PREG is a regex_t *. We do not expect any fields to be initialized,
402 since POSIX says we shouldn't. Thus, we set
404 `buffer' to the compiled pattern;
405 `used' to the length of the compiled pattern;
406 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
407 REG_EXTENDED bit in CFLAGS is set; otherwise, to
408 RE_SYNTAX_POSIX_BASIC;
409 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
410 `fastmap' to an allocated space for the fastmap;
411 `fastmap_accurate' to zero;
412 `re_nsub' to the number of subexpressions in PATTERN.
414 PATTERN is the address of the pattern string.
416 CFLAGS is a series of bits which affect compilation.
418 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
419 use POSIX basic syntax.
421 If REG_NEWLINE is set, then . and [^...] don't match newline.
422 Also, regexec will try a match beginning after every newline.
424 If REG_ICASE is set, then we considers upper- and lowercase
425 versions of letters to be equivalent when matching.
427 If REG_NOSUB is set, then when PREG is passed to regexec, that
428 routine will report only success or failure, and nothing about the
431 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
432 the return codes and their meanings.) */
435 regcomp (regex_t
*__restrict preg
,
436 const char *__restrict pattern
,
440 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
441 : RE_SYNTAX_POSIX_BASIC
);
447 /* Try to allocate space for the fastmap. */
448 preg
->fastmap
= re_malloc (char, SBC_MAX
);
449 if (BE (preg
->fastmap
== NULL
, 0))
452 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
454 /* If REG_NEWLINE is set, newlines are treated differently. */
455 if (cflags
& REG_NEWLINE
)
456 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
457 syntax
&= ~RE_DOT_NEWLINE
;
458 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
459 /* It also changes the matching behavior. */
460 preg
->newline_anchor
= 1;
463 preg
->newline_anchor
= 0;
464 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
465 preg
->translate
= NULL
;
467 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
469 /* POSIX doesn't distinguish between an unmatched open-group and an
470 unmatched close-group: both are REG_EPAREN. */
471 if (ret
== REG_ERPAREN
)
474 /* We have already checked preg->fastmap != NULL. */
475 if (BE (ret
== REG_NOERROR
, 1))
476 /* Compute the fastmap now, since regexec cannot modify the pattern
477 buffer. This function never fails in this implementation. */
478 (void) re_compile_fastmap (preg
);
481 /* Some error occurred while compiling the expression. */
482 re_free (preg
->fastmap
);
483 preg
->fastmap
= NULL
;
489 /* Returns a message corresponding to an error code, ERRCODE, returned
490 from either regcomp or regexec. We don't use PREG here. */
493 regerror (int errcode
,
494 const regex_t
*__restrict preg
,
495 char *__restrict errbuf
,
502 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
503 / sizeof (__re_error_msgid_idx
[0])), 0))
504 /* Only error codes returned by the rest of the code should be passed
505 to this routine. If we are given anything else, or if other regex
506 code generates an invalid error code, then the program has a bug.
507 Dump core so we can fix it. */
510 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
512 msg_size
= strlen (msg
) + 1; /* Includes the null. */
514 if (BE (errbuf_size
!= 0, 1))
516 if (BE (msg_size
> errbuf_size
, 0))
518 memcpy (errbuf
, msg
, errbuf_size
- 1);
519 errbuf
[errbuf_size
- 1] = 0;
522 memcpy (errbuf
, msg
, msg_size
);
529 #ifdef RE_ENABLE_I18N
530 /* This static array is used for the map to single-byte characters when
531 UTF-8 is used. Otherwise we would allocate memory just to initialize
532 it the same all the time. UTF-8 is the preferred encoding so this is
533 a worthwhile optimization. */
534 static const bitset_t utf8_sb_map
=
536 /* Set the first 128 bits. */
537 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
543 free_dfa_content (re_dfa_t
*dfa
)
548 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
549 free_token (dfa
->nodes
+ i
);
550 re_free (dfa
->nexts
);
551 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
553 if (dfa
->eclosures
!= NULL
)
554 re_node_set_free (dfa
->eclosures
+ i
);
555 if (dfa
->inveclosures
!= NULL
)
556 re_node_set_free (dfa
->inveclosures
+ i
);
557 if (dfa
->edests
!= NULL
)
558 re_node_set_free (dfa
->edests
+ i
);
560 re_free (dfa
->edests
);
561 re_free (dfa
->eclosures
);
562 re_free (dfa
->inveclosures
);
563 re_free (dfa
->nodes
);
565 if (dfa
->state_table
)
566 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
568 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
569 for (j
= 0; j
< entry
->num
; ++j
)
571 re_dfastate_t
*state
= entry
->array
[j
];
574 re_free (entry
->array
);
576 re_free (dfa
->state_table
);
577 #ifdef RE_ENABLE_I18N
578 if (dfa
->sb_char
!= utf8_sb_map
)
579 re_free (dfa
->sb_char
);
581 re_free (dfa
->subexp_map
);
583 re_free (dfa
->re_str
);
590 /* Free dynamically allocated space used by PREG. */
593 regfree (regex_t
*preg
)
595 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
596 if (BE (dfa
!= NULL
, 1))
597 free_dfa_content (dfa
);
601 re_free (preg
->fastmap
);
602 preg
->fastmap
= NULL
;
604 re_free (preg
->translate
);
605 preg
->translate
= NULL
;
607 libc_hidden_def(regfree
)
609 /* Entry points compatible with 4.2 BSD regex library. We don't define
610 them unless specifically requested. */
612 #if defined _REGEX_RE_COMP || defined __UCLIBC__
614 /* BSD has one and only one pattern buffer. */
615 static struct re_pattern_buffer
*re_comp_buf
;
618 /* Make BCD definitions weak in libc, so POSIX programs can redefine
619 these names if they don't use our functions, and still use
620 regcomp/regexec above without link errors. */
622 re_comp (const char *s
)
626 /* "If re_comp() is passed NULL or a null string, it returns
627 * without changing the currently compiled regular expression." */
631 return gettext ("No previous regular expression");
637 re_comp_buf
= calloc (1, sizeof(*re_comp_buf
));
645 if (re_comp_buf
->buffer
)
647 regfree (re_comp_buf
);
648 memset (re_comp_buf
, '\0', sizeof(*re_comp_buf
));
651 if (re_comp_buf
->fastmap
== NULL
)
653 re_comp_buf
->fastmap
= malloc (SBC_MAX
);
654 if (re_comp_buf
->fastmap
== NULL
)
661 /* Since `re_exec' always passes NULL for the `regs' argument, we
662 don't need to initialize the pattern buffer fields which affect it. */
664 /* Match anchors at newlines. */
665 re_comp_buf
->newline_anchor
= 1;
667 ret
= re_compile_internal (re_comp_buf
, s
, strlen (s
), re_syntax_options
);
675 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
676 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
680 libc_freeres_fn (free_mem
)
682 regfree (re_comp_buf
);
688 #endif /* _REGEX_RE_COMP */
690 /* Internal entry point.
691 Compile the regular expression PATTERN, whose length is LENGTH.
692 SYNTAX indicate regular expression's syntax. */
695 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
698 reg_errcode_t err
= REG_NOERROR
;
702 /* Initialize the pattern buffer. */
703 preg
->fastmap_accurate
= 0;
704 preg
->syntax
= syntax
;
705 preg
->not_bol
= preg
->not_eol
= 0;
708 preg
->can_be_null
= 0;
709 preg
->regs_allocated
= REGS_UNALLOCATED
;
711 /* Initialize the dfa. */
712 dfa
= (re_dfa_t
*) preg
->buffer
;
713 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
715 /* If zero allocated, but buffer is non-null, try to realloc
716 enough space. This loses if buffer's address is bogus, but
717 that is the user's responsibility. If ->buffer is NULL this
718 is a simple allocation. */
719 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
722 preg
->allocated
= sizeof (re_dfa_t
);
723 preg
->buffer
= (unsigned char *) dfa
;
725 preg
->used
= sizeof (re_dfa_t
);
727 err
= init_dfa (dfa
, length
);
728 if (BE (err
!= REG_NOERROR
, 0))
730 free_dfa_content (dfa
);
736 /* Note: length+1 will not overflow since it is checked in init_dfa. */
737 dfa
->re_str
= re_malloc (char, length
+ 1);
738 strncpy (dfa
->re_str
, pattern
, length
+ 1);
741 __libc_lock_init (dfa
->lock
);
743 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
744 syntax
& RE_ICASE
, dfa
);
745 if (BE (err
!= REG_NOERROR
, 0))
747 re_compile_internal_free_return
:
748 free_workarea_compile (preg
);
749 re_string_destruct (®exp
);
750 free_dfa_content (dfa
);
756 /* Parse the regular expression, and build a structure tree. */
758 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
759 if (BE (dfa
->str_tree
== NULL
, 0))
760 goto re_compile_internal_free_return
;
762 /* Analyze the tree and create the nfa. */
763 err
= analyze (preg
);
764 if (BE (err
!= REG_NOERROR
, 0))
765 goto re_compile_internal_free_return
;
767 #ifdef RE_ENABLE_I18N
768 /* If possible, do searching in single byte encoding to speed things up. */
769 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
773 /* Then create the initial state of the dfa. */
774 err
= create_initial_state (dfa
);
776 /* Release work areas. */
777 free_workarea_compile (preg
);
778 re_string_destruct (®exp
);
780 if (BE (err
!= REG_NOERROR
, 0))
782 free_dfa_content (dfa
);
790 /* Initialize DFA. We use the length of the regular expression PAT_LEN
791 as the initial length of some arrays. */
794 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
796 unsigned int table_size
;
801 memset (dfa
, '\0', sizeof (re_dfa_t
));
803 /* Force allocation of str_tree_storage the first time. */
804 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
806 /* Avoid overflows. */
807 if (pat_len
== SIZE_MAX
)
810 dfa
->nodes_alloc
= pat_len
+ 1;
811 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size
= 1; ; table_size
<<= 1)
815 if (table_size
> pat_len
)
818 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
819 dfa
->state_hash_mask
= table_size
- 1;
821 dfa
->mb_cur_max
= MB_CUR_MAX
;
823 if (dfa
->mb_cur_max
== 6
824 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
826 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name
= nl_langinfo (CODESET
);
832 codeset_name
= getenv ("LC_ALL");
833 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
834 codeset_name
= getenv ("LC_CTYPE");
835 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
836 codeset_name
= getenv ("LANG");
837 if (codeset_name
== NULL
)
839 else if (strchr (codeset_name
, '.') != NULL
)
840 codeset_name
= strchr (codeset_name
, '.') + 1;
843 if (strcasecmp (codeset_name
, "UTF-8") == 0
844 || strcasecmp (codeset_name
, "UTF8") == 0)
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa
->map_notascii
= 0;
852 #ifdef RE_ENABLE_I18N
853 if (dfa
->mb_cur_max
> 1)
856 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
861 dfa
->sb_char
= calloc (sizeof (bitset_t
), 1);
862 if (BE (dfa
->sb_char
== NULL
, 0))
865 /* Set the bits corresponding to single byte chars. */
866 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
867 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
869 wint_t wch
= __btowc (ch
);
871 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
873 if (isascii (ch
) && wch
!= ch
)
874 dfa
->map_notascii
= 1;
881 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
886 /* Initialize WORD_CHAR table, which indicate which character is
887 "word". In this case "word" means that it is the word construction
888 character used by some operators like "\<", "\>", etc. */
892 init_word_char (re_dfa_t
*dfa
)
895 dfa
->word_ops_used
= 1;
896 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
897 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
898 if (isalnum (ch
) || ch
== '_')
899 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
902 /* Free the work area which are only used while compiling. */
905 free_workarea_compile (regex_t
*preg
)
907 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
908 bin_tree_storage_t
*storage
, *next
;
909 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
911 next
= storage
->next
;
914 dfa
->str_tree_storage
= NULL
;
915 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
916 dfa
->str_tree
= NULL
;
917 re_free (dfa
->org_indices
);
918 dfa
->org_indices
= NULL
;
921 /* Create initial states for all contexts. */
924 create_initial_state (re_dfa_t
*dfa
)
928 re_node_set init_nodes
;
930 /* Initial states have the epsilon closure of the node which is
931 the first node of the regular expression. */
932 first
= dfa
->str_tree
->first
->node_idx
;
933 dfa
->init_node
= first
;
934 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
935 if (BE (err
!= REG_NOERROR
, 0))
938 /* The back-references which are in initial states can epsilon transit,
939 since in this case all of the subexpressions can be null.
940 Then we add epsilon closures of the nodes which are the next nodes of
941 the back-references. */
942 if (dfa
->nbackref
> 0)
943 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
945 int node_idx
= init_nodes
.elems
[i
];
946 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
949 if (type
!= OP_BACK_REF
)
951 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
953 re_token_t
*clexp_node
;
954 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
955 if (clexp_node
->type
== OP_CLOSE_SUBEXP
956 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
959 if (clexp_idx
== init_nodes
.nelem
)
962 if (type
== OP_BACK_REF
)
964 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
965 if (!re_node_set_contains (&init_nodes
, dest_idx
))
967 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
973 /* It must be the first time to invoke acquire_state. */
974 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
975 /* We don't check ERR here, since the initial state must not be NULL. */
976 if (BE (dfa
->init_state
== NULL
, 0))
978 if (dfa
->init_state
->has_constraint
)
980 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
982 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
984 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
988 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
989 || dfa
->init_state_begbuf
== NULL
, 0))
993 dfa
->init_state_word
= dfa
->init_state_nl
994 = dfa
->init_state_begbuf
= dfa
->init_state
;
996 re_node_set_free (&init_nodes
);
1000 #ifdef RE_ENABLE_I18N
1001 /* If it is possible to do searching in single byte encoding instead of UTF-8
1002 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1003 DFA nodes where needed. */
1006 optimize_utf8 (re_dfa_t
*dfa
)
1008 int node
, i
, mb_chars
= 0, has_period
= 0;
1010 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1011 switch (dfa
->nodes
[node
].type
)
1014 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1018 switch (dfa
->nodes
[node
].opr
.idx
)
1026 /* Word anchors etc. cannot be handled. */
1036 case OP_DUP_ASTERISK
:
1037 case OP_OPEN_SUBEXP
:
1038 case OP_CLOSE_SUBEXP
:
1040 case COMPLEX_BRACKET
:
1042 case SIMPLE_BRACKET
:
1043 /* Just double check. The non-ASCII range starts at 0x80. */
1044 assert (0x80 % BITSET_WORD_BITS
== 0);
1045 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1046 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1053 if (mb_chars
|| has_period
)
1054 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1056 if (dfa
->nodes
[node
].type
== CHARACTER
1057 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1058 dfa
->nodes
[node
].mb_partial
= 0;
1059 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1060 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1063 /* The search can be in single byte locale. */
1064 dfa
->mb_cur_max
= 1;
1066 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1070 /* Analyze the structure tree, and calculate "first", "next", "edest",
1071 "eclosure", and "inveclosure". */
1073 static reg_errcode_t
1074 analyze (regex_t
*preg
)
1076 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1079 /* Allocate arrays. */
1080 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1081 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1082 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1083 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1084 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1085 || dfa
->eclosures
== NULL
, 0))
1088 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1089 if (dfa
->subexp_map
!= NULL
)
1092 for (i
= 0; i
< preg
->re_nsub
; i
++)
1093 dfa
->subexp_map
[i
] = i
;
1094 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1095 for (i
= 0; i
< preg
->re_nsub
; i
++)
1096 if (dfa
->subexp_map
[i
] != i
)
1098 if (i
== preg
->re_nsub
)
1100 free (dfa
->subexp_map
);
1101 dfa
->subexp_map
= NULL
;
1105 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1106 if (BE (ret
!= REG_NOERROR
, 0))
1108 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1109 if (BE (ret
!= REG_NOERROR
, 0))
1111 preorder (dfa
->str_tree
, calc_next
, dfa
);
1112 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1113 if (BE (ret
!= REG_NOERROR
, 0))
1115 ret
= calc_eclosure (dfa
);
1116 if (BE (ret
!= REG_NOERROR
, 0))
1119 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1120 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1121 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1124 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1125 if (BE (dfa
->inveclosures
== NULL
, 0))
1127 ret
= calc_inveclosure (dfa
);
1133 /* Our parse trees are very unbalanced, so we cannot use a stack to
1134 implement parse tree visits. Instead, we use parent pointers and
1135 some hairy code in these two functions. */
1136 static reg_errcode_t
1137 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1140 bin_tree_t
*node
, *prev
;
1142 for (node
= root
; ; )
1144 /* Descend down the tree, preferably to the left (or to the right
1145 if that's the only child). */
1146 while (node
->left
|| node
->right
)
1154 reg_errcode_t err
= fn (extra
, node
);
1155 if (BE (err
!= REG_NOERROR
, 0))
1157 if (node
->parent
== NULL
)
1160 node
= node
->parent
;
1162 /* Go up while we have a node that is reached from the right. */
1163 while (node
->right
== prev
|| node
->right
== NULL
);
1168 static reg_errcode_t
1169 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1174 for (node
= root
; ; )
1176 reg_errcode_t err
= fn (extra
, node
);
1177 if (BE (err
!= REG_NOERROR
, 0))
1180 /* Go to the left node, or up and to the right. */
1185 bin_tree_t
*prev
= NULL
;
1186 while (node
->right
== prev
|| node
->right
== NULL
)
1189 node
= node
->parent
;
1198 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1199 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1200 backreferences as well. Requires a preorder visit. */
1201 static reg_errcode_t
1202 optimize_subexps (void *extra
, bin_tree_t
*node
)
1204 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1206 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1208 int idx
= node
->token
.opr
.idx
;
1209 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1210 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1213 else if (node
->token
.type
== SUBEXP
1214 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1216 int other_idx
= node
->left
->token
.opr
.idx
;
1218 node
->left
= node
->left
->left
;
1220 node
->left
->parent
= node
;
1222 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1223 if (other_idx
< BITSET_WORD_BITS
)
1224 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1230 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1231 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1232 static reg_errcode_t
1233 lower_subexps (void *extra
, bin_tree_t
*node
)
1235 regex_t
*preg
= (regex_t
*) extra
;
1236 reg_errcode_t err
= REG_NOERROR
;
1238 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1240 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1242 node
->left
->parent
= node
;
1244 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1246 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1248 node
->right
->parent
= node
;
1255 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1257 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1258 bin_tree_t
*body
= node
->left
;
1259 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1262 /* We do not optimize empty subexpressions, because otherwise we may
1263 have bad CONCAT nodes with NULL children. This is obviously not
1264 very common, so we do not lose much. An example that triggers
1265 this case is the sed "script" /\(\)/x. */
1266 && node
->left
!= NULL
1267 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1268 || !(dfa
->used_bkref_map
1269 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1272 /* Convert the SUBEXP node to the concatenation of an
1273 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1274 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1275 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1276 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1277 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1278 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1284 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1285 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1289 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1290 nodes. Requires a postorder visit. */
1291 static reg_errcode_t
1292 calc_first (void *extra
, bin_tree_t
*node
)
1294 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1295 if (node
->token
.type
== CONCAT
)
1297 node
->first
= node
->left
->first
;
1298 node
->node_idx
= node
->left
->node_idx
;
1303 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1304 if (BE (node
->node_idx
== -1, 0))
1310 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1311 static reg_errcode_t
1312 calc_next (void *extra
, bin_tree_t
*node
)
1314 switch (node
->token
.type
)
1316 case OP_DUP_ASTERISK
:
1317 node
->left
->next
= node
;
1320 node
->left
->next
= node
->right
->first
;
1321 node
->right
->next
= node
->next
;
1325 node
->left
->next
= node
->next
;
1327 node
->right
->next
= node
->next
;
1333 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1334 static reg_errcode_t
1335 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1337 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1338 int idx
= node
->node_idx
;
1339 reg_errcode_t err
= REG_NOERROR
;
1341 switch (node
->token
.type
)
1347 assert (node
->next
== NULL
);
1350 case OP_DUP_ASTERISK
:
1354 dfa
->has_plural_match
= 1;
1355 if (node
->left
!= NULL
)
1356 left
= node
->left
->first
->node_idx
;
1358 left
= node
->next
->node_idx
;
1359 if (node
->right
!= NULL
)
1360 right
= node
->right
->first
->node_idx
;
1362 right
= node
->next
->node_idx
;
1364 assert (right
> -1);
1365 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1370 case OP_OPEN_SUBEXP
:
1371 case OP_CLOSE_SUBEXP
:
1372 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1376 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1377 if (node
->token
.type
== OP_BACK_REF
)
1378 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1382 assert (!IS_EPSILON_NODE (node
->token
.type
));
1383 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1390 /* Duplicate the epsilon closure of the node ROOT_NODE.
1391 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1392 to their own constraint. */
1394 static reg_errcode_t
1396 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1397 int root_node
, unsigned int init_constraint
)
1399 int org_node
, clone_node
, ret
;
1400 unsigned int constraint
= init_constraint
;
1401 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1403 int org_dest
, clone_dest
;
1404 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1406 /* If the back reference epsilon-transit, its destination must
1407 also have the constraint. Then duplicate the epsilon closure
1408 of the destination of the back reference, and store it in
1409 edests of the back reference. */
1410 org_dest
= dfa
->nexts
[org_node
];
1411 re_node_set_empty (dfa
->edests
+ clone_node
);
1412 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1413 if (BE (clone_dest
== -1, 0))
1415 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1416 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1417 if (BE (ret
< 0, 0))
1420 else if (dfa
->edests
[org_node
].nelem
== 0)
1422 /* In case of the node can't epsilon-transit, don't duplicate the
1423 destination and store the original destination as the
1424 destination of the node. */
1425 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1428 else if (dfa
->edests
[org_node
].nelem
== 1)
1430 /* In case of the node can epsilon-transit, and it has only one
1432 org_dest
= dfa
->edests
[org_node
].elems
[0];
1433 re_node_set_empty (dfa
->edests
+ clone_node
);
1434 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1436 /* In case of the node has another constraint, append it. */
1437 if (org_node
== root_node
&& clone_node
!= org_node
)
1439 /* ...but if the node is root_node itself, it means the
1440 epsilon closure have a loop, then tie it to the
1441 destination of the root_node. */
1442 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1444 if (BE (ret
< 0, 0))
1448 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1450 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1451 if (BE (clone_dest
== -1, 0))
1453 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1454 if (BE (ret
< 0, 0))
1457 else /* dfa->edests[org_node].nelem == 2 */
1459 /* In case of the node can epsilon-transit, and it has two
1460 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1461 org_dest
= dfa
->edests
[org_node
].elems
[0];
1462 re_node_set_empty (dfa
->edests
+ clone_node
);
1463 /* Search for a duplicated node which satisfies the constraint. */
1464 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1465 if (clone_dest
== -1)
1467 /* There are no such a duplicated node, create a new one. */
1469 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1470 if (BE (clone_dest
== -1, 0))
1472 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1473 if (BE (ret
< 0, 0))
1475 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1476 root_node
, constraint
);
1477 if (BE (err
!= REG_NOERROR
, 0))
1482 /* There are a duplicated node which satisfy the constraint,
1483 use it to avoid infinite loop. */
1484 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1485 if (BE (ret
< 0, 0))
1489 org_dest
= dfa
->edests
[org_node
].elems
[1];
1490 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1491 if (BE (clone_dest
== -1, 0))
1493 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1494 if (BE (ret
< 0, 0))
1497 org_node
= org_dest
;
1498 clone_node
= clone_dest
;
1503 /* Search for a node which is duplicated from the node ORG_NODE, and
1504 satisfies the constraint CONSTRAINT. */
1507 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1508 unsigned int constraint
)
1511 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1513 if (org_node
== dfa
->org_indices
[idx
]
1514 && constraint
== dfa
->nodes
[idx
].constraint
)
1515 return idx
; /* Found. */
1517 return -1; /* Not found. */
1520 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1521 Return the index of the new node, or -1 if insufficient storage is
1525 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1527 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1528 if (BE (dup_idx
!= -1, 1))
1530 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1531 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1532 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1533 dfa
->nodes
[dup_idx
].duplicated
= 1;
1535 /* Store the index of the original node. */
1536 dfa
->org_indices
[dup_idx
] = org_idx
;
1541 static reg_errcode_t
1542 calc_inveclosure (re_dfa_t
*dfa
)
1545 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1546 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1548 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1550 int *elems
= dfa
->eclosures
[src
].elems
;
1551 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1553 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1554 if (BE (ret
== -1, 0))
1562 /* Calculate "eclosure" for all the node in DFA. */
1564 static reg_errcode_t
1565 calc_eclosure (re_dfa_t
*dfa
)
1567 int node_idx
, incomplete
;
1569 assert (dfa
->nodes_len
> 0);
1572 /* For each nodes, calculate epsilon closure. */
1573 for (node_idx
= 0; ; ++node_idx
)
1576 re_node_set eclosure_elem
;
1577 if (node_idx
== dfa
->nodes_len
)
1586 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1589 /* If we have already calculated, skip it. */
1590 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1592 /* Calculate epsilon closure of `node_idx'. */
1593 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1594 if (BE (err
!= REG_NOERROR
, 0))
1597 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1600 re_node_set_free (&eclosure_elem
);
1606 /* Calculate epsilon closure of NODE. */
1608 static reg_errcode_t
1609 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1612 unsigned int constraint
;
1614 re_node_set eclosure
;
1616 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1617 if (BE (err
!= REG_NOERROR
, 0))
1620 /* This indicates that we are calculating this node now.
1621 We reference this value to avoid infinite loop. */
1622 dfa
->eclosures
[node
].nelem
= -1;
1624 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1625 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1626 /* If the current node has constraints, duplicate all nodes.
1627 Since they must inherit the constraints. */
1629 && dfa
->edests
[node
].nelem
1630 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1632 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1633 if (BE (err
!= REG_NOERROR
, 0))
1637 /* Expand each epsilon destination nodes. */
1638 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1639 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1641 re_node_set eclosure_elem
;
1642 int edest
= dfa
->edests
[node
].elems
[i
];
1643 /* If calculating the epsilon closure of `edest' is in progress,
1644 return intermediate result. */
1645 if (dfa
->eclosures
[edest
].nelem
== -1)
1650 /* If we haven't calculated the epsilon closure of `edest' yet,
1651 calculate now. Otherwise use calculated epsilon closure. */
1652 if (dfa
->eclosures
[edest
].nelem
== 0)
1654 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1655 if (BE (err
!= REG_NOERROR
, 0))
1659 eclosure_elem
= dfa
->eclosures
[edest
];
1660 /* Merge the epsilon closure of `edest'. */
1661 re_node_set_merge (&eclosure
, &eclosure_elem
);
1662 /* If the epsilon closure of `edest' is incomplete,
1663 the epsilon closure of this node is also incomplete. */
1664 if (dfa
->eclosures
[edest
].nelem
== 0)
1667 re_node_set_free (&eclosure_elem
);
1671 /* Epsilon closures include itself. */
1672 re_node_set_insert (&eclosure
, node
);
1673 if (incomplete
&& !root
)
1674 dfa
->eclosures
[node
].nelem
= 0;
1676 dfa
->eclosures
[node
] = eclosure
;
1677 *new_set
= eclosure
;
1681 /* Functions for token which are used in the parser. */
1683 /* Fetch a token from INPUT.
1684 We must not use this function inside bracket expressions. */
1688 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1690 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1693 /* Peek a token from INPUT, and return the length of the token.
1694 We must not use this function inside bracket expressions. */
1698 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1702 if (re_string_eoi (input
))
1704 token
->type
= END_OF_RE
;
1708 c
= re_string_peek_byte (input
, 0);
1711 token
->word_char
= 0;
1712 #ifdef RE_ENABLE_I18N
1713 token
->mb_partial
= 0;
1714 if (input
->mb_cur_max
> 1 &&
1715 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1717 token
->type
= CHARACTER
;
1718 token
->mb_partial
= 1;
1725 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1727 token
->type
= BACK_SLASH
;
1731 c2
= re_string_peek_byte_case (input
, 1);
1733 token
->type
= CHARACTER
;
1734 #ifdef RE_ENABLE_I18N
1735 if (input
->mb_cur_max
> 1)
1737 wint_t wc
= re_string_wchar_at (input
,
1738 re_string_cur_idx (input
) + 1);
1739 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1743 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1748 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1749 token
->type
= OP_ALT
;
1751 case '1': case '2': case '3': case '4': case '5':
1752 case '6': case '7': case '8': case '9':
1753 if (!(syntax
& RE_NO_BK_REFS
))
1755 token
->type
= OP_BACK_REF
;
1756 token
->opr
.idx
= c2
- '1';
1760 if (!(syntax
& RE_NO_GNU_OPS
))
1762 token
->type
= ANCHOR
;
1763 token
->opr
.ctx_type
= WORD_FIRST
;
1767 if (!(syntax
& RE_NO_GNU_OPS
))
1769 token
->type
= ANCHOR
;
1770 token
->opr
.ctx_type
= WORD_LAST
;
1774 if (!(syntax
& RE_NO_GNU_OPS
))
1776 token
->type
= ANCHOR
;
1777 token
->opr
.ctx_type
= WORD_DELIM
;
1781 if (!(syntax
& RE_NO_GNU_OPS
))
1783 token
->type
= ANCHOR
;
1784 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1788 if (!(syntax
& RE_NO_GNU_OPS
))
1789 token
->type
= OP_WORD
;
1792 if (!(syntax
& RE_NO_GNU_OPS
))
1793 token
->type
= OP_NOTWORD
;
1796 if (!(syntax
& RE_NO_GNU_OPS
))
1797 token
->type
= OP_SPACE
;
1800 if (!(syntax
& RE_NO_GNU_OPS
))
1801 token
->type
= OP_NOTSPACE
;
1804 if (!(syntax
& RE_NO_GNU_OPS
))
1806 token
->type
= ANCHOR
;
1807 token
->opr
.ctx_type
= BUF_FIRST
;
1811 if (!(syntax
& RE_NO_GNU_OPS
))
1813 token
->type
= ANCHOR
;
1814 token
->opr
.ctx_type
= BUF_LAST
;
1818 if (!(syntax
& RE_NO_BK_PARENS
))
1819 token
->type
= OP_OPEN_SUBEXP
;
1822 if (!(syntax
& RE_NO_BK_PARENS
))
1823 token
->type
= OP_CLOSE_SUBEXP
;
1826 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1827 token
->type
= OP_DUP_PLUS
;
1830 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1831 token
->type
= OP_DUP_QUESTION
;
1834 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1835 token
->type
= OP_OPEN_DUP_NUM
;
1838 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1839 token
->type
= OP_CLOSE_DUP_NUM
;
1847 token
->type
= CHARACTER
;
1848 #ifdef RE_ENABLE_I18N
1849 if (input
->mb_cur_max
> 1)
1851 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1852 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1856 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1861 if (syntax
& RE_NEWLINE_ALT
)
1862 token
->type
= OP_ALT
;
1865 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1866 token
->type
= OP_ALT
;
1869 token
->type
= OP_DUP_ASTERISK
;
1872 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1873 token
->type
= OP_DUP_PLUS
;
1876 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1877 token
->type
= OP_DUP_QUESTION
;
1880 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1881 token
->type
= OP_OPEN_DUP_NUM
;
1884 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1885 token
->type
= OP_CLOSE_DUP_NUM
;
1888 if (syntax
& RE_NO_BK_PARENS
)
1889 token
->type
= OP_OPEN_SUBEXP
;
1892 if (syntax
& RE_NO_BK_PARENS
)
1893 token
->type
= OP_CLOSE_SUBEXP
;
1896 token
->type
= OP_OPEN_BRACKET
;
1899 token
->type
= OP_PERIOD
;
1902 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1903 re_string_cur_idx (input
) != 0)
1905 char prev
= re_string_peek_byte (input
, -1);
1906 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1909 token
->type
= ANCHOR
;
1910 token
->opr
.ctx_type
= LINE_FIRST
;
1913 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1914 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1917 re_string_skip_bytes (input
, 1);
1918 peek_token (&next
, input
, syntax
);
1919 re_string_skip_bytes (input
, -1);
1920 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1923 token
->type
= ANCHOR
;
1924 token
->opr
.ctx_type
= LINE_LAST
;
1932 /* Peek a token from INPUT, and return the length of the token.
1933 We must not use this function out of bracket expressions. */
1937 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1940 if (re_string_eoi (input
))
1942 token
->type
= END_OF_RE
;
1945 c
= re_string_peek_byte (input
, 0);
1948 #ifdef RE_ENABLE_I18N
1949 if (input
->mb_cur_max
> 1 &&
1950 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1952 token
->type
= CHARACTER
;
1957 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1958 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1960 /* In this case, '\' escape a character. */
1962 re_string_skip_bytes (input
, 1);
1963 c2
= re_string_peek_byte (input
, 0);
1965 token
->type
= CHARACTER
;
1968 if (c
== '[') /* '[' is a special char in a bracket exps. */
1972 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
1973 c2
= re_string_peek_byte (input
, 1);
1981 token
->type
= OP_OPEN_COLL_ELEM
;
1984 token
->type
= OP_OPEN_EQUIV_CLASS
;
1987 if (syntax
& RE_CHAR_CLASSES
)
1989 token
->type
= OP_OPEN_CHAR_CLASS
;
1992 /* else fall through. */
1994 token
->type
= CHARACTER
;
2004 token
->type
= OP_CHARSET_RANGE
;
2007 token
->type
= OP_CLOSE_BRACKET
;
2010 token
->type
= OP_NON_MATCH_LIST
;
2013 token
->type
= CHARACTER
;
2018 /* Functions for parser. */
2020 /* Entry point of the parser.
2021 Parse the regular expression REGEXP and return the structure tree.
2022 If an error is occured, ERR is set by error code, and return NULL.
2023 This function build the following tree, from regular expression <reg_exp>:
2029 CAT means concatenation.
2030 EOR means end of regular expression. */
2033 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2036 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2037 bin_tree_t
*tree
, *eor
, *root
;
2038 re_token_t current_token
;
2039 dfa
->syntax
= syntax
;
2040 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2041 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2042 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2044 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2046 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2049 if (BE (eor
== NULL
|| root
== NULL
, 0))
2057 /* This function build the following tree, from regular expression
2058 <branch1>|<branch2>:
2064 ALT means alternative, which represents the operator `|'. */
2067 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2068 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2070 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2071 bin_tree_t
*tree
, *branch
= NULL
;
2072 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2073 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2076 while (token
->type
== OP_ALT
)
2078 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2079 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2080 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2082 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2083 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2088 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2089 if (BE (tree
== NULL
, 0))
2098 /* This function build the following tree, from regular expression
2105 CAT means concatenation. */
2108 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2109 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2111 bin_tree_t
*tree
, *exp
;
2112 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2113 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2114 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2117 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2118 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2120 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2121 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2125 if (tree
!= NULL
&& exp
!= NULL
)
2127 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2134 else if (tree
== NULL
)
2136 /* Otherwise exp == NULL, we don't need to create new tree. */
2141 /* This function build the following tree, from regular expression a*:
2148 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2149 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2151 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2153 switch (token
->type
)
2156 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2157 if (BE (tree
== NULL
, 0))
2162 #ifdef RE_ENABLE_I18N
2163 if (dfa
->mb_cur_max
> 1)
2165 while (!re_string_eoi (regexp
)
2166 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2168 bin_tree_t
*mbc_remain
;
2169 fetch_token (token
, regexp
, syntax
);
2170 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2171 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2172 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2181 case OP_OPEN_SUBEXP
:
2182 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2183 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2186 case OP_OPEN_BRACKET
:
2187 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2188 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2192 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2197 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2198 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2199 if (BE (tree
== NULL
, 0))
2205 dfa
->has_mb_node
= 1;
2207 case OP_OPEN_DUP_NUM
:
2208 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2214 case OP_DUP_ASTERISK
:
2216 case OP_DUP_QUESTION
:
2217 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2222 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2224 fetch_token (token
, regexp
, syntax
);
2225 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2227 /* else fall through */
2228 case OP_CLOSE_SUBEXP
:
2229 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2230 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2235 /* else fall through */
2236 case OP_CLOSE_DUP_NUM
:
2237 /* We treat it as a normal character. */
2239 /* Then we can these characters as normal characters. */
2240 token
->type
= CHARACTER
;
2241 /* mb_partial and word_char bits should be initialized already
2243 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2244 if (BE (tree
== NULL
, 0))
2251 if ((token
->opr
.ctx_type
2252 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2253 && dfa
->word_ops_used
== 0)
2254 init_word_char (dfa
);
2255 if (token
->opr
.ctx_type
== WORD_DELIM
2256 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2258 bin_tree_t
*tree_first
, *tree_last
;
2259 if (token
->opr
.ctx_type
== WORD_DELIM
)
2261 token
->opr
.ctx_type
= WORD_FIRST
;
2262 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2263 token
->opr
.ctx_type
= WORD_LAST
;
2267 token
->opr
.ctx_type
= INSIDE_WORD
;
2268 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2269 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2271 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2272 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2273 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2281 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2282 if (BE (tree
== NULL
, 0))
2288 /* We must return here, since ANCHORs can't be followed
2289 by repetition operators.
2290 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2291 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2292 fetch_token (token
, regexp
, syntax
);
2295 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2296 if (BE (tree
== NULL
, 0))
2301 if (dfa
->mb_cur_max
> 1)
2302 dfa
->has_mb_node
= 1;
2306 tree
= build_charclass_op (dfa
, regexp
->trans
,
2307 (const unsigned char *) "alnum",
2308 (const unsigned char *) "_",
2309 token
->type
== OP_NOTWORD
, err
);
2310 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2315 tree
= build_charclass_op (dfa
, regexp
->trans
,
2316 (const unsigned char *) "space",
2317 (const unsigned char *) "",
2318 token
->type
== OP_NOTSPACE
, err
);
2319 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2329 /* Must not happen? */
2335 fetch_token (token
, regexp
, syntax
);
2337 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2338 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2340 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2341 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2343 /* In BRE consecutive duplications are not allowed. */
2344 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2345 && (token
->type
== OP_DUP_ASTERISK
2346 || token
->type
== OP_OPEN_DUP_NUM
))
2356 /* This function build the following tree, from regular expression
2364 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2365 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2367 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2370 cur_nsub
= preg
->re_nsub
++;
2372 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2374 /* The subexpression may be a null string. */
2375 if (token
->type
== OP_CLOSE_SUBEXP
)
2379 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2380 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2382 if (BE (*err
!= REG_NOERROR
, 0))
2386 if (cur_nsub
<= '9' - '1')
2387 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2389 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2390 if (BE (tree
== NULL
, 0))
2395 tree
->token
.opr
.idx
= cur_nsub
;
2399 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2402 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2403 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2405 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2406 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2407 re_token_t start_token
= *token
;
2409 if (token
->type
== OP_OPEN_DUP_NUM
)
2412 start
= fetch_number (regexp
, token
, syntax
);
2415 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2416 start
= 0; /* We treat "{,m}" as "{0,m}". */
2419 *err
= REG_BADBR
; /* <re>{} is invalid. */
2423 if (BE (start
!= -2, 1))
2425 /* We treat "{n}" as "{n,n}". */
2426 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2427 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2428 ? fetch_number (regexp
, token
, syntax
) : -2));
2430 if (BE (start
== -2 || end
== -2, 0))
2432 /* Invalid sequence. */
2433 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2435 if (token
->type
== END_OF_RE
)
2443 /* If the syntax bit is set, rollback. */
2444 re_string_set_index (regexp
, start_idx
);
2445 *token
= start_token
;
2446 token
->type
= CHARACTER
;
2447 /* mb_partial and word_char bits should be already initialized by
2452 if (BE (end
!= -1 && start
> end
, 0))
2454 /* First number greater than second. */
2461 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2462 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2465 fetch_token (token
, regexp
, syntax
);
2467 if (BE (elem
== NULL
, 0))
2469 if (BE (start
== 0 && end
== 0, 0))
2471 postorder (elem
, free_tree
, NULL
);
2475 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2476 if (BE (start
> 0, 0))
2479 for (i
= 2; i
<= start
; ++i
)
2481 elem
= duplicate_tree (elem
, dfa
);
2482 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2483 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2484 goto parse_dup_op_espace
;
2490 /* Duplicate ELEM before it is marked optional. */
2491 elem
= duplicate_tree (elem
, dfa
);
2497 if (elem
->token
.type
== SUBEXP
)
2498 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2500 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2501 if (BE (tree
== NULL
, 0))
2502 goto parse_dup_op_espace
;
2504 /* This loop is actually executed only when end != -1,
2505 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2506 already created the start+1-th copy. */
2507 for (i
= start
+ 2; i
<= end
; ++i
)
2509 elem
= duplicate_tree (elem
, dfa
);
2510 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2511 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2512 goto parse_dup_op_espace
;
2514 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2515 if (BE (tree
== NULL
, 0))
2516 goto parse_dup_op_espace
;
2520 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2524 parse_dup_op_espace
:
2529 /* Size of the names for collating symbol/equivalence_class/character_class.
2530 I'm not sure, but maybe enough. */
2531 #define BRACKET_NAME_BUF_SIZE 32
2534 /* Local function for parse_bracket_exp only used in case of NOT glibc.
2535 Build the range expression which starts from START_ELEM, and ends
2536 at END_ELEM. The result are written to MBCSET and SBCSET.
2537 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2538 mbcset->range_ends, is a pointer argument sinse we may
2541 static reg_errcode_t
2543 # ifdef RE_ENABLE_I18N
2544 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2545 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2547 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2548 bracket_elem_t
*end_elem
)
2551 unsigned int start_ch
, end_ch
;
2552 /* Equivalence Classes and Character Classes can't be a range start/end. */
2553 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2554 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2558 /* We can handle no multi character collating elements without libc
2560 if (BE ((start_elem
->type
== COLL_SYM
2561 && strlen ((char *) start_elem
->opr
.name
) > 1)
2562 || (end_elem
->type
== COLL_SYM
2563 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2564 return REG_ECOLLATE
;
2566 # ifdef RE_ENABLE_I18N
2571 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2573 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2574 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2576 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2577 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2579 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2580 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2581 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2582 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2583 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2584 return REG_ECOLLATE
;
2585 cmp_buf
[0] = start_wc
;
2586 cmp_buf
[4] = end_wc
;
2587 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2590 /* Got valid collation sequence values, add them as a new entry.
2591 However, for !glibc we have no collation elements: if the
2592 character set is single byte, the single byte character set
2593 that we build below suffices. parse_bracket_exp passes
2594 no MBCSET if dfa->mb_cur_max == 1. */
2597 /* Check the space of the arrays. */
2598 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2600 /* There is not enough space, need realloc. */
2601 wchar_t *new_array_start
, *new_array_end
;
2604 /* +1 in case of mbcset->nranges is 0. */
2605 new_nranges
= 2 * mbcset
->nranges
+ 1;
2606 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2607 are NULL if *range_alloc == 0. */
2608 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2610 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2613 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2616 mbcset
->range_starts
= new_array_start
;
2617 mbcset
->range_ends
= new_array_end
;
2618 *range_alloc
= new_nranges
;
2621 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2622 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2625 /* Build the table for single byte characters. */
2626 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2629 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2630 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2631 bitset_set (sbcset
, wc
);
2634 # else /* not RE_ENABLE_I18N */
2637 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2638 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2640 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2641 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2643 if (start_ch
> end_ch
)
2645 /* Build the table for single byte characters. */
2646 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2647 if (start_ch
<= ch
&& ch
<= end_ch
)
2648 bitset_set (sbcset
, ch
);
2650 # endif /* not RE_ENABLE_I18N */
2656 /* Helper function for parse_bracket_exp only used in case of NOT glibc.
2657 Build the collating element which is represented by NAME.
2658 The result are written to MBCSET and SBCSET.
2659 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2660 pointer argument since we may update it. */
2662 static reg_errcode_t
2664 # ifdef RE_ENABLE_I18N
2665 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2666 int *coll_sym_alloc
, const unsigned char *name
)
2668 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2671 size_t name_len
= strlen ((const char *) name
);
2672 if (BE (name_len
!= 1, 0))
2673 return REG_ECOLLATE
;
2674 bitset_set (sbcset
, name
[0]);
2679 /* This function parse bracket expression like "[abc]", "[a-c]",
2683 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2684 reg_syntax_t syntax
, reg_errcode_t
*err
)
2687 const unsigned char *collseqmb
;
2688 const char *collseqwc
;
2691 const int32_t *symb_table
;
2692 const unsigned char *extra
;
2694 /* Local function for parse_bracket_exp used in glibc.
2695 Seek the collating symbol entry correspondings to NAME.
2696 Return the index of the symbol in the SYMB_TABLE. */
2698 auto __inline__
int32_t
2699 __attribute ((always_inline
))
2700 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2702 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2703 int32_t elem
= hash
% table_size
;
2704 if (symb_table
[2 * elem
] != 0)
2706 int32_t second
= hash
% (table_size
- 2) + 1;
2710 /* First compare the hashing value. */
2711 if (symb_table
[2 * elem
] == hash
2712 /* Compare the length of the name. */
2713 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2714 /* Compare the name. */
2715 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2718 /* Yep, this is the entry. */
2725 while (symb_table
[2 * elem
] != 0);
2730 /* Local function for parse_bracket_exp used in glibc.
2731 Look up the collation sequence value of BR_ELEM.
2732 Return the value if succeeded, UINT_MAX otherwise. */
2734 auto __inline__
unsigned int
2735 __attribute ((always_inline
))
2736 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2738 if (br_elem
->type
== SB_CHAR
)
2741 if (MB_CUR_MAX == 1)
2744 return collseqmb
[br_elem
->opr
.ch
];
2747 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2748 return __collseq_table_lookup (collseqwc
, wc
);
2751 else if (br_elem
->type
== MB_CHAR
)
2753 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2755 else if (br_elem
->type
== COLL_SYM
)
2757 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2761 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2763 if (symb_table
[2 * elem
] != 0)
2765 /* We found the entry. */
2766 idx
= symb_table
[2 * elem
+ 1];
2767 /* Skip the name of collating element name. */
2768 idx
+= 1 + extra
[idx
];
2769 /* Skip the byte sequence of the collating element. */
2770 idx
+= 1 + extra
[idx
];
2771 /* Adjust for the alignment. */
2772 idx
= (idx
+ 3) & ~3;
2773 /* Skip the multibyte collation sequence value. */
2774 idx
+= sizeof (unsigned int);
2775 /* Skip the wide char sequence of the collating element. */
2776 idx
+= sizeof (unsigned int) *
2777 (1 + *(unsigned int *) (extra
+ idx
));
2778 /* Return the collation sequence value. */
2779 return *(unsigned int *) (extra
+ idx
);
2781 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2783 /* No valid character. Match it as a single byte
2785 return collseqmb
[br_elem
->opr
.name
[0]];
2788 else if (sym_name_len
== 1)
2789 return collseqmb
[br_elem
->opr
.name
[0]];
2794 /* Local function for parse_bracket_exp used in glibc.
2795 Build the range expression which starts from START_ELEM, and ends
2796 at END_ELEM. The result are written to MBCSET and SBCSET.
2797 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2798 mbcset->range_ends, is a pointer argument sinse we may
2801 auto __inline__ reg_errcode_t
2802 __attribute ((always_inline
))
2803 build_range_exp (re_charset_t
*mbcset
,
2806 bracket_elem_t
*start_elem
,
2807 bracket_elem_t
*end_elem
)
2810 uint32_t start_collseq
;
2811 uint32_t end_collseq
;
2813 /* Equivalence Classes and Character Classes can't be a range
2815 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2816 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2820 start_collseq
= lookup_collation_sequence_value (start_elem
);
2821 end_collseq
= lookup_collation_sequence_value (end_elem
);
2822 /* Check start/end collation sequence values. */
2823 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2824 return REG_ECOLLATE
;
2825 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2828 /* Got valid collation sequence values, add them as a new entry.
2829 However, if we have no collation elements, and the character set
2830 is single byte, the single byte character set that we
2831 build below suffices. */
2832 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2834 /* Check the space of the arrays. */
2835 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2837 /* There is not enough space, need realloc. */
2838 uint32_t *new_array_start
;
2839 uint32_t *new_array_end
;
2842 /* +1 in case of mbcset->nranges is 0. */
2843 new_nranges
= 2 * mbcset
->nranges
+ 1;
2844 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2846 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2849 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2852 mbcset
->range_starts
= new_array_start
;
2853 mbcset
->range_ends
= new_array_end
;
2854 *range_alloc
= new_nranges
;
2857 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2858 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2861 /* Build the table for single byte characters. */
2862 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2864 uint32_t ch_collseq
;
2866 if (MB_CUR_MAX == 1)
2869 ch_collseq
= collseqmb
[ch
];
2871 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2872 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2873 bitset_set (sbcset
, ch
);
2878 /* Local function for parse_bracket_exp used in glibc.
2879 Build the collating element which is represented by NAME.
2880 The result are written to MBCSET and SBCSET.
2881 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2882 pointer argument sinse we may update it. */
2884 auto __inline__ reg_errcode_t
2885 __attribute ((always_inline
))
2886 build_collating_symbol (re_charset_t
*mbcset
,
2887 int *coll_sym_alloc
,
2889 const unsigned char *name
)
2892 size_t name_len
= strlen ((const char *) name
);
2895 elem
= seek_collating_symbol_entry (name
, name_len
);
2896 if (symb_table
[2 * elem
] != 0)
2898 /* We found the entry. */
2899 idx
= symb_table
[2 * elem
+ 1];
2900 /* Skip the name of collating element name. */
2901 idx
+= 1 + extra
[idx
];
2903 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2905 /* No valid character, treat it as a normal
2907 bitset_set (sbcset
, name
[0]);
2911 return REG_ECOLLATE
;
2913 /* Got valid collation sequence, add it as a new entry. */
2914 /* Check the space of the arrays. */
2915 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2917 /* Not enough, realloc it. */
2918 /* +1 in case of mbcset->ncoll_syms is 0. */
2919 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2920 /* Use realloc since mbcset->coll_syms is NULL
2922 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2923 new_coll_sym_alloc
);
2924 if (BE (new_coll_syms
== NULL
, 0))
2926 mbcset
->coll_syms
= new_coll_syms
;
2927 *coll_sym_alloc
= new_coll_sym_alloc
;
2929 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2934 if (BE (name_len
!= 1, 0))
2935 return REG_ECOLLATE
;
2938 bitset_set (sbcset
, name
[0]);
2945 re_token_t br_token
;
2946 re_bitset_ptr_t sbcset
;
2947 #ifdef RE_ENABLE_I18N
2948 re_charset_t
*mbcset
;
2949 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2950 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2953 bin_tree_t
*work_tree
;
2955 int first_round
= 1;
2957 collseqmb
= (const unsigned char *)
2958 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2959 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2965 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2966 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2967 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2968 _NL_COLLATE_SYMB_TABLEMB
);
2969 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2970 _NL_COLLATE_SYMB_EXTRAMB
);
2973 sbcset
= calloc (sizeof (bitset_t
), 1);
2974 #ifdef RE_ENABLE_I18N
2975 mbcset
= calloc (sizeof (re_charset_t
), 1);
2977 #ifdef RE_ENABLE_I18N
2978 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2980 if (BE (sbcset
== NULL
, 0))
2987 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2988 if (BE (token
->type
== END_OF_RE
, 0))
2991 goto parse_bracket_exp_free_return
;
2993 if (token
->type
== OP_NON_MATCH_LIST
)
2995 #ifdef RE_ENABLE_I18N
2996 mbcset
->non_match
= 1;
2999 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3000 bitset_set (sbcset
, '\0');
3001 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3002 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3003 if (BE (token
->type
== END_OF_RE
, 0))
3006 goto parse_bracket_exp_free_return
;
3010 /* We treat the first ']' as a normal character. */
3011 if (token
->type
== OP_CLOSE_BRACKET
)
3012 token
->type
= CHARACTER
;
3016 bracket_elem_t start_elem
, end_elem
;
3017 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3018 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3020 int token_len2
= 0, is_range_exp
= 0;
3023 start_elem
.opr
.name
= start_name_buf
;
3024 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3025 syntax
, first_round
);
3026 if (BE (ret
!= REG_NOERROR
, 0))
3029 goto parse_bracket_exp_free_return
;
3033 /* Get information about the next token. We need it in any case. */
3034 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3036 /* Do not check for ranges if we know they are not allowed. */
3037 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3039 if (BE (token
->type
== END_OF_RE
, 0))
3042 goto parse_bracket_exp_free_return
;
3044 if (token
->type
== OP_CHARSET_RANGE
)
3046 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3047 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3048 if (BE (token2
.type
== END_OF_RE
, 0))
3051 goto parse_bracket_exp_free_return
;
3053 if (token2
.type
== OP_CLOSE_BRACKET
)
3055 /* We treat the last '-' as a normal character. */
3056 re_string_skip_bytes (regexp
, -token_len
);
3057 token
->type
= CHARACTER
;
3064 if (is_range_exp
== 1)
3066 end_elem
.opr
.name
= end_name_buf
;
3067 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3069 if (BE (ret
!= REG_NOERROR
, 0))
3072 goto parse_bracket_exp_free_return
;
3075 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3078 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3079 &start_elem
, &end_elem
);
3081 # ifdef RE_ENABLE_I18N
3082 *err
= build_range_exp (sbcset
,
3083 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3084 &range_alloc
, &start_elem
, &end_elem
);
3086 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3089 if (BE (*err
!= REG_NOERROR
, 0))
3090 goto parse_bracket_exp_free_return
;
3094 switch (start_elem
.type
)
3097 bitset_set (sbcset
, start_elem
.opr
.ch
);
3099 #ifdef RE_ENABLE_I18N
3101 /* Check whether the array has enough space. */
3102 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3104 wchar_t *new_mbchars
;
3105 /* Not enough, realloc it. */
3106 /* +1 in case of mbcset->nmbchars is 0. */
3107 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3108 /* Use realloc since array is NULL if *alloc == 0. */
3109 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3111 if (BE (new_mbchars
== NULL
, 0))
3112 goto parse_bracket_exp_espace
;
3113 mbcset
->mbchars
= new_mbchars
;
3115 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3117 #endif /* RE_ENABLE_I18N */
3119 *err
= build_equiv_class (sbcset
,
3120 #ifdef RE_ENABLE_I18N
3121 mbcset
, &equiv_class_alloc
,
3123 start_elem
.opr
.name
);
3124 if (BE (*err
!= REG_NOERROR
, 0))
3125 goto parse_bracket_exp_free_return
;
3128 *err
= build_collating_symbol (sbcset
,
3129 #ifdef RE_ENABLE_I18N
3130 mbcset
, &coll_sym_alloc
,
3132 start_elem
.opr
.name
);
3133 if (BE (*err
!= REG_NOERROR
, 0))
3134 goto parse_bracket_exp_free_return
;
3137 *err
= build_charclass (regexp
->trans
, sbcset
,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset
, &char_class_alloc
,
3141 start_elem
.opr
.name
, syntax
);
3142 if (BE (*err
!= REG_NOERROR
, 0))
3143 goto parse_bracket_exp_free_return
;
3150 if (BE (token
->type
== END_OF_RE
, 0))
3153 goto parse_bracket_exp_free_return
;
3155 if (token
->type
== OP_CLOSE_BRACKET
)
3159 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3161 /* If it is non-matching list. */
3163 bitset_not (sbcset
);
3165 #ifdef RE_ENABLE_I18N
3166 /* Ensure only single byte characters are set. */
3167 if (dfa
->mb_cur_max
> 1)
3168 bitset_mask (sbcset
, dfa
->sb_char
);
3170 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3171 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3172 || mbcset
->non_match
)))
3174 bin_tree_t
*mbc_tree
;
3176 /* Build a tree for complex bracket. */
3177 dfa
->has_mb_node
= 1;
3178 br_token
.type
= COMPLEX_BRACKET
;
3179 br_token
.opr
.mbcset
= mbcset
;
3180 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3181 if (BE (mbc_tree
== NULL
, 0))
3182 goto parse_bracket_exp_espace
;
3183 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3184 if (sbcset
[sbc_idx
])
3186 /* If there are no bits set in sbcset, there is no point
3187 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3188 if (sbc_idx
< BITSET_WORDS
)
3190 /* Build a tree for simple bracket. */
3191 br_token
.type
= SIMPLE_BRACKET
;
3192 br_token
.opr
.sbcset
= sbcset
;
3193 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3194 if (BE (work_tree
== NULL
, 0))
3195 goto parse_bracket_exp_espace
;
3197 /* Then join them by ALT node. */
3198 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3199 if (BE (work_tree
== NULL
, 0))
3200 goto parse_bracket_exp_espace
;
3205 work_tree
= mbc_tree
;
3209 #endif /* not RE_ENABLE_I18N */
3211 #ifdef RE_ENABLE_I18N
3212 free_charset (mbcset
);
3214 /* Build a tree for simple bracket. */
3215 br_token
.type
= SIMPLE_BRACKET
;
3216 br_token
.opr
.sbcset
= sbcset
;
3217 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3218 if (BE (work_tree
== NULL
, 0))
3219 goto parse_bracket_exp_espace
;
3223 parse_bracket_exp_espace
:
3225 parse_bracket_exp_free_return
:
3227 #ifdef RE_ENABLE_I18N
3228 free_charset (mbcset
);
3233 /* Parse an element in the bracket expression. */
3235 static reg_errcode_t
3236 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3237 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3238 reg_syntax_t syntax
, int accept_hyphen
)
3240 #ifdef RE_ENABLE_I18N
3242 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3243 if (cur_char_size
> 1)
3245 elem
->type
= MB_CHAR
;
3246 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3247 re_string_skip_bytes (regexp
, cur_char_size
);
3250 #endif /* RE_ENABLE_I18N */
3251 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3252 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3253 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3254 return parse_bracket_symbol (elem
, regexp
, token
);
3255 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3257 /* A '-' must only appear as anything but a range indicator before
3258 the closing bracket. Everything else is an error. */
3260 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3261 if (token2
.type
!= OP_CLOSE_BRACKET
)
3262 /* The actual error value is not standardized since this whole
3263 case is undefined. But ERANGE makes good sense. */
3266 elem
->type
= SB_CHAR
;
3267 elem
->opr
.ch
= token
->opr
.c
;
3271 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3272 such as [:<character_class>:], [.<collating_element>.], and
3273 [=<equivalent_class>=]. */
3275 static reg_errcode_t
3276 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3279 unsigned char ch
, delim
= token
->opr
.c
;
3281 if (re_string_eoi(regexp
))
3285 if (i
>= BRACKET_NAME_BUF_SIZE
)
3287 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3288 ch
= re_string_fetch_byte_case (regexp
);
3290 ch
= re_string_fetch_byte (regexp
);
3291 if (re_string_eoi(regexp
))
3293 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3295 elem
->opr
.name
[i
] = ch
;
3297 re_string_skip_bytes (regexp
, 1);
3298 elem
->opr
.name
[i
] = '\0';
3299 switch (token
->type
)
3301 case OP_OPEN_COLL_ELEM
:
3302 elem
->type
= COLL_SYM
;
3304 case OP_OPEN_EQUIV_CLASS
:
3305 elem
->type
= EQUIV_CLASS
;
3307 case OP_OPEN_CHAR_CLASS
:
3308 elem
->type
= CHAR_CLASS
;
3316 /* Helper function for parse_bracket_exp.
3317 Build the equivalence class which is represented by NAME.
3318 The result are written to MBCSET and SBCSET.
3319 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3320 is a pointer argument sinse we may update it. */
3322 static reg_errcode_t
3323 #ifdef RE_ENABLE_I18N
3324 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3325 int *equiv_class_alloc
, const unsigned char *name
)
3327 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3331 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3334 const int32_t *table
, *indirect
;
3335 const unsigned char *weights
, *extra
, *cp
;
3336 unsigned char char_buf
[2];
3340 /* This #include defines a local function! */
3341 # include <locale/weight.h>
3342 /* Calculate the index for equivalence class. */
3344 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3345 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3346 _NL_COLLATE_WEIGHTMB
);
3347 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3348 _NL_COLLATE_EXTRAMB
);
3349 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3350 _NL_COLLATE_INDIRECTMB
);
3351 idx1
= findidx (&cp
);
3352 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3353 /* This isn't a valid character. */
3354 return REG_ECOLLATE
;
3356 /* Build single byte matcing table for this equivalence class. */
3357 char_buf
[1] = (unsigned char) '\0';
3358 len
= weights
[idx1
];
3359 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3363 idx2
= findidx (&cp
);
3368 /* This isn't a valid character. */
3370 if (len
== weights
[idx2
])
3373 while (cnt
<= len
&&
3374 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3378 bitset_set (sbcset
, ch
);
3381 /* Check whether the array has enough space. */
3382 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3384 /* Not enough, realloc it. */
3385 /* +1 in case of mbcset->nequiv_classes is 0. */
3386 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3387 /* Use realloc since the array is NULL if *alloc == 0. */
3388 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3390 new_equiv_class_alloc
);
3391 if (BE (new_equiv_classes
== NULL
, 0))
3393 mbcset
->equiv_classes
= new_equiv_classes
;
3394 *equiv_class_alloc
= new_equiv_class_alloc
;
3396 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3401 if (BE (strlen ((const char *) name
) != 1, 0))
3402 return REG_ECOLLATE
;
3403 bitset_set (sbcset
, *name
);
3408 /* Helper function for parse_bracket_exp.
3409 Build the character class which is represented by NAME.
3410 The result are written to MBCSET and SBCSET.
3411 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3412 is a pointer argument sinse we may update it. */
3414 static reg_errcode_t
3415 #ifdef RE_ENABLE_I18N
3416 build_charclass (__RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3417 re_charset_t
*mbcset
, int *char_class_alloc
,
3418 const unsigned char *class_name
, reg_syntax_t syntax
)
3420 build_charclass (__RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3421 const unsigned char *class_name
, reg_syntax_t syntax
)
3425 const char *name
= (const char *) class_name
;
3427 /* In case of REG_ICASE "upper" and "lower" match the both of
3428 upper and lower cases. */
3429 if ((syntax
& RE_ICASE
)
3430 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3433 #ifdef RE_ENABLE_I18N
3434 /* Check the space of the arrays. */
3435 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3437 /* Not enough, realloc it. */
3438 /* +1 in case of mbcset->nchar_classes is 0. */
3439 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3440 /* Use realloc since array is NULL if *alloc == 0. */
3441 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3442 new_char_class_alloc
);
3443 if (BE (new_char_classes
== NULL
, 0))
3445 mbcset
->char_classes
= new_char_classes
;
3446 *char_class_alloc
= new_char_class_alloc
;
3448 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3449 #endif /* RE_ENABLE_I18N */
3451 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3453 if (BE (trans != NULL, 0)) \
3455 for (i = 0; i < SBC_MAX; ++i) \
3456 if (ctype_func (i)) \
3457 bitset_set (sbcset, trans[i]); \
3461 for (i = 0; i < SBC_MAX; ++i) \
3462 if (ctype_func (i)) \
3463 bitset_set (sbcset, i); \
3467 if (strcmp (name
, "alnum") == 0)
3468 BUILD_CHARCLASS_LOOP (isalnum
);
3469 else if (strcmp (name
, "cntrl") == 0)
3470 BUILD_CHARCLASS_LOOP (iscntrl
);
3471 else if (strcmp (name
, "lower") == 0)
3472 BUILD_CHARCLASS_LOOP (islower
);
3473 else if (strcmp (name
, "space") == 0)
3474 BUILD_CHARCLASS_LOOP (isspace
);
3475 else if (strcmp (name
, "alpha") == 0)
3476 BUILD_CHARCLASS_LOOP (isalpha
);
3477 else if (strcmp (name
, "digit") == 0)
3478 BUILD_CHARCLASS_LOOP (isdigit
);
3479 else if (strcmp (name
, "print") == 0)
3480 BUILD_CHARCLASS_LOOP (isprint
);
3481 else if (strcmp (name
, "upper") == 0)
3482 BUILD_CHARCLASS_LOOP (isupper
);
3483 else if (strcmp (name
, "blank") == 0)
3484 BUILD_CHARCLASS_LOOP (isblank
);
3485 else if (strcmp (name
, "graph") == 0)
3486 BUILD_CHARCLASS_LOOP (isgraph
);
3487 else if (strcmp (name
, "punct") == 0)
3488 BUILD_CHARCLASS_LOOP (ispunct
);
3489 else if (strcmp (name
, "xdigit") == 0)
3490 BUILD_CHARCLASS_LOOP (isxdigit
);
3498 build_charclass_op (re_dfa_t
*dfa
, __RE_TRANSLATE_TYPE trans
,
3499 const unsigned char *class_name
,
3500 const unsigned char *extra
, int non_match
,
3503 re_bitset_ptr_t sbcset
;
3504 #ifdef RE_ENABLE_I18N
3505 re_charset_t
*mbcset
;
3509 re_token_t br_token
;
3512 sbcset
= calloc (sizeof (bitset_t
), 1);
3513 #ifdef RE_ENABLE_I18N
3514 mbcset
= calloc (sizeof (re_charset_t
), 1);
3517 #ifdef RE_ENABLE_I18N
3518 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3520 if (BE (sbcset
== NULL
, 0))
3529 #ifdef RE_ENABLE_I18N
3531 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3532 bitset_set(cset->sbcset, '\0');
3534 mbcset
->non_match
= 1;
3538 /* We don't care the syntax in this case. */
3539 ret
= build_charclass (trans
, sbcset
,
3540 #ifdef RE_ENABLE_I18N
3545 if (BE (ret
!= REG_NOERROR
, 0))
3548 #ifdef RE_ENABLE_I18N
3549 free_charset (mbcset
);
3554 /* \w match '_' also. */
3555 for (; *extra
; extra
++)
3556 bitset_set (sbcset
, *extra
);
3558 /* If it is non-matching list. */
3560 bitset_not (sbcset
);
3562 #ifdef RE_ENABLE_I18N
3563 /* Ensure only single byte characters are set. */
3564 if (dfa
->mb_cur_max
> 1)
3565 bitset_mask (sbcset
, dfa
->sb_char
);
3568 /* Build a tree for simple bracket. */
3569 br_token
.type
= SIMPLE_BRACKET
;
3570 br_token
.opr
.sbcset
= sbcset
;
3571 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3572 if (BE (tree
== NULL
, 0))
3573 goto build_word_op_espace
;
3575 #ifdef RE_ENABLE_I18N
3576 if (dfa
->mb_cur_max
> 1)
3578 bin_tree_t
*mbc_tree
;
3579 /* Build a tree for complex bracket. */
3580 br_token
.type
= COMPLEX_BRACKET
;
3581 br_token
.opr
.mbcset
= mbcset
;
3582 dfa
->has_mb_node
= 1;
3583 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3584 if (BE (mbc_tree
== NULL
, 0))
3585 goto build_word_op_espace
;
3586 /* Then join them by ALT node. */
3587 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3588 if (BE (mbc_tree
!= NULL
, 1))
3593 free_charset (mbcset
);
3596 #else /* not RE_ENABLE_I18N */
3600 build_word_op_espace
:
3602 #ifdef RE_ENABLE_I18N
3603 free_charset (mbcset
);
3609 /* This is intended for the expressions like "a{1,3}".
3610 Fetch a number from `input', and return the number.
3611 Return -1, if the number field is empty like "{,1}".
3612 Return -2, If an error is occured. */
3615 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3621 fetch_token (token
, input
, syntax
);
3623 if (BE (token
->type
== END_OF_RE
, 0))
3625 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3627 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3628 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3629 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3634 #ifdef RE_ENABLE_I18N
3636 free_charset (re_charset_t
*cset
)
3638 re_free (cset
->mbchars
);
3640 re_free (cset
->coll_syms
);
3641 re_free (cset
->equiv_classes
);
3642 re_free (cset
->range_starts
);
3643 re_free (cset
->range_ends
);
3645 re_free (cset
->char_classes
);
3648 #endif /* RE_ENABLE_I18N */
3650 /* Functions for binary tree operation. */
3652 /* Create a tree node. */
3655 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3656 re_token_type_t type
)
3660 return create_token_tree (dfa
, left
, right
, &t
);
3664 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3665 const re_token_t
*token
)
3668 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3670 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3672 if (storage
== NULL
)
3674 storage
->next
= dfa
->str_tree_storage
;
3675 dfa
->str_tree_storage
= storage
;
3676 dfa
->str_tree_storage_idx
= 0;
3678 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3680 tree
->parent
= NULL
;
3682 tree
->right
= right
;
3683 tree
->token
= *token
;
3684 tree
->token
.duplicated
= 0;
3685 tree
->token
.opt_subexp
= 0;
3688 tree
->node_idx
= -1;
3691 left
->parent
= tree
;
3693 right
->parent
= tree
;
3697 /* Mark the tree SRC as an optional subexpression.
3698 To be called from preorder or postorder. */
3700 static reg_errcode_t
3701 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3703 int idx
= (int) (long) extra
;
3704 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3705 node
->token
.opt_subexp
= 1;
3710 /* Free the allocated memory inside NODE. */
3713 free_token (re_token_t
*node
)
3715 #ifdef RE_ENABLE_I18N
3716 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3717 free_charset (node
->opr
.mbcset
);
3720 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3721 re_free (node
->opr
.sbcset
);
3724 /* Worker function for tree walking. Free the allocated memory inside NODE
3725 and its children. */
3727 static reg_errcode_t
3728 free_tree (void *extra
, bin_tree_t
*node
)
3730 free_token (&node
->token
);
3735 /* Duplicate the node SRC, and return new node. This is a preorder
3736 visit similar to the one implemented by the generic visitor, but
3737 we need more infrastructure to maintain two parallel trees --- so,
3738 it's easier to duplicate. */
3741 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3743 const bin_tree_t
*node
;
3744 bin_tree_t
*dup_root
;
3745 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3747 for (node
= root
; ; )
3749 /* Create a new tree and link it back to the current parent. */
3750 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3753 (*p_new
)->parent
= dup_node
;
3754 (*p_new
)->token
.duplicated
= 1;
3757 /* Go to the left node, or up and to the right. */
3761 p_new
= &dup_node
->left
;
3765 const bin_tree_t
*prev
= NULL
;
3766 while (node
->right
== prev
|| node
->right
== NULL
)
3769 node
= node
->parent
;
3770 dup_node
= dup_node
->parent
;
3775 p_new
= &dup_node
->right
;