1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
preorder (bin_tree_t
*root
,
38 reg_errcode_t (fn (void *, bin_tree_t
*)),
40 static reg_errcode_t
postorder (bin_tree_t
*root
,
41 reg_errcode_t (fn (void *, bin_tree_t
*)),
43 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
44 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
45 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
47 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
51 int top_clone_node
, int root_node
,
52 unsigned int constraint
);
53 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
54 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
55 unsigned int constraint
);
56 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
57 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
59 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
60 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
62 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
64 static int peek_token (re_token_t
*token
, re_string_t
*input
,
66 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
68 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
69 reg_syntax_t syntax
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
80 re_token_t
*token
, reg_syntax_t syntax
,
81 int nest
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
83 re_dfa_t
*dfa
, re_token_t
*token
,
84 reg_syntax_t syntax
, reg_errcode_t
*err
);
85 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
86 re_token_t
*token
, reg_syntax_t syntax
,
88 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
90 re_token_t
*token
, int token_len
,
94 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
100 re_charset_t
*mbcset
, int *range_alloc
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 re_charset_t
*mbcset
,
106 const unsigned char *name
);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
109 bracket_elem_t
*start_elem
,
110 bracket_elem_t
*end_elem
);
111 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
112 const unsigned char *name
);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
117 re_charset_t
*mbcset
,
118 int *equiv_class_alloc
,
119 const unsigned char *name
);
120 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
122 re_charset_t
*mbcset
,
123 int *char_class_alloc
,
124 const unsigned char *class_name
,
125 reg_syntax_t syntax
);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
128 const unsigned char *name
);
129 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
131 const unsigned char *class_name
,
132 reg_syntax_t syntax
);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
135 RE_TRANSLATE_TYPE trans
,
136 const unsigned char *class_name
,
137 const unsigned char *extra
,
138 int non_match
, reg_errcode_t
*err
);
139 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
140 bin_tree_t
*left
, bin_tree_t
*right
,
141 re_token_type_t type
);
142 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
143 bin_tree_t
*left
, bin_tree_t
*right
,
144 const re_token_t
*token
);
145 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
146 static void free_token (re_token_t
*node
);
147 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
148 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid
[] attribute_hidden
=
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx
[] attribute_hidden
=
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (pattern
, length
, bufp
)
243 struct re_pattern_buffer
*bufp
;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
252 /* Match anchors at newline. */
253 bufp
->newline_anchor
= 1;
255 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
259 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
262 weak_alias (__re_compile_pattern
, re_compile_pattern
)
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options
;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
281 re_set_syntax (syntax
)
284 reg_syntax_t ret
= re_syntax_options
;
286 re_syntax_options
= syntax
;
290 weak_alias (__re_set_syntax
, re_set_syntax
)
294 re_compile_fastmap (bufp
)
295 struct re_pattern_buffer
*bufp
;
297 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
298 char *fastmap
= bufp
->fastmap
;
300 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
301 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
302 if (dfa
->init_state
!= dfa
->init_state_word
)
303 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_nl
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
308 bufp
->fastmap_accurate
= 1;
312 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
316 __attribute ((always_inline
))
317 re_set_fastmap (char *fastmap
, int icase
, int ch
)
321 fastmap
[tolower (ch
)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
328 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
330 const re_dfastate_t
*init_state
;
333 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
335 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
336 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
338 int node
= init_state
->nodes
.elems
[node_cnt
];
339 re_token_type_t type
= dfa
->nodes
[node
].type
;
341 if (type
== CHARACTER
)
343 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
347 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
352 *p
++ = dfa
->nodes
[node
].opr
.c
;
353 while (++node
< dfa
->nodes_len
354 && dfa
->nodes
[node
].type
== CHARACTER
355 && dfa
->nodes
[node
].mb_partial
)
356 *p
++ = dfa
->nodes
[node
].opr
.c
;
357 memset (&state
, '\0', sizeof (state
));
358 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
360 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
362 re_set_fastmap (fastmap
, 0, buf
[0]);
366 else if (type
== SIMPLE_BRACKET
)
369 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
372 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
373 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
374 if (w
& ((bitset_word_t
) 1 << j
))
375 re_set_fastmap (fastmap
, icase
, ch
);
378 #ifdef RE_ENABLE_I18N
379 else if (type
== COMPLEX_BRACKET
)
382 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
383 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
384 || cset
->nranges
|| cset
->nchar_classes
)
387 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
389 /* In this case we want to catch the bytes which are
390 the first byte of any collation elements.
391 e.g. In da_DK, we want to catch 'a' since "aa"
392 is a valid collation element, and don't catch
393 'b' since 'b' is the only collation element
394 which starts from 'b'. */
395 const int32_t *table
= (const int32_t *)
396 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
397 for (i
= 0; i
< SBC_MAX
; ++i
)
399 re_set_fastmap (fastmap
, icase
, i
);
402 if (dfa
->mb_cur_max
> 1)
403 for (i
= 0; i
< SBC_MAX
; ++i
)
404 if (__btowc (i
) == WEOF
)
405 re_set_fastmap (fastmap
, icase
, i
);
406 # endif /* not _LIBC */
408 for (i
= 0; i
< cset
->nmbchars
; ++i
)
412 memset (&state
, '\0', sizeof (state
));
413 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
414 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
415 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
417 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
419 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
423 #endif /* RE_ENABLE_I18N */
424 else if (type
== OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 || type
== OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 || type
== END_OF_RE
)
430 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
431 if (type
== END_OF_RE
)
432 bufp
->can_be_null
= 1;
438 /* Entry point for POSIX code. */
439 /* regcomp takes a regular expression as a string and compiles it.
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
444 `buffer' to the compiled pattern;
445 `used' to the length of the compiled pattern;
446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 `fastmap' to an allocated space for the fastmap;
451 `fastmap_accurate' to zero;
452 `re_nsub' to the number of subexpressions in PATTERN.
454 PATTERN is the address of the pattern string.
456 CFLAGS is a series of bits which affect compilation.
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
475 regcomp (preg
, pattern
, cflags
)
476 regex_t
*__restrict preg
;
477 const char *__restrict pattern
;
481 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC
);
488 /* Try to allocate space for the fastmap. */
489 preg
->fastmap
= re_malloc (char, SBC_MAX
);
490 if (BE (preg
->fastmap
== NULL
, 0))
493 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags
& REG_NEWLINE
)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax
&= ~RE_DOT_NEWLINE
;
499 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
500 /* It also changes the matching behavior. */
501 preg
->newline_anchor
= 1;
504 preg
->newline_anchor
= 0;
505 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
506 preg
->translate
= NULL
;
508 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret
== REG_ERPAREN
)
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret
== REG_NOERROR
, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg
);
522 /* Some error occurred while compiling the expression. */
523 re_free (preg
->fastmap
);
524 preg
->fastmap
= NULL
;
530 weak_alias (__regcomp
, regcomp
)
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode
, preg
, errbuf
, errbuf_size
)
539 const regex_t
*__restrict preg
;
540 char *__restrict errbuf
;
547 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
548 / sizeof (__re_error_msgid_idx
[0])), 0))
549 /* Only error codes returned by the rest of the code should be passed
550 to this routine. If we are given anything else, or if other regex
551 code generates an invalid error code, then the program has a bug.
552 Dump core so we can fix it. */
555 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
557 msg_size
= strlen (msg
) + 1; /* Includes the null. */
559 if (BE (errbuf_size
!= 0, 1))
561 if (BE (msg_size
> errbuf_size
, 0))
563 #if defined HAVE_MEMPCPY || defined _LIBC
564 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
566 memcpy (errbuf
, msg
, errbuf_size
- 1);
567 errbuf
[errbuf_size
- 1] = 0;
571 memcpy (errbuf
, msg
, msg_size
);
577 weak_alias (__regerror
, regerror
)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map
=
588 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
595 free_dfa_content (re_dfa_t
*dfa
)
600 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
601 free_token (dfa
->nodes
+ i
);
602 re_free (dfa
->nexts
);
603 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
605 if (dfa
->eclosures
!= NULL
)
606 re_node_set_free (dfa
->eclosures
+ i
);
607 if (dfa
->inveclosures
!= NULL
)
608 re_node_set_free (dfa
->inveclosures
+ i
);
609 if (dfa
->edests
!= NULL
)
610 re_node_set_free (dfa
->edests
+ i
);
612 re_free (dfa
->edests
);
613 re_free (dfa
->eclosures
);
614 re_free (dfa
->inveclosures
);
615 re_free (dfa
->nodes
);
617 if (dfa
->state_table
)
618 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
620 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
621 for (j
= 0; j
< entry
->num
; ++j
)
623 re_dfastate_t
*state
= entry
->array
[j
];
626 re_free (entry
->array
);
628 re_free (dfa
->state_table
);
629 #ifdef RE_ENABLE_I18N
630 if (dfa
->sb_char
!= utf8_sb_map
)
631 re_free (dfa
->sb_char
);
633 re_free (dfa
->subexp_map
);
635 re_free (dfa
->re_str
);
642 /* Free dynamically allocated space used by PREG. */
648 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
649 if (BE (dfa
!= NULL
, 1))
650 free_dfa_content (dfa
);
654 re_free (preg
->fastmap
);
655 preg
->fastmap
= NULL
;
657 re_free (preg
->translate
);
658 preg
->translate
= NULL
;
661 weak_alias (__regfree
, regfree
)
664 /* Entry points compatible with 4.2 BSD regex library. We don't define
665 them unless specifically requested. */
667 #if defined _REGEX_RE_COMP || defined _LIBC
669 /* BSD has one and only one pattern buffer. */
670 static struct re_pattern_buffer re_comp_buf
;
674 /* Make these definitions weak in libc, so POSIX programs can redefine
675 these names if they don't use our functions, and still use
676 regcomp/regexec above without link errors. */
687 if (!re_comp_buf
.buffer
)
688 return gettext ("No previous regular expression");
692 if (re_comp_buf
.buffer
)
694 fastmap
= re_comp_buf
.fastmap
;
695 re_comp_buf
.fastmap
= NULL
;
696 __regfree (&re_comp_buf
);
697 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
698 re_comp_buf
.fastmap
= fastmap
;
701 if (re_comp_buf
.fastmap
== NULL
)
703 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
704 if (re_comp_buf
.fastmap
== NULL
)
705 return (char *) gettext (__re_error_msgid
706 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
709 /* Since `re_exec' always passes NULL for the `regs' argument, we
710 don't need to initialize the pattern buffer fields which affect it. */
712 /* Match anchors at newlines. */
713 re_comp_buf
.newline_anchor
= 1;
715 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
720 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
721 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
725 libc_freeres_fn (free_mem
)
727 __regfree (&re_comp_buf
);
731 #endif /* _REGEX_RE_COMP */
733 /* Internal entry point.
734 Compile the regular expression PATTERN, whose length is LENGTH.
735 SYNTAX indicate regular expression's syntax. */
738 re_compile_internal (preg
, pattern
, length
, syntax
)
740 const char * pattern
;
744 reg_errcode_t err
= REG_NOERROR
;
748 /* Initialize the pattern buffer. */
749 preg
->fastmap_accurate
= 0;
750 preg
->syntax
= syntax
;
751 preg
->not_bol
= preg
->not_eol
= 0;
754 preg
->can_be_null
= 0;
755 preg
->regs_allocated
= REGS_UNALLOCATED
;
757 /* Initialize the dfa. */
758 dfa
= (re_dfa_t
*) preg
->buffer
;
759 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
761 /* If zero allocated, but buffer is non-null, try to realloc
762 enough space. This loses if buffer's address is bogus, but
763 that is the user's responsibility. If ->buffer is NULL this
764 is a simple allocation. */
765 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
768 preg
->allocated
= sizeof (re_dfa_t
);
769 preg
->buffer
= (unsigned char *) dfa
;
771 preg
->used
= sizeof (re_dfa_t
);
773 err
= init_dfa (dfa
, length
);
774 if (BE (err
!= REG_NOERROR
, 0))
776 free_dfa_content (dfa
);
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa
->re_str
= re_malloc (char, length
+ 1);
784 strncpy (dfa
->re_str
, pattern
, length
+ 1);
787 __libc_lock_init (dfa
->lock
);
789 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
790 syntax
& RE_ICASE
, dfa
);
791 if (BE (err
!= REG_NOERROR
, 0))
793 re_compile_internal_free_return
:
794 free_workarea_compile (preg
);
795 re_string_destruct (®exp
);
796 free_dfa_content (dfa
);
802 /* Parse the regular expression, and build a structure tree. */
804 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
805 if (BE (dfa
->str_tree
== NULL
, 0))
806 goto re_compile_internal_free_return
;
808 /* Analyze the tree and create the nfa. */
809 err
= analyze (preg
);
810 if (BE (err
!= REG_NOERROR
, 0))
811 goto re_compile_internal_free_return
;
813 #ifdef RE_ENABLE_I18N
814 /* If possible, do searching in single byte encoding to speed things up. */
815 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
819 /* Then create the initial state of the dfa. */
820 err
= create_initial_state (dfa
);
822 /* Release work areas. */
823 free_workarea_compile (preg
);
824 re_string_destruct (®exp
);
826 if (BE (err
!= REG_NOERROR
, 0))
828 free_dfa_content (dfa
);
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
840 init_dfa (dfa
, pat_len
)
844 unsigned int table_size
;
849 memset (dfa
, '\0', sizeof (re_dfa_t
));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
854 /* Avoid overflows. */
855 if (pat_len
== SIZE_MAX
)
858 dfa
->nodes_alloc
= pat_len
+ 1;
859 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size
= 1; ; table_size
<<= 1)
863 if (table_size
> pat_len
)
866 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
867 dfa
->state_hash_mask
= table_size
- 1;
869 dfa
->mb_cur_max
= MB_CUR_MAX
;
871 if (dfa
->mb_cur_max
== 6
872 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
874 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
877 # ifdef HAVE_LANGINFO_CODESET
878 codeset_name
= nl_langinfo (CODESET
);
880 codeset_name
= getenv ("LC_ALL");
881 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
882 codeset_name
= getenv ("LC_CTYPE");
883 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
884 codeset_name
= getenv ("LANG");
885 if (codeset_name
== NULL
)
887 else if (strchr (codeset_name
, '.') != NULL
)
888 codeset_name
= strchr (codeset_name
, '.') + 1;
891 if (strcasecmp (codeset_name
, "UTF-8") == 0
892 || strcasecmp (codeset_name
, "UTF8") == 0)
895 /* We check exhaustively in the loop below if this charset is a
896 superset of ASCII. */
897 dfa
->map_notascii
= 0;
900 #ifdef RE_ENABLE_I18N
901 if (dfa
->mb_cur_max
> 1)
904 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
909 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
910 if (BE (dfa
->sb_char
== NULL
, 0))
913 /* Set the bits corresponding to single byte chars. */
914 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
915 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
917 wint_t wch
= __btowc (ch
);
919 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
921 if (isascii (ch
) && wch
!= ch
)
922 dfa
->map_notascii
= 1;
929 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
943 dfa
->word_ops_used
= 1;
944 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
945 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
946 if (isalnum (ch
) || ch
== '_')
947 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
950 /* Free the work area which are only used while compiling. */
953 free_workarea_compile (preg
)
956 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
957 bin_tree_storage_t
*storage
, *next
;
958 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
960 next
= storage
->next
;
963 dfa
->str_tree_storage
= NULL
;
964 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
965 dfa
->str_tree
= NULL
;
966 re_free (dfa
->org_indices
);
967 dfa
->org_indices
= NULL
;
970 /* Create initial states for all contexts. */
973 create_initial_state (dfa
)
978 re_node_set init_nodes
;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first
= dfa
->str_tree
->first
->node_idx
;
983 dfa
->init_node
= first
;
984 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
985 if (BE (err
!= REG_NOERROR
, 0))
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa
->nbackref
> 0)
993 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
995 int node_idx
= init_nodes
.elems
[i
];
996 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
999 if (type
!= OP_BACK_REF
)
1001 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1003 re_token_t
*clexp_node
;
1004 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1005 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1006 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1009 if (clexp_idx
== init_nodes
.nelem
)
1012 if (type
== OP_BACK_REF
)
1014 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1015 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1017 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa
->init_state
== NULL
, 0))
1028 if (dfa
->init_state
->has_constraint
)
1030 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1032 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1034 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1038 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1039 || dfa
->init_state_begbuf
== NULL
, 0))
1043 dfa
->init_state_word
= dfa
->init_state_nl
1044 = dfa
->init_state_begbuf
= dfa
->init_state
;
1046 re_node_set_free (&init_nodes
);
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1059 int node
, i
, mb_chars
= 0, has_period
= 0;
1061 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1062 switch (dfa
->nodes
[node
].type
)
1065 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1069 switch (dfa
->nodes
[node
].opr
.idx
)
1077 /* Word anchors etc. cannot be handled. */
1087 case OP_DUP_ASTERISK
:
1088 case OP_OPEN_SUBEXP
:
1089 case OP_CLOSE_SUBEXP
:
1091 case COMPLEX_BRACKET
:
1093 case SIMPLE_BRACKET
:
1094 /* Just double check. The non-ASCII range starts at 0x80. */
1095 assert (0x80 % BITSET_WORD_BITS
== 0);
1096 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1097 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1104 if (mb_chars
|| has_period
)
1105 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1107 if (dfa
->nodes
[node
].type
== CHARACTER
1108 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1109 dfa
->nodes
[node
].mb_partial
= 0;
1110 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1111 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1114 /* The search can be in single byte locale. */
1115 dfa
->mb_cur_max
= 1;
1117 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122 "eclosure", and "inveclosure". */
1124 static reg_errcode_t
1128 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1131 /* Allocate arrays. */
1132 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1133 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1134 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1135 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1136 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1137 || dfa
->eclosures
== NULL
, 0))
1140 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1141 if (dfa
->subexp_map
!= NULL
)
1144 for (i
= 0; i
< preg
->re_nsub
; i
++)
1145 dfa
->subexp_map
[i
] = i
;
1146 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1147 for (i
= 0; i
< preg
->re_nsub
; i
++)
1148 if (dfa
->subexp_map
[i
] != i
)
1150 if (i
== preg
->re_nsub
)
1152 free (dfa
->subexp_map
);
1153 dfa
->subexp_map
= NULL
;
1157 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1158 if (BE (ret
!= REG_NOERROR
, 0))
1160 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1161 if (BE (ret
!= REG_NOERROR
, 0))
1163 preorder (dfa
->str_tree
, calc_next
, dfa
);
1164 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1165 if (BE (ret
!= REG_NOERROR
, 0))
1167 ret
= calc_eclosure (dfa
);
1168 if (BE (ret
!= REG_NOERROR
, 0))
1171 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1172 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1173 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1176 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1177 if (BE (dfa
->inveclosures
== NULL
, 0))
1179 ret
= calc_inveclosure (dfa
);
1185 /* Our parse trees are very unbalanced, so we cannot use a stack to
1186 implement parse tree visits. Instead, we use parent pointers and
1187 some hairy code in these two functions. */
1188 static reg_errcode_t
1189 postorder (root
, fn
, extra
)
1191 reg_errcode_t (fn (void *, bin_tree_t
*));
1194 bin_tree_t
*node
, *prev
;
1196 for (node
= root
; ; )
1198 /* Descend down the tree, preferably to the left (or to the right
1199 if that's the only child). */
1200 while (node
->left
|| node
->right
)
1208 reg_errcode_t err
= fn (extra
, node
);
1209 if (BE (err
!= REG_NOERROR
, 0))
1211 if (node
->parent
== NULL
)
1214 node
= node
->parent
;
1216 /* Go up while we have a node that is reached from the right. */
1217 while (node
->right
== prev
|| node
->right
== NULL
);
1222 static reg_errcode_t
1223 preorder (root
, fn
, extra
)
1225 reg_errcode_t (fn (void *, bin_tree_t
*));
1230 for (node
= root
; ; )
1232 reg_errcode_t err
= fn (extra
, node
);
1233 if (BE (err
!= REG_NOERROR
, 0))
1236 /* Go to the left node, or up and to the right. */
1241 bin_tree_t
*prev
= NULL
;
1242 while (node
->right
== prev
|| node
->right
== NULL
)
1245 node
= node
->parent
;
1254 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1255 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1256 backreferences as well. Requires a preorder visit. */
1257 static reg_errcode_t
1258 optimize_subexps (extra
, node
)
1262 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1264 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1266 int idx
= node
->token
.opr
.idx
;
1267 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1268 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1271 else if (node
->token
.type
== SUBEXP
1272 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1274 int other_idx
= node
->left
->token
.opr
.idx
;
1276 node
->left
= node
->left
->left
;
1278 node
->left
->parent
= node
;
1280 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1281 if (other_idx
< BITSET_WORD_BITS
)
1282 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1288 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1289 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1290 static reg_errcode_t
1291 lower_subexps (extra
, node
)
1295 regex_t
*preg
= (regex_t
*) extra
;
1296 reg_errcode_t err
= REG_NOERROR
;
1298 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1300 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1302 node
->left
->parent
= node
;
1304 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1306 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1308 node
->right
->parent
= node
;
1315 lower_subexp (err
, preg
, node
)
1320 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1321 bin_tree_t
*body
= node
->left
;
1322 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1325 /* We do not optimize empty subexpressions, because otherwise we may
1326 have bad CONCAT nodes with NULL children. This is obviously not
1327 very common, so we do not lose much. An example that triggers
1328 this case is the sed "script" /\(\)/x. */
1329 && node
->left
!= NULL
1330 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1331 || !(dfa
->used_bkref_map
1332 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1335 /* Convert the SUBEXP node to the concatenation of an
1336 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1337 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1338 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1339 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1340 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1341 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1347 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1348 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1352 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353 nodes. Requires a postorder visit. */
1354 static reg_errcode_t
1355 calc_first (extra
, node
)
1359 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1360 if (node
->token
.type
== CONCAT
)
1362 node
->first
= node
->left
->first
;
1363 node
->node_idx
= node
->left
->node_idx
;
1368 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1369 if (BE (node
->node_idx
== -1, 0))
1375 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1376 static reg_errcode_t
1377 calc_next (extra
, node
)
1381 switch (node
->token
.type
)
1383 case OP_DUP_ASTERISK
:
1384 node
->left
->next
= node
;
1387 node
->left
->next
= node
->right
->first
;
1388 node
->right
->next
= node
->next
;
1392 node
->left
->next
= node
->next
;
1394 node
->right
->next
= node
->next
;
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1401 static reg_errcode_t
1402 link_nfa_nodes (extra
, node
)
1406 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1407 int idx
= node
->node_idx
;
1408 reg_errcode_t err
= REG_NOERROR
;
1410 switch (node
->token
.type
)
1416 assert (node
->next
== NULL
);
1419 case OP_DUP_ASTERISK
:
1423 dfa
->has_plural_match
= 1;
1424 if (node
->left
!= NULL
)
1425 left
= node
->left
->first
->node_idx
;
1427 left
= node
->next
->node_idx
;
1428 if (node
->right
!= NULL
)
1429 right
= node
->right
->first
->node_idx
;
1431 right
= node
->next
->node_idx
;
1433 assert (right
> -1);
1434 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1439 case OP_OPEN_SUBEXP
:
1440 case OP_CLOSE_SUBEXP
:
1441 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1445 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1446 if (node
->token
.type
== OP_BACK_REF
)
1447 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1451 assert (!IS_EPSILON_NODE (node
->token
.type
));
1452 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461 to their own constraint. */
1463 static reg_errcode_t
1464 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1467 int top_org_node
, top_clone_node
, root_node
;
1468 unsigned int init_constraint
;
1470 int org_node
, clone_node
, ret
;
1471 unsigned int constraint
= init_constraint
;
1472 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1474 int org_dest
, clone_dest
;
1475 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1477 /* If the back reference epsilon-transit, its destination must
1478 also have the constraint. Then duplicate the epsilon closure
1479 of the destination of the back reference, and store it in
1480 edests of the back reference. */
1481 org_dest
= dfa
->nexts
[org_node
];
1482 re_node_set_empty (dfa
->edests
+ clone_node
);
1483 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1484 if (BE (clone_dest
== -1, 0))
1486 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1487 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1488 if (BE (ret
< 0, 0))
1491 else if (dfa
->edests
[org_node
].nelem
== 0)
1493 /* In case of the node can't epsilon-transit, don't duplicate the
1494 destination and store the original destination as the
1495 destination of the node. */
1496 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1499 else if (dfa
->edests
[org_node
].nelem
== 1)
1501 /* In case of the node can epsilon-transit, and it has only one
1503 org_dest
= dfa
->edests
[org_node
].elems
[0];
1504 re_node_set_empty (dfa
->edests
+ clone_node
);
1505 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1507 /* In case of the node has another constraint, append it. */
1508 if (org_node
== root_node
&& clone_node
!= org_node
)
1510 /* ...but if the node is root_node itself, it means the
1511 epsilon closure have a loop, then tie it to the
1512 destination of the root_node. */
1513 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1515 if (BE (ret
< 0, 0))
1519 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1521 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1522 if (BE (clone_dest
== -1, 0))
1524 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1525 if (BE (ret
< 0, 0))
1528 else /* dfa->edests[org_node].nelem == 2 */
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest
= dfa
->edests
[org_node
].elems
[0];
1533 re_node_set_empty (dfa
->edests
+ clone_node
);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1536 if (clone_dest
== -1)
1538 /* There are no such a duplicated node, create a new one. */
1540 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1541 if (BE (clone_dest
== -1, 0))
1543 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1544 if (BE (ret
< 0, 0))
1546 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1547 root_node
, constraint
);
1548 if (BE (err
!= REG_NOERROR
, 0))
1553 /* There are a duplicated node which satisfy the constraint,
1554 use it to avoid infinite loop. */
1555 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1556 if (BE (ret
< 0, 0))
1560 org_dest
= dfa
->edests
[org_node
].elems
[1];
1561 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1562 if (BE (clone_dest
== -1, 0))
1564 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1565 if (BE (ret
< 0, 0))
1568 org_node
= org_dest
;
1569 clone_node
= clone_dest
;
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1578 search_duplicated_node (dfa
, org_node
, constraint
)
1579 const re_dfa_t
*dfa
;
1581 unsigned int constraint
;
1584 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1586 if (org_node
== dfa
->org_indices
[idx
]
1587 && constraint
== dfa
->nodes
[idx
].constraint
)
1588 return idx
; /* Found. */
1590 return -1; /* Not found. */
1593 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594 Return the index of the new node, or -1 if insufficient storage is
1598 duplicate_node (dfa
, org_idx
, constraint
)
1601 unsigned int constraint
;
1603 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1604 if (BE (dup_idx
!= -1, 1))
1606 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1607 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1608 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1609 dfa
->nodes
[dup_idx
].duplicated
= 1;
1611 /* Store the index of the original node. */
1612 dfa
->org_indices
[dup_idx
] = org_idx
;
1617 static reg_errcode_t
1618 calc_inveclosure (dfa
)
1622 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1623 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1625 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1627 int *elems
= dfa
->eclosures
[src
].elems
;
1628 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1630 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1631 if (BE (ret
== -1, 0))
1639 /* Calculate "eclosure" for all the node in DFA. */
1641 static reg_errcode_t
1645 int node_idx
, incomplete
;
1647 assert (dfa
->nodes_len
> 0);
1650 /* For each nodes, calculate epsilon closure. */
1651 for (node_idx
= 0; ; ++node_idx
)
1654 re_node_set eclosure_elem
;
1655 if (node_idx
== dfa
->nodes_len
)
1664 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1667 /* If we have already calculated, skip it. */
1668 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1670 /* Calculate epsilon closure of `node_idx'. */
1671 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1672 if (BE (err
!= REG_NOERROR
, 0))
1675 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1678 re_node_set_free (&eclosure_elem
);
1684 /* Calculate epsilon closure of NODE. */
1686 static reg_errcode_t
1687 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1688 re_node_set
*new_set
;
1693 unsigned int constraint
;
1695 re_node_set eclosure
;
1697 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1698 if (BE (err
!= REG_NOERROR
, 0))
1701 /* This indicates that we are calculating this node now.
1702 We reference this value to avoid infinite loop. */
1703 dfa
->eclosures
[node
].nelem
= -1;
1705 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1706 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1707 /* If the current node has constraints, duplicate all nodes.
1708 Since they must inherit the constraints. */
1710 && dfa
->edests
[node
].nelem
1711 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1713 int org_node
, cur_node
;
1714 org_node
= cur_node
= node
;
1715 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1716 if (BE (err
!= REG_NOERROR
, 0))
1720 /* Expand each epsilon destination nodes. */
1721 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1722 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1724 re_node_set eclosure_elem
;
1725 int edest
= dfa
->edests
[node
].elems
[i
];
1726 /* If calculating the epsilon closure of `edest' is in progress,
1727 return intermediate result. */
1728 if (dfa
->eclosures
[edest
].nelem
== -1)
1733 /* If we haven't calculated the epsilon closure of `edest' yet,
1734 calculate now. Otherwise use calculated epsilon closure. */
1735 if (dfa
->eclosures
[edest
].nelem
== 0)
1737 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1738 if (BE (err
!= REG_NOERROR
, 0))
1742 eclosure_elem
= dfa
->eclosures
[edest
];
1743 /* Merge the epsilon closure of `edest'. */
1744 re_node_set_merge (&eclosure
, &eclosure_elem
);
1745 /* If the epsilon closure of `edest' is incomplete,
1746 the epsilon closure of this node is also incomplete. */
1747 if (dfa
->eclosures
[edest
].nelem
== 0)
1750 re_node_set_free (&eclosure_elem
);
1754 /* Epsilon closures include itself. */
1755 re_node_set_insert (&eclosure
, node
);
1756 if (incomplete
&& !root
)
1757 dfa
->eclosures
[node
].nelem
= 0;
1759 dfa
->eclosures
[node
] = eclosure
;
1760 *new_set
= eclosure
;
1764 /* Functions for token which are used in the parser. */
1766 /* Fetch a token from INPUT.
1767 We must not use this function inside bracket expressions. */
1770 fetch_token (result
, input
, syntax
)
1773 reg_syntax_t syntax
;
1775 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1778 /* Peek a token from INPUT, and return the length of the token.
1779 We must not use this function inside bracket expressions. */
1782 peek_token (token
, input
, syntax
)
1785 reg_syntax_t syntax
;
1789 if (re_string_eoi (input
))
1791 token
->type
= END_OF_RE
;
1795 c
= re_string_peek_byte (input
, 0);
1798 token
->word_char
= 0;
1799 #ifdef RE_ENABLE_I18N
1800 token
->mb_partial
= 0;
1801 if (input
->mb_cur_max
> 1 &&
1802 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1804 token
->type
= CHARACTER
;
1805 token
->mb_partial
= 1;
1812 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1814 token
->type
= BACK_SLASH
;
1818 c2
= re_string_peek_byte_case (input
, 1);
1820 token
->type
= CHARACTER
;
1821 #ifdef RE_ENABLE_I18N
1822 if (input
->mb_cur_max
> 1)
1824 wint_t wc
= re_string_wchar_at (input
,
1825 re_string_cur_idx (input
) + 1);
1826 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1830 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1835 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1836 token
->type
= OP_ALT
;
1838 case '1': case '2': case '3': case '4': case '5':
1839 case '6': case '7': case '8': case '9':
1840 if (!(syntax
& RE_NO_BK_REFS
))
1842 token
->type
= OP_BACK_REF
;
1843 token
->opr
.idx
= c2
- '1';
1847 if (!(syntax
& RE_NO_GNU_OPS
))
1849 token
->type
= ANCHOR
;
1850 token
->opr
.ctx_type
= WORD_FIRST
;
1854 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= ANCHOR
;
1857 token
->opr
.ctx_type
= WORD_LAST
;
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= ANCHOR
;
1864 token
->opr
.ctx_type
= WORD_DELIM
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1870 token
->type
= ANCHOR
;
1871 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1875 if (!(syntax
& RE_NO_GNU_OPS
))
1876 token
->type
= OP_WORD
;
1879 if (!(syntax
& RE_NO_GNU_OPS
))
1880 token
->type
= OP_NOTWORD
;
1883 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= OP_SPACE
;
1887 if (!(syntax
& RE_NO_GNU_OPS
))
1888 token
->type
= OP_NOTSPACE
;
1891 if (!(syntax
& RE_NO_GNU_OPS
))
1893 token
->type
= ANCHOR
;
1894 token
->opr
.ctx_type
= BUF_FIRST
;
1898 if (!(syntax
& RE_NO_GNU_OPS
))
1900 token
->type
= ANCHOR
;
1901 token
->opr
.ctx_type
= BUF_LAST
;
1905 if (!(syntax
& RE_NO_BK_PARENS
))
1906 token
->type
= OP_OPEN_SUBEXP
;
1909 if (!(syntax
& RE_NO_BK_PARENS
))
1910 token
->type
= OP_CLOSE_SUBEXP
;
1913 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1914 token
->type
= OP_DUP_PLUS
;
1917 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1918 token
->type
= OP_DUP_QUESTION
;
1921 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1922 token
->type
= OP_OPEN_DUP_NUM
;
1925 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1926 token
->type
= OP_CLOSE_DUP_NUM
;
1934 token
->type
= CHARACTER
;
1935 #ifdef RE_ENABLE_I18N
1936 if (input
->mb_cur_max
> 1)
1938 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1939 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1943 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1948 if (syntax
& RE_NEWLINE_ALT
)
1949 token
->type
= OP_ALT
;
1952 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1953 token
->type
= OP_ALT
;
1956 token
->type
= OP_DUP_ASTERISK
;
1959 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1960 token
->type
= OP_DUP_PLUS
;
1963 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1964 token
->type
= OP_DUP_QUESTION
;
1967 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1968 token
->type
= OP_OPEN_DUP_NUM
;
1971 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1972 token
->type
= OP_CLOSE_DUP_NUM
;
1975 if (syntax
& RE_NO_BK_PARENS
)
1976 token
->type
= OP_OPEN_SUBEXP
;
1979 if (syntax
& RE_NO_BK_PARENS
)
1980 token
->type
= OP_CLOSE_SUBEXP
;
1983 token
->type
= OP_OPEN_BRACKET
;
1986 token
->type
= OP_PERIOD
;
1989 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1990 re_string_cur_idx (input
) != 0)
1992 char prev
= re_string_peek_byte (input
, -1);
1993 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1996 token
->type
= ANCHOR
;
1997 token
->opr
.ctx_type
= LINE_FIRST
;
2000 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2001 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2004 re_string_skip_bytes (input
, 1);
2005 peek_token (&next
, input
, syntax
);
2006 re_string_skip_bytes (input
, -1);
2007 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2010 token
->type
= ANCHOR
;
2011 token
->opr
.ctx_type
= LINE_LAST
;
2019 /* Peek a token from INPUT, and return the length of the token.
2020 We must not use this function out of bracket expressions. */
2023 peek_token_bracket (token
, input
, syntax
)
2026 reg_syntax_t syntax
;
2029 if (re_string_eoi (input
))
2031 token
->type
= END_OF_RE
;
2034 c
= re_string_peek_byte (input
, 0);
2037 #ifdef RE_ENABLE_I18N
2038 if (input
->mb_cur_max
> 1 &&
2039 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2041 token
->type
= CHARACTER
;
2044 #endif /* RE_ENABLE_I18N */
2046 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2047 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2049 /* In this case, '\' escape a character. */
2051 re_string_skip_bytes (input
, 1);
2052 c2
= re_string_peek_byte (input
, 0);
2054 token
->type
= CHARACTER
;
2057 if (c
== '[') /* '[' is a special char in a bracket exps. */
2061 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2062 c2
= re_string_peek_byte (input
, 1);
2070 token
->type
= OP_OPEN_COLL_ELEM
;
2073 token
->type
= OP_OPEN_EQUIV_CLASS
;
2076 if (syntax
& RE_CHAR_CLASSES
)
2078 token
->type
= OP_OPEN_CHAR_CLASS
;
2081 /* else fall through. */
2083 token
->type
= CHARACTER
;
2093 token
->type
= OP_CHARSET_RANGE
;
2096 token
->type
= OP_CLOSE_BRACKET
;
2099 token
->type
= OP_NON_MATCH_LIST
;
2102 token
->type
= CHARACTER
;
2107 /* Functions for parser. */
2109 /* Entry point of the parser.
2110 Parse the regular expression REGEXP and return the structure tree.
2111 If an error is occured, ERR is set by error code, and return NULL.
2112 This function build the following tree, from regular expression <reg_exp>:
2118 CAT means concatenation.
2119 EOR means end of regular expression. */
2122 parse (regexp
, preg
, syntax
, err
)
2123 re_string_t
*regexp
;
2125 reg_syntax_t syntax
;
2128 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2129 bin_tree_t
*tree
, *eor
, *root
;
2130 re_token_t current_token
;
2131 dfa
->syntax
= syntax
;
2132 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2133 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2134 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2136 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2138 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2141 if (BE (eor
== NULL
|| root
== NULL
, 0))
2149 /* This function build the following tree, from regular expression
2150 <branch1>|<branch2>:
2156 ALT means alternative, which represents the operator `|'. */
2159 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2160 re_string_t
*regexp
;
2163 reg_syntax_t syntax
;
2167 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2168 bin_tree_t
*tree
, *branch
= NULL
;
2169 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2170 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2173 while (token
->type
== OP_ALT
)
2175 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2176 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2177 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2179 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2180 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2185 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2186 if (BE (tree
== NULL
, 0))
2195 /* This function build the following tree, from regular expression
2202 CAT means concatenation. */
2205 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2206 re_string_t
*regexp
;
2209 reg_syntax_t syntax
;
2213 bin_tree_t
*tree
, *exp
;
2214 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2215 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2216 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2219 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2220 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2222 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2223 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2227 if (tree
!= NULL
&& exp
!= NULL
)
2229 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2236 else if (tree
== NULL
)
2238 /* Otherwise exp == NULL, we don't need to create new tree. */
2243 /* This function build the following tree, from regular expression a*:
2250 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2251 re_string_t
*regexp
;
2254 reg_syntax_t syntax
;
2258 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2260 switch (token
->type
)
2263 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2264 if (BE (tree
== NULL
, 0))
2269 #ifdef RE_ENABLE_I18N
2270 if (dfa
->mb_cur_max
> 1)
2272 while (!re_string_eoi (regexp
)
2273 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2275 bin_tree_t
*mbc_remain
;
2276 fetch_token (token
, regexp
, syntax
);
2277 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2278 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2279 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2288 case OP_OPEN_SUBEXP
:
2289 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2290 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2293 case OP_OPEN_BRACKET
:
2294 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2295 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2299 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2304 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2305 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2306 if (BE (tree
== NULL
, 0))
2312 dfa
->has_mb_node
= 1;
2314 case OP_OPEN_DUP_NUM
:
2315 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2321 case OP_DUP_ASTERISK
:
2323 case OP_DUP_QUESTION
:
2324 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2329 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2331 fetch_token (token
, regexp
, syntax
);
2332 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2334 /* else fall through */
2335 case OP_CLOSE_SUBEXP
:
2336 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2337 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2342 /* else fall through */
2343 case OP_CLOSE_DUP_NUM
:
2344 /* We treat it as a normal character. */
2346 /* Then we can these characters as normal characters. */
2347 token
->type
= CHARACTER
;
2348 /* mb_partial and word_char bits should be initialized already
2350 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2351 if (BE (tree
== NULL
, 0))
2358 if ((token
->opr
.ctx_type
2359 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2360 && dfa
->word_ops_used
== 0)
2361 init_word_char (dfa
);
2362 if (token
->opr
.ctx_type
== WORD_DELIM
2363 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2365 bin_tree_t
*tree_first
, *tree_last
;
2366 if (token
->opr
.ctx_type
== WORD_DELIM
)
2368 token
->opr
.ctx_type
= WORD_FIRST
;
2369 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2370 token
->opr
.ctx_type
= WORD_LAST
;
2374 token
->opr
.ctx_type
= INSIDE_WORD
;
2375 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2376 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2378 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2379 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2380 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2388 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2389 if (BE (tree
== NULL
, 0))
2395 /* We must return here, since ANCHORs can't be followed
2396 by repetition operators.
2397 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2399 fetch_token (token
, regexp
, syntax
);
2402 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2403 if (BE (tree
== NULL
, 0))
2408 if (dfa
->mb_cur_max
> 1)
2409 dfa
->has_mb_node
= 1;
2413 tree
= build_charclass_op (dfa
, regexp
->trans
,
2414 (const unsigned char *) "alnum",
2415 (const unsigned char *) "_",
2416 token
->type
== OP_NOTWORD
, err
);
2417 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2422 tree
= build_charclass_op (dfa
, regexp
->trans
,
2423 (const unsigned char *) "space",
2424 (const unsigned char *) "",
2425 token
->type
== OP_NOTSPACE
, err
);
2426 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2436 /* Must not happen? */
2442 fetch_token (token
, regexp
, syntax
);
2444 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2445 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2447 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2448 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2450 /* In BRE consecutive duplications are not allowed. */
2451 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2452 && (token
->type
== OP_DUP_ASTERISK
2453 || token
->type
== OP_OPEN_DUP_NUM
))
2463 /* This function build the following tree, from regular expression
2471 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2472 re_string_t
*regexp
;
2475 reg_syntax_t syntax
;
2479 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2482 cur_nsub
= preg
->re_nsub
++;
2484 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2486 /* The subexpression may be a null string. */
2487 if (token
->type
== OP_CLOSE_SUBEXP
)
2491 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2492 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2494 if (BE (*err
!= REG_NOERROR
, 0))
2498 if (cur_nsub
<= '9' - '1')
2499 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2501 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2502 if (BE (tree
== NULL
, 0))
2507 tree
->token
.opr
.idx
= cur_nsub
;
2511 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2514 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2516 re_string_t
*regexp
;
2519 reg_syntax_t syntax
;
2522 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2523 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2524 re_token_t start_token
= *token
;
2526 if (token
->type
== OP_OPEN_DUP_NUM
)
2529 start
= fetch_number (regexp
, token
, syntax
);
2532 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2533 start
= 0; /* We treat "{,m}" as "{0,m}". */
2536 *err
= REG_BADBR
; /* <re>{} is invalid. */
2540 if (BE (start
!= -2, 1))
2542 /* We treat "{n}" as "{n,n}". */
2543 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2544 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2545 ? fetch_number (regexp
, token
, syntax
) : -2));
2547 if (BE (start
== -2 || end
== -2, 0))
2549 /* Invalid sequence. */
2550 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2552 if (token
->type
== END_OF_RE
)
2560 /* If the syntax bit is set, rollback. */
2561 re_string_set_index (regexp
, start_idx
);
2562 *token
= start_token
;
2563 token
->type
= CHARACTER
;
2564 /* mb_partial and word_char bits should be already initialized by
2569 if (BE (end
!= -1 && start
> end
, 0))
2571 /* First number greater than second. */
2578 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2579 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2582 fetch_token (token
, regexp
, syntax
);
2584 if (BE (elem
== NULL
, 0))
2586 if (BE (start
== 0 && end
== 0, 0))
2588 postorder (elem
, free_tree
, NULL
);
2592 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2593 if (BE (start
> 0, 0))
2596 for (i
= 2; i
<= start
; ++i
)
2598 elem
= duplicate_tree (elem
, dfa
);
2599 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2600 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2601 goto parse_dup_op_espace
;
2607 /* Duplicate ELEM before it is marked optional. */
2608 elem
= duplicate_tree (elem
, dfa
);
2614 if (elem
->token
.type
== SUBEXP
)
2615 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2617 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2618 if (BE (tree
== NULL
, 0))
2619 goto parse_dup_op_espace
;
2621 /* This loop is actually executed only when end != -1,
2622 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2623 already created the start+1-th copy. */
2624 for (i
= start
+ 2; i
<= end
; ++i
)
2626 elem
= duplicate_tree (elem
, dfa
);
2627 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2628 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2629 goto parse_dup_op_espace
;
2631 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2632 if (BE (tree
== NULL
, 0))
2633 goto parse_dup_op_espace
;
2637 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2641 parse_dup_op_espace
:
2646 /* Size of the names for collating symbol/equivalence_class/character_class.
2647 I'm not sure, but maybe enough. */
2648 #define BRACKET_NAME_BUF_SIZE 32
2651 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2652 Build the range expression which starts from START_ELEM, and ends
2653 at END_ELEM. The result are written to MBCSET and SBCSET.
2654 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2655 mbcset->range_ends, is a pointer argument sinse we may
2658 static reg_errcode_t
2659 # ifdef RE_ENABLE_I18N
2660 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2661 re_charset_t
*mbcset
;
2663 # else /* not RE_ENABLE_I18N */
2664 build_range_exp (sbcset
, start_elem
, end_elem
)
2665 # endif /* not RE_ENABLE_I18N */
2667 bracket_elem_t
*start_elem
, *end_elem
;
2669 unsigned int start_ch
, end_ch
;
2670 /* Equivalence Classes and Character Classes can't be a range start/end. */
2671 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2672 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2676 /* We can handle no multi character collating elements without libc
2678 if (BE ((start_elem
->type
== COLL_SYM
2679 && strlen ((char *) start_elem
->opr
.name
) > 1)
2680 || (end_elem
->type
== COLL_SYM
2681 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2682 return REG_ECOLLATE
;
2684 # ifdef RE_ENABLE_I18N
2689 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2691 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2692 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2694 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2695 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2697 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2698 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2699 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2700 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2701 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2702 return REG_ECOLLATE
;
2703 cmp_buf
[0] = start_wc
;
2704 cmp_buf
[4] = end_wc
;
2705 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2708 /* Got valid collation sequence values, add them as a new entry.
2709 However, for !_LIBC we have no collation elements: if the
2710 character set is single byte, the single byte character set
2711 that we build below suffices. parse_bracket_exp passes
2712 no MBCSET if dfa->mb_cur_max == 1. */
2715 /* Check the space of the arrays. */
2716 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2718 /* There is not enough space, need realloc. */
2719 wchar_t *new_array_start
, *new_array_end
;
2722 /* +1 in case of mbcset->nranges is 0. */
2723 new_nranges
= 2 * mbcset
->nranges
+ 1;
2724 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2725 are NULL if *range_alloc == 0. */
2726 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2728 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2731 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2734 mbcset
->range_starts
= new_array_start
;
2735 mbcset
->range_ends
= new_array_end
;
2736 *range_alloc
= new_nranges
;
2739 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2740 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2743 /* Build the table for single byte characters. */
2744 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2747 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2748 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2749 bitset_set (sbcset
, wc
);
2752 # else /* not RE_ENABLE_I18N */
2755 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2756 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2758 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2759 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2761 if (start_ch
> end_ch
)
2763 /* Build the table for single byte characters. */
2764 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2765 if (start_ch
<= ch
&& ch
<= end_ch
)
2766 bitset_set (sbcset
, ch
);
2768 # endif /* not RE_ENABLE_I18N */
2771 #endif /* not _LIBC */
2774 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2775 Build the collating element which is represented by NAME.
2776 The result are written to MBCSET and SBCSET.
2777 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2778 pointer argument since we may update it. */
2780 static reg_errcode_t
2781 # ifdef RE_ENABLE_I18N
2782 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2783 re_charset_t
*mbcset
;
2784 int *coll_sym_alloc
;
2785 # else /* not RE_ENABLE_I18N */
2786 build_collating_symbol (sbcset
, name
)
2787 # endif /* not RE_ENABLE_I18N */
2789 const unsigned char *name
;
2791 size_t name_len
= strlen ((const char *) name
);
2792 if (BE (name_len
!= 1, 0))
2793 return REG_ECOLLATE
;
2796 bitset_set (sbcset
, name
[0]);
2800 #endif /* not _LIBC */
2802 /* This function parse bracket expression like "[abc]", "[a-c]",
2806 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2807 re_string_t
*regexp
;
2810 reg_syntax_t syntax
;
2814 const unsigned char *collseqmb
;
2815 const char *collseqwc
;
2818 const int32_t *symb_table
;
2819 const unsigned char *extra
;
2821 /* Local function for parse_bracket_exp used in _LIBC environement.
2822 Seek the collating symbol entry correspondings to NAME.
2823 Return the index of the symbol in the SYMB_TABLE. */
2826 __attribute ((always_inline
))
2827 seek_collating_symbol_entry (name
, name_len
)
2828 const unsigned char *name
;
2831 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2832 int32_t elem
= hash
% table_size
;
2833 int32_t second
= hash
% (table_size
- 2);
2834 while (symb_table
[2 * elem
] != 0)
2836 /* First compare the hashing value. */
2837 if (symb_table
[2 * elem
] == hash
2838 /* Compare the length of the name. */
2839 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2840 /* Compare the name. */
2841 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2844 /* Yep, this is the entry. */
2854 /* Local function for parse_bracket_exp used in _LIBC environement.
2855 Look up the collation sequence value of BR_ELEM.
2856 Return the value if succeeded, UINT_MAX otherwise. */
2858 auto inline unsigned int
2859 __attribute ((always_inline
))
2860 lookup_collation_sequence_value (br_elem
)
2861 bracket_elem_t
*br_elem
;
2863 if (br_elem
->type
== SB_CHAR
)
2866 if (MB_CUR_MAX == 1)
2869 return collseqmb
[br_elem
->opr
.ch
];
2872 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2873 return __collseq_table_lookup (collseqwc
, wc
);
2876 else if (br_elem
->type
== MB_CHAR
)
2878 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2880 else if (br_elem
->type
== COLL_SYM
)
2882 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2886 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2888 if (symb_table
[2 * elem
] != 0)
2890 /* We found the entry. */
2891 idx
= symb_table
[2 * elem
+ 1];
2892 /* Skip the name of collating element name. */
2893 idx
+= 1 + extra
[idx
];
2894 /* Skip the byte sequence of the collating element. */
2895 idx
+= 1 + extra
[idx
];
2896 /* Adjust for the alignment. */
2897 idx
= (idx
+ 3) & ~3;
2898 /* Skip the multibyte collation sequence value. */
2899 idx
+= sizeof (unsigned int);
2900 /* Skip the wide char sequence of the collating element. */
2901 idx
+= sizeof (unsigned int) *
2902 (1 + *(unsigned int *) (extra
+ idx
));
2903 /* Return the collation sequence value. */
2904 return *(unsigned int *) (extra
+ idx
);
2906 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2908 /* No valid character. Match it as a single byte
2910 return collseqmb
[br_elem
->opr
.name
[0]];
2913 else if (sym_name_len
== 1)
2914 return collseqmb
[br_elem
->opr
.name
[0]];
2919 /* Local function for parse_bracket_exp used in _LIBC environement.
2920 Build the range expression which starts from START_ELEM, and ends
2921 at END_ELEM. The result are written to MBCSET and SBCSET.
2922 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2923 mbcset->range_ends, is a pointer argument sinse we may
2926 auto inline reg_errcode_t
2927 __attribute ((always_inline
))
2928 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2929 re_charset_t
*mbcset
;
2932 bracket_elem_t
*start_elem
, *end_elem
;
2935 uint32_t start_collseq
;
2936 uint32_t end_collseq
;
2938 /* Equivalence Classes and Character Classes can't be a range
2940 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2941 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2945 start_collseq
= lookup_collation_sequence_value (start_elem
);
2946 end_collseq
= lookup_collation_sequence_value (end_elem
);
2947 /* Check start/end collation sequence values. */
2948 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2949 return REG_ECOLLATE
;
2950 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2953 /* Got valid collation sequence values, add them as a new entry.
2954 However, if we have no collation elements, and the character set
2955 is single byte, the single byte character set that we
2956 build below suffices. */
2957 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2959 /* Check the space of the arrays. */
2960 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2962 /* There is not enough space, need realloc. */
2963 uint32_t *new_array_start
;
2964 uint32_t *new_array_end
;
2967 /* +1 in case of mbcset->nranges is 0. */
2968 new_nranges
= 2 * mbcset
->nranges
+ 1;
2969 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2971 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2974 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2977 mbcset
->range_starts
= new_array_start
;
2978 mbcset
->range_ends
= new_array_end
;
2979 *range_alloc
= new_nranges
;
2982 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2983 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2986 /* Build the table for single byte characters. */
2987 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2989 uint32_t ch_collseq
;
2991 if (MB_CUR_MAX == 1)
2994 ch_collseq
= collseqmb
[ch
];
2996 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2997 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2998 bitset_set (sbcset
, ch
);
3003 /* Local function for parse_bracket_exp used in _LIBC environement.
3004 Build the collating element which is represented by NAME.
3005 The result are written to MBCSET and SBCSET.
3006 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3007 pointer argument sinse we may update it. */
3009 auto inline reg_errcode_t
3010 __attribute ((always_inline
))
3011 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3012 re_charset_t
*mbcset
;
3013 int *coll_sym_alloc
;
3015 const unsigned char *name
;
3018 size_t name_len
= strlen ((const char *) name
);
3021 elem
= seek_collating_symbol_entry (name
, name_len
);
3022 if (symb_table
[2 * elem
] != 0)
3024 /* We found the entry. */
3025 idx
= symb_table
[2 * elem
+ 1];
3026 /* Skip the name of collating element name. */
3027 idx
+= 1 + extra
[idx
];
3029 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3031 /* No valid character, treat it as a normal
3033 bitset_set (sbcset
, name
[0]);
3037 return REG_ECOLLATE
;
3039 /* Got valid collation sequence, add it as a new entry. */
3040 /* Check the space of the arrays. */
3041 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3043 /* Not enough, realloc it. */
3044 /* +1 in case of mbcset->ncoll_syms is 0. */
3045 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3046 /* Use realloc since mbcset->coll_syms is NULL
3048 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3049 new_coll_sym_alloc
);
3050 if (BE (new_coll_syms
== NULL
, 0))
3052 mbcset
->coll_syms
= new_coll_syms
;
3053 *coll_sym_alloc
= new_coll_sym_alloc
;
3055 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3060 if (BE (name_len
!= 1, 0))
3061 return REG_ECOLLATE
;
3064 bitset_set (sbcset
, name
[0]);
3071 re_token_t br_token
;
3072 re_bitset_ptr_t sbcset
;
3073 #ifdef RE_ENABLE_I18N
3074 re_charset_t
*mbcset
;
3075 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3076 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3077 #endif /* not RE_ENABLE_I18N */
3079 bin_tree_t
*work_tree
;
3081 int first_round
= 1;
3083 collseqmb
= (const unsigned char *)
3084 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3085 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3091 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3092 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3093 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3094 _NL_COLLATE_SYMB_TABLEMB
);
3095 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3096 _NL_COLLATE_SYMB_EXTRAMB
);
3099 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3100 #ifdef RE_ENABLE_I18N
3101 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3102 #endif /* RE_ENABLE_I18N */
3103 #ifdef RE_ENABLE_I18N
3104 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3106 if (BE (sbcset
== NULL
, 0))
3107 #endif /* RE_ENABLE_I18N */
3113 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3114 if (BE (token
->type
== END_OF_RE
, 0))
3117 goto parse_bracket_exp_free_return
;
3119 if (token
->type
== OP_NON_MATCH_LIST
)
3121 #ifdef RE_ENABLE_I18N
3122 mbcset
->non_match
= 1;
3123 #endif /* not RE_ENABLE_I18N */
3125 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3126 bitset_set (sbcset
, '\0');
3127 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3128 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3129 if (BE (token
->type
== END_OF_RE
, 0))
3132 goto parse_bracket_exp_free_return
;
3136 /* We treat the first ']' as a normal character. */
3137 if (token
->type
== OP_CLOSE_BRACKET
)
3138 token
->type
= CHARACTER
;
3142 bracket_elem_t start_elem
, end_elem
;
3143 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3144 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3146 int token_len2
= 0, is_range_exp
= 0;
3149 start_elem
.opr
.name
= start_name_buf
;
3150 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3151 syntax
, first_round
);
3152 if (BE (ret
!= REG_NOERROR
, 0))
3155 goto parse_bracket_exp_free_return
;
3159 /* Get information about the next token. We need it in any case. */
3160 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3162 /* Do not check for ranges if we know they are not allowed. */
3163 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3165 if (BE (token
->type
== END_OF_RE
, 0))
3168 goto parse_bracket_exp_free_return
;
3170 if (token
->type
== OP_CHARSET_RANGE
)
3172 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3173 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3174 if (BE (token2
.type
== END_OF_RE
, 0))
3177 goto parse_bracket_exp_free_return
;
3179 if (token2
.type
== OP_CLOSE_BRACKET
)
3181 /* We treat the last '-' as a normal character. */
3182 re_string_skip_bytes (regexp
, -token_len
);
3183 token
->type
= CHARACTER
;
3190 if (is_range_exp
== 1)
3192 end_elem
.opr
.name
= end_name_buf
;
3193 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3195 if (BE (ret
!= REG_NOERROR
, 0))
3198 goto parse_bracket_exp_free_return
;
3201 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3204 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3205 &start_elem
, &end_elem
);
3207 # ifdef RE_ENABLE_I18N
3208 *err
= build_range_exp (sbcset
,
3209 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3210 &range_alloc
, &start_elem
, &end_elem
);
3212 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3214 #endif /* RE_ENABLE_I18N */
3215 if (BE (*err
!= REG_NOERROR
, 0))
3216 goto parse_bracket_exp_free_return
;
3220 switch (start_elem
.type
)
3223 bitset_set (sbcset
, start_elem
.opr
.ch
);
3225 #ifdef RE_ENABLE_I18N
3227 /* Check whether the array has enough space. */
3228 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3230 wchar_t *new_mbchars
;
3231 /* Not enough, realloc it. */
3232 /* +1 in case of mbcset->nmbchars is 0. */
3233 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3234 /* Use realloc since array is NULL if *alloc == 0. */
3235 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3237 if (BE (new_mbchars
== NULL
, 0))
3238 goto parse_bracket_exp_espace
;
3239 mbcset
->mbchars
= new_mbchars
;
3241 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3243 #endif /* RE_ENABLE_I18N */
3245 *err
= build_equiv_class (sbcset
,
3246 #ifdef RE_ENABLE_I18N
3247 mbcset
, &equiv_class_alloc
,
3248 #endif /* RE_ENABLE_I18N */
3249 start_elem
.opr
.name
);
3250 if (BE (*err
!= REG_NOERROR
, 0))
3251 goto parse_bracket_exp_free_return
;
3254 *err
= build_collating_symbol (sbcset
,
3255 #ifdef RE_ENABLE_I18N
3256 mbcset
, &coll_sym_alloc
,
3257 #endif /* RE_ENABLE_I18N */
3258 start_elem
.opr
.name
);
3259 if (BE (*err
!= REG_NOERROR
, 0))
3260 goto parse_bracket_exp_free_return
;
3263 *err
= build_charclass (regexp
->trans
, sbcset
,
3264 #ifdef RE_ENABLE_I18N
3265 mbcset
, &char_class_alloc
,
3266 #endif /* RE_ENABLE_I18N */
3267 start_elem
.opr
.name
, syntax
);
3268 if (BE (*err
!= REG_NOERROR
, 0))
3269 goto parse_bracket_exp_free_return
;
3276 if (BE (token
->type
== END_OF_RE
, 0))
3279 goto parse_bracket_exp_free_return
;
3281 if (token
->type
== OP_CLOSE_BRACKET
)
3285 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3287 /* If it is non-matching list. */
3289 bitset_not (sbcset
);
3291 #ifdef RE_ENABLE_I18N
3292 /* Ensure only single byte characters are set. */
3293 if (dfa
->mb_cur_max
> 1)
3294 bitset_mask (sbcset
, dfa
->sb_char
);
3296 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3297 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3298 || mbcset
->non_match
)))
3300 bin_tree_t
*mbc_tree
;
3302 /* Build a tree for complex bracket. */
3303 dfa
->has_mb_node
= 1;
3304 br_token
.type
= COMPLEX_BRACKET
;
3305 br_token
.opr
.mbcset
= mbcset
;
3306 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3307 if (BE (mbc_tree
== NULL
, 0))
3308 goto parse_bracket_exp_espace
;
3309 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3310 if (sbcset
[sbc_idx
])
3312 /* If there are no bits set in sbcset, there is no point
3313 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3314 if (sbc_idx
< BITSET_WORDS
)
3316 /* Build a tree for simple bracket. */
3317 br_token
.type
= SIMPLE_BRACKET
;
3318 br_token
.opr
.sbcset
= sbcset
;
3319 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3320 if (BE (work_tree
== NULL
, 0))
3321 goto parse_bracket_exp_espace
;
3323 /* Then join them by ALT node. */
3324 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3325 if (BE (work_tree
== NULL
, 0))
3326 goto parse_bracket_exp_espace
;
3331 work_tree
= mbc_tree
;
3335 #endif /* not RE_ENABLE_I18N */
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset
);
3340 /* Build a tree for simple bracket. */
3341 br_token
.type
= SIMPLE_BRACKET
;
3342 br_token
.opr
.sbcset
= sbcset
;
3343 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3344 if (BE (work_tree
== NULL
, 0))
3345 goto parse_bracket_exp_espace
;
3349 parse_bracket_exp_espace
:
3351 parse_bracket_exp_free_return
:
3353 #ifdef RE_ENABLE_I18N
3354 free_charset (mbcset
);
3355 #endif /* RE_ENABLE_I18N */
3359 /* Parse an element in the bracket expression. */
3361 static reg_errcode_t
3362 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3364 bracket_elem_t
*elem
;
3365 re_string_t
*regexp
;
3369 reg_syntax_t syntax
;
3372 #ifdef RE_ENABLE_I18N
3374 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3375 if (cur_char_size
> 1)
3377 elem
->type
= MB_CHAR
;
3378 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3379 re_string_skip_bytes (regexp
, cur_char_size
);
3382 #endif /* RE_ENABLE_I18N */
3383 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3384 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3385 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3386 return parse_bracket_symbol (elem
, regexp
, token
);
3387 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3389 /* A '-' must only appear as anything but a range indicator before
3390 the closing bracket. Everything else is an error. */
3392 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3393 if (token2
.type
!= OP_CLOSE_BRACKET
)
3394 /* The actual error value is not standardized since this whole
3395 case is undefined. But ERANGE makes good sense. */
3398 elem
->type
= SB_CHAR
;
3399 elem
->opr
.ch
= token
->opr
.c
;
3403 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3404 such as [:<character_class>:], [.<collating_element>.], and
3405 [=<equivalent_class>=]. */
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem
, regexp
, token
)
3409 bracket_elem_t
*elem
;
3410 re_string_t
*regexp
;
3413 unsigned char ch
, delim
= token
->opr
.c
;
3415 if (re_string_eoi(regexp
))
3419 if (i
>= BRACKET_NAME_BUF_SIZE
)
3421 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3422 ch
= re_string_fetch_byte_case (regexp
);
3424 ch
= re_string_fetch_byte (regexp
);
3425 if (re_string_eoi(regexp
))
3427 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3429 elem
->opr
.name
[i
] = ch
;
3431 re_string_skip_bytes (regexp
, 1);
3432 elem
->opr
.name
[i
] = '\0';
3433 switch (token
->type
)
3435 case OP_OPEN_COLL_ELEM
:
3436 elem
->type
= COLL_SYM
;
3438 case OP_OPEN_EQUIV_CLASS
:
3439 elem
->type
= EQUIV_CLASS
;
3441 case OP_OPEN_CHAR_CLASS
:
3442 elem
->type
= CHAR_CLASS
;
3450 /* Helper function for parse_bracket_exp.
3451 Build the equivalence class which is represented by NAME.
3452 The result are written to MBCSET and SBCSET.
3453 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454 is a pointer argument sinse we may update it. */
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3459 re_charset_t
*mbcset
;
3460 int *equiv_class_alloc
;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset
, name
)
3463 #endif /* not RE_ENABLE_I18N */
3465 const unsigned char *name
;
3468 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3471 const int32_t *table
, *indirect
;
3472 const unsigned char *weights
, *extra
, *cp
;
3473 unsigned char char_buf
[2];
3477 /* This #include defines a local function! */
3478 # include <locale/weight.h>
3479 /* Calculate the index for equivalence class. */
3481 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3482 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3483 _NL_COLLATE_WEIGHTMB
);
3484 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3485 _NL_COLLATE_EXTRAMB
);
3486 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3487 _NL_COLLATE_INDIRECTMB
);
3488 idx1
= findidx (&cp
);
3489 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3490 /* This isn't a valid character. */
3491 return REG_ECOLLATE
;
3493 /* Build single byte matcing table for this equivalence class. */
3494 char_buf
[1] = (unsigned char) '\0';
3495 len
= weights
[idx1
];
3496 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3500 idx2
= findidx (&cp
);
3505 /* This isn't a valid character. */
3507 if (len
== weights
[idx2
])
3510 while (cnt
<= len
&&
3511 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3515 bitset_set (sbcset
, ch
);
3518 /* Check whether the array has enough space. */
3519 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nequiv_classes is 0. */
3523 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3524 /* Use realloc since the array is NULL if *alloc == 0. */
3525 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3527 new_equiv_class_alloc
);
3528 if (BE (new_equiv_classes
== NULL
, 0))
3530 mbcset
->equiv_classes
= new_equiv_classes
;
3531 *equiv_class_alloc
= new_equiv_class_alloc
;
3533 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3538 if (BE (strlen ((const char *) name
) != 1, 0))
3539 return REG_ECOLLATE
;
3540 bitset_set (sbcset
, *name
);
3545 /* Helper function for parse_bracket_exp.
3546 Build the character class which is represented by NAME.
3547 The result are written to MBCSET and SBCSET.
3548 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549 is a pointer argument sinse we may update it. */
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3554 re_charset_t
*mbcset
;
3555 int *char_class_alloc
;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans
, sbcset
, class_name
, syntax
)
3558 #endif /* not RE_ENABLE_I18N */
3559 RE_TRANSLATE_TYPE trans
;
3561 const unsigned char *class_name
;
3562 reg_syntax_t syntax
;
3565 const char *name
= (const char *) class_name
;
3567 /* In case of REG_ICASE "upper" and "lower" match the both of
3568 upper and lower cases. */
3569 if ((syntax
& RE_ICASE
)
3570 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3573 #ifdef RE_ENABLE_I18N
3574 /* Check the space of the arrays. */
3575 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3577 /* Not enough, realloc it. */
3578 /* +1 in case of mbcset->nchar_classes is 0. */
3579 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3580 /* Use realloc since array is NULL if *alloc == 0. */
3581 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3582 new_char_class_alloc
);
3583 if (BE (new_char_classes
== NULL
, 0))
3585 mbcset
->char_classes
= new_char_classes
;
3586 *char_class_alloc
= new_char_class_alloc
;
3588 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3589 #endif /* RE_ENABLE_I18N */
3591 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3592 for (i = 0; i < SBC_MAX; ++i) \
3594 if (ctype_func (i)) \
3596 int ch = trans ? trans[i] : i; \
3597 bitset_set (sbcset, ch); \
3601 if (strcmp (name
, "alnum") == 0)
3602 BUILD_CHARCLASS_LOOP (isalnum
)
3603 else if (strcmp (name
, "cntrl") == 0)
3604 BUILD_CHARCLASS_LOOP (iscntrl
)
3605 else if (strcmp (name
, "lower") == 0)
3606 BUILD_CHARCLASS_LOOP (islower
)
3607 else if (strcmp (name
, "space") == 0)
3608 BUILD_CHARCLASS_LOOP (isspace
)
3609 else if (strcmp (name
, "alpha") == 0)
3610 BUILD_CHARCLASS_LOOP (isalpha
)
3611 else if (strcmp (name
, "digit") == 0)
3612 BUILD_CHARCLASS_LOOP (isdigit
)
3613 else if (strcmp (name
, "print") == 0)
3614 BUILD_CHARCLASS_LOOP (isprint
)
3615 else if (strcmp (name
, "upper") == 0)
3616 BUILD_CHARCLASS_LOOP (isupper
)
3617 else if (strcmp (name
, "blank") == 0)
3618 BUILD_CHARCLASS_LOOP (isblank
)
3619 else if (strcmp (name
, "graph") == 0)
3620 BUILD_CHARCLASS_LOOP (isgraph
)
3621 else if (strcmp (name
, "punct") == 0)
3622 BUILD_CHARCLASS_LOOP (ispunct
)
3623 else if (strcmp (name
, "xdigit") == 0)
3624 BUILD_CHARCLASS_LOOP (isxdigit
)
3632 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3634 RE_TRANSLATE_TYPE trans
;
3635 const unsigned char *class_name
;
3636 const unsigned char *extra
;
3640 re_bitset_ptr_t sbcset
;
3641 #ifdef RE_ENABLE_I18N
3642 re_charset_t
*mbcset
;
3644 #endif /* not RE_ENABLE_I18N */
3646 re_token_t br_token
;
3649 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3650 #ifdef RE_ENABLE_I18N
3651 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3652 #endif /* RE_ENABLE_I18N */
3654 #ifdef RE_ENABLE_I18N
3655 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3656 #else /* not RE_ENABLE_I18N */
3657 if (BE (sbcset
== NULL
, 0))
3658 #endif /* not RE_ENABLE_I18N */
3666 #ifdef RE_ENABLE_I18N
3668 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669 bitset_set(cset->sbcset, '\0');
3671 mbcset
->non_match
= 1;
3672 #endif /* not RE_ENABLE_I18N */
3675 /* We don't care the syntax in this case. */
3676 ret
= build_charclass (trans
, sbcset
,
3677 #ifdef RE_ENABLE_I18N
3679 #endif /* RE_ENABLE_I18N */
3682 if (BE (ret
!= REG_NOERROR
, 0))
3685 #ifdef RE_ENABLE_I18N
3686 free_charset (mbcset
);
3687 #endif /* RE_ENABLE_I18N */
3691 /* \w match '_' also. */
3692 for (; *extra
; extra
++)
3693 bitset_set (sbcset
, *extra
);
3695 /* If it is non-matching list. */
3697 bitset_not (sbcset
);
3699 #ifdef RE_ENABLE_I18N
3700 /* Ensure only single byte characters are set. */
3701 if (dfa
->mb_cur_max
> 1)
3702 bitset_mask (sbcset
, dfa
->sb_char
);
3705 /* Build a tree for simple bracket. */
3706 br_token
.type
= SIMPLE_BRACKET
;
3707 br_token
.opr
.sbcset
= sbcset
;
3708 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3709 if (BE (tree
== NULL
, 0))
3710 goto build_word_op_espace
;
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa
->mb_cur_max
> 1)
3715 bin_tree_t
*mbc_tree
;
3716 /* Build a tree for complex bracket. */
3717 br_token
.type
= COMPLEX_BRACKET
;
3718 br_token
.opr
.mbcset
= mbcset
;
3719 dfa
->has_mb_node
= 1;
3720 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3721 if (BE (mbc_tree
== NULL
, 0))
3722 goto build_word_op_espace
;
3723 /* Then join them by ALT node. */
3724 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3725 if (BE (mbc_tree
!= NULL
, 1))
3730 free_charset (mbcset
);
3733 #else /* not RE_ENABLE_I18N */
3735 #endif /* not RE_ENABLE_I18N */
3737 build_word_op_espace
:
3739 #ifdef RE_ENABLE_I18N
3740 free_charset (mbcset
);
3741 #endif /* RE_ENABLE_I18N */
3746 /* This is intended for the expressions like "a{1,3}".
3747 Fetch a number from `input', and return the number.
3748 Return -1, if the number field is empty like "{,1}".
3749 Return -2, If an error is occured. */
3752 fetch_number (input
, token
, syntax
)
3755 reg_syntax_t syntax
;
3761 fetch_token (token
, input
, syntax
);
3763 if (BE (token
->type
== END_OF_RE
, 0))
3765 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3767 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3768 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3769 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3774 #ifdef RE_ENABLE_I18N
3776 free_charset (re_charset_t
*cset
)
3778 re_free (cset
->mbchars
);
3780 re_free (cset
->coll_syms
);
3781 re_free (cset
->equiv_classes
);
3782 re_free (cset
->range_starts
);
3783 re_free (cset
->range_ends
);
3785 re_free (cset
->char_classes
);
3788 #endif /* RE_ENABLE_I18N */
3790 /* Functions for binary tree operation. */
3792 /* Create a tree node. */
3795 create_tree (dfa
, left
, right
, type
)
3799 re_token_type_t type
;
3803 return create_token_tree (dfa
, left
, right
, &t
);
3807 create_token_tree (dfa
, left
, right
, token
)
3811 const re_token_t
*token
;
3814 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3816 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3818 if (storage
== NULL
)
3820 storage
->next
= dfa
->str_tree_storage
;
3821 dfa
->str_tree_storage
= storage
;
3822 dfa
->str_tree_storage_idx
= 0;
3824 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3826 tree
->parent
= NULL
;
3828 tree
->right
= right
;
3829 tree
->token
= *token
;
3830 tree
->token
.duplicated
= 0;
3831 tree
->token
.opt_subexp
= 0;
3834 tree
->node_idx
= -1;
3837 left
->parent
= tree
;
3839 right
->parent
= tree
;
3843 /* Mark the tree SRC as an optional subexpression.
3844 To be called from preorder or postorder. */
3846 static reg_errcode_t
3847 mark_opt_subexp (extra
, node
)
3851 int idx
= (int) (long) extra
;
3852 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3853 node
->token
.opt_subexp
= 1;
3858 /* Free the allocated memory inside NODE. */
3861 free_token (re_token_t
*node
)
3863 #ifdef RE_ENABLE_I18N
3864 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3865 free_charset (node
->opr
.mbcset
);
3867 #endif /* RE_ENABLE_I18N */
3868 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3869 re_free (node
->opr
.sbcset
);
3872 /* Worker function for tree walking. Free the allocated memory inside NODE
3873 and its children. */
3875 static reg_errcode_t
3876 free_tree (void *extra
, bin_tree_t
*node
)
3878 free_token (&node
->token
);
3883 /* Duplicate the node SRC, and return new node. This is a preorder
3884 visit similar to the one implemented by the generic visitor, but
3885 we need more infrastructure to maintain two parallel trees --- so,
3886 it's easier to duplicate. */
3889 duplicate_tree (root
, dfa
)
3890 const bin_tree_t
*root
;
3893 const bin_tree_t
*node
;
3894 bin_tree_t
*dup_root
;
3895 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3897 for (node
= root
; ; )
3899 /* Create a new tree and link it back to the current parent. */
3900 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3903 (*p_new
)->parent
= dup_node
;
3904 (*p_new
)->token
.duplicated
= 1;
3907 /* Go to the left node, or up and to the right. */
3911 p_new
= &dup_node
->left
;
3915 const bin_tree_t
*prev
= NULL
;
3916 while (node
->right
== prev
|| node
->right
== NULL
)
3919 node
= node
->parent
;
3920 dup_node
= dup_node
->parent
;
3925 p_new
= &dup_node
->right
;