1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
50 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 int *equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 int *char_class_alloc
,
98 const unsigned char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const unsigned char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 const unsigned char *extra
,
112 int non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid
[] attribute_hidden
=
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx
[] attribute_hidden
=
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
214 re_compile_pattern (pattern
, length
, bufp
)
217 struct re_pattern_buffer
*bufp
;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
226 /* Match anchors at newline. */
227 bufp
->newline_anchor
= 1;
229 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
233 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
236 weak_alias (__re_compile_pattern
, re_compile_pattern
)
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options
;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
255 re_set_syntax (syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (bufp
)
269 struct re_pattern_buffer
*bufp
;
271 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
272 char *fastmap
= bufp
->fastmap
;
274 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
275 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
276 if (dfa
->init_state
!= dfa
->init_state_word
)
277 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
278 if (dfa
->init_state
!= dfa
->init_state_nl
)
279 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
280 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
281 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
282 bufp
->fastmap_accurate
= 1;
286 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
290 __attribute ((always_inline
))
291 re_set_fastmap (char *fastmap
, int icase
, int ch
)
295 fastmap
[tolower (ch
)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
302 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
305 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
307 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
308 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
310 int node
= init_state
->nodes
.elems
[node_cnt
];
311 re_token_type_t type
= dfa
->nodes
[node
].type
;
313 if (type
== CHARACTER
)
315 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
319 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, 0, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
353 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
357 /* See if we have to try all bytes which start multiple collation
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
364 && (cset
->ncoll_syms
|| cset
->nranges
))
366 const int32_t *table
= (const int32_t *)
367 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
368 for (i
= 0; i
< SBC_MAX
; ++i
)
370 re_set_fastmap (fastmap
, icase
, i
);
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa
->mb_cur_max
> 1
379 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
381 || cset
->nequiv_classes
389 memset (&mbs
, 0, sizeof (mbs
));
390 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
391 re_set_fastmap (fastmap
, false, (int) c
);
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 memset (&state
, '\0', sizeof (state
));
404 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
405 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
406 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
408 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
410 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
415 #endif /* RE_ENABLE_I18N */
416 else if (type
== OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type
== OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type
== END_OF_RE
)
422 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
423 if (type
== END_OF_RE
)
424 bufp
->can_be_null
= 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (preg
, pattern
, cflags
)
468 regex_t
*__restrict preg
;
469 const char *__restrict pattern
;
473 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
474 : RE_SYNTAX_POSIX_BASIC
);
480 /* Try to allocate space for the fastmap. */
481 preg
->fastmap
= re_malloc (char, SBC_MAX
);
482 if (BE (preg
->fastmap
== NULL
, 0))
485 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
487 /* If REG_NEWLINE is set, newlines are treated differently. */
488 if (cflags
& REG_NEWLINE
)
489 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
490 syntax
&= ~RE_DOT_NEWLINE
;
491 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
492 /* It also changes the matching behavior. */
493 preg
->newline_anchor
= 1;
496 preg
->newline_anchor
= 0;
497 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
498 preg
->translate
= NULL
;
500 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
502 /* POSIX doesn't distinguish between an unmatched open-group and an
503 unmatched close-group: both are REG_EPAREN. */
504 if (ret
== REG_ERPAREN
)
507 /* We have already checked preg->fastmap != NULL. */
508 if (BE (ret
== REG_NOERROR
, 1))
509 /* Compute the fastmap now, since regexec cannot modify the pattern
510 buffer. This function never fails in this implementation. */
511 (void) re_compile_fastmap (preg
);
514 /* Some error occurred while compiling the expression. */
515 re_free (preg
->fastmap
);
516 preg
->fastmap
= NULL
;
522 weak_alias (__regcomp
, regcomp
)
525 /* Returns a message corresponding to an error code, ERRCODE, returned
526 from either regcomp or regexec. We don't use PREG here. */
529 regerror (errcode
, preg
, errbuf
, errbuf_size
)
531 const regex_t
*__restrict preg
;
532 char *__restrict errbuf
;
539 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
540 / sizeof (__re_error_msgid_idx
[0])), 0))
541 /* Only error codes returned by the rest of the code should be passed
542 to this routine. If we are given anything else, or if other regex
543 code generates an invalid error code, then the program has a bug.
544 Dump core so we can fix it. */
547 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
549 msg_size
= strlen (msg
) + 1; /* Includes the null. */
551 if (BE (errbuf_size
!= 0, 1))
553 if (BE (msg_size
> errbuf_size
, 0))
555 #if defined HAVE_MEMPCPY || defined _LIBC
556 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
558 memcpy (errbuf
, msg
, errbuf_size
- 1);
559 errbuf
[errbuf_size
- 1] = 0;
563 memcpy (errbuf
, msg
, msg_size
);
569 weak_alias (__regerror
, regerror
)
573 #ifdef RE_ENABLE_I18N
574 /* This static array is used for the map to single-byte characters when
575 UTF-8 is used. Otherwise we would allocate memory just to initialize
576 it the same all the time. UTF-8 is the preferred encoding so this is
577 a worthwhile optimization. */
578 static const bitset_t utf8_sb_map
=
580 /* Set the first 128 bits. */
581 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
587 free_dfa_content (re_dfa_t
*dfa
)
592 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
593 free_token (dfa
->nodes
+ i
);
594 re_free (dfa
->nexts
);
595 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
597 if (dfa
->eclosures
!= NULL
)
598 re_node_set_free (dfa
->eclosures
+ i
);
599 if (dfa
->inveclosures
!= NULL
)
600 re_node_set_free (dfa
->inveclosures
+ i
);
601 if (dfa
->edests
!= NULL
)
602 re_node_set_free (dfa
->edests
+ i
);
604 re_free (dfa
->edests
);
605 re_free (dfa
->eclosures
);
606 re_free (dfa
->inveclosures
);
607 re_free (dfa
->nodes
);
609 if (dfa
->state_table
)
610 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
612 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
613 for (j
= 0; j
< entry
->num
; ++j
)
615 re_dfastate_t
*state
= entry
->array
[j
];
618 re_free (entry
->array
);
620 re_free (dfa
->state_table
);
621 #ifdef RE_ENABLE_I18N
622 if (dfa
->sb_char
!= utf8_sb_map
)
623 re_free (dfa
->sb_char
);
625 re_free (dfa
->subexp_map
);
627 re_free (dfa
->re_str
);
634 /* Free dynamically allocated space used by PREG. */
640 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
641 if (BE (dfa
!= NULL
, 1))
642 free_dfa_content (dfa
);
646 re_free (preg
->fastmap
);
647 preg
->fastmap
= NULL
;
649 re_free (preg
->translate
);
650 preg
->translate
= NULL
;
653 weak_alias (__regfree
, regfree
)
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
659 #if defined _REGEX_RE_COMP || defined _LIBC
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf
;
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
679 if (!re_comp_buf
.buffer
)
680 return gettext ("No previous regular expression");
684 if (re_comp_buf
.buffer
)
686 fastmap
= re_comp_buf
.fastmap
;
687 re_comp_buf
.fastmap
= NULL
;
688 __regfree (&re_comp_buf
);
689 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
690 re_comp_buf
.fastmap
= fastmap
;
693 if (re_comp_buf
.fastmap
== NULL
)
695 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
696 if (re_comp_buf
.fastmap
== NULL
)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
704 /* Match anchors at newlines. */
705 re_comp_buf
.newline_anchor
= 1;
707 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
717 libc_freeres_fn (free_mem
)
719 __regfree (&re_comp_buf
);
723 #endif /* _REGEX_RE_COMP */
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
730 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
733 reg_errcode_t err
= REG_NOERROR
;
737 /* Initialize the pattern buffer. */
738 preg
->fastmap_accurate
= 0;
739 preg
->syntax
= syntax
;
740 preg
->not_bol
= preg
->not_eol
= 0;
743 preg
->can_be_null
= 0;
744 preg
->regs_allocated
= REGS_UNALLOCATED
;
746 /* Initialize the dfa. */
747 dfa
= (re_dfa_t
*) preg
->buffer
;
748 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
757 preg
->allocated
= sizeof (re_dfa_t
);
758 preg
->buffer
= (unsigned char *) dfa
;
760 preg
->used
= sizeof (re_dfa_t
);
762 err
= init_dfa (dfa
, length
);
763 if (BE (err
!= REG_NOERROR
, 0))
765 free_dfa_content (dfa
);
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa
->re_str
= re_malloc (char, length
+ 1);
773 strncpy (dfa
->re_str
, pattern
, length
+ 1);
776 __libc_lock_init (dfa
->lock
);
778 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
779 syntax
& RE_ICASE
, dfa
);
780 if (BE (err
!= REG_NOERROR
, 0))
782 re_compile_internal_free_return
:
783 free_workarea_compile (preg
);
784 re_string_destruct (®exp
);
785 free_dfa_content (dfa
);
791 /* Parse the regular expression, and build a structure tree. */
793 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
794 if (BE (dfa
->str_tree
== NULL
, 0))
795 goto re_compile_internal_free_return
;
797 /* Analyze the tree and create the nfa. */
798 err
= analyze (preg
);
799 if (BE (err
!= REG_NOERROR
, 0))
800 goto re_compile_internal_free_return
;
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
808 /* Then create the initial state of the dfa. */
809 err
= create_initial_state (dfa
);
811 /* Release work areas. */
812 free_workarea_compile (preg
);
813 re_string_destruct (®exp
);
815 if (BE (err
!= REG_NOERROR
, 0))
817 free_dfa_content (dfa
);
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
829 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
831 unsigned int table_size
;
836 memset (dfa
, '\0', sizeof (re_dfa_t
));
838 /* Force allocation of str_tree_storage the first time. */
839 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
841 /* Avoid overflows. */
842 if (pat_len
== SIZE_MAX
)
845 dfa
->nodes_alloc
= pat_len
+ 1;
846 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
848 /* table_size = 2 ^ ceil(log pat_len) */
849 for (table_size
= 1; ; table_size
<<= 1)
850 if (table_size
> pat_len
)
853 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
854 dfa
->state_hash_mask
= table_size
- 1;
856 dfa
->mb_cur_max
= MB_CUR_MAX
;
858 if (dfa
->mb_cur_max
== 6
859 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
861 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
864 # ifdef HAVE_LANGINFO_CODESET
865 codeset_name
= nl_langinfo (CODESET
);
867 codeset_name
= getenv ("LC_ALL");
868 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
869 codeset_name
= getenv ("LC_CTYPE");
870 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
871 codeset_name
= getenv ("LANG");
872 if (codeset_name
== NULL
)
874 else if (strchr (codeset_name
, '.') != NULL
)
875 codeset_name
= strchr (codeset_name
, '.') + 1;
878 if (strcasecmp (codeset_name
, "UTF-8") == 0
879 || strcasecmp (codeset_name
, "UTF8") == 0)
882 /* We check exhaustively in the loop below if this charset is a
883 superset of ASCII. */
884 dfa
->map_notascii
= 0;
887 #ifdef RE_ENABLE_I18N
888 if (dfa
->mb_cur_max
> 1)
891 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
896 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
897 if (BE (dfa
->sb_char
== NULL
, 0))
900 /* Set the bits corresponding to single byte chars. */
901 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
902 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
904 wint_t wch
= __btowc (ch
);
906 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
908 if (isascii (ch
) && wch
!= ch
)
909 dfa
->map_notascii
= 1;
916 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
921 /* Initialize WORD_CHAR table, which indicate which character is
922 "word". In this case "word" means that it is the word construction
923 character used by some operators like "\<", "\>", etc. */
927 init_word_char (re_dfa_t
*dfa
)
930 dfa
->word_ops_used
= 1;
931 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
932 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
933 if (isalnum (ch
) || ch
== '_')
934 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
937 /* Free the work area which are only used while compiling. */
940 free_workarea_compile (regex_t
*preg
)
942 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
943 bin_tree_storage_t
*storage
, *next
;
944 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
946 next
= storage
->next
;
949 dfa
->str_tree_storage
= NULL
;
950 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
951 dfa
->str_tree
= NULL
;
952 re_free (dfa
->org_indices
);
953 dfa
->org_indices
= NULL
;
956 /* Create initial states for all contexts. */
959 create_initial_state (re_dfa_t
*dfa
)
963 re_node_set init_nodes
;
965 /* Initial states have the epsilon closure of the node which is
966 the first node of the regular expression. */
967 first
= dfa
->str_tree
->first
->node_idx
;
968 dfa
->init_node
= first
;
969 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
970 if (BE (err
!= REG_NOERROR
, 0))
973 /* The back-references which are in initial states can epsilon transit,
974 since in this case all of the subexpressions can be null.
975 Then we add epsilon closures of the nodes which are the next nodes of
976 the back-references. */
977 if (dfa
->nbackref
> 0)
978 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
980 int node_idx
= init_nodes
.elems
[i
];
981 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
984 if (type
!= OP_BACK_REF
)
986 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
988 re_token_t
*clexp_node
;
989 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
990 if (clexp_node
->type
== OP_CLOSE_SUBEXP
991 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
994 if (clexp_idx
== init_nodes
.nelem
)
997 if (type
== OP_BACK_REF
)
999 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1000 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1002 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1005 if (err
!= REG_NOERROR
)
1012 /* It must be the first time to invoke acquire_state. */
1013 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1014 /* We don't check ERR here, since the initial state must not be NULL. */
1015 if (BE (dfa
->init_state
== NULL
, 0))
1017 if (dfa
->init_state
->has_constraint
)
1019 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1021 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1023 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1027 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1028 || dfa
->init_state_begbuf
== NULL
, 0))
1032 dfa
->init_state_word
= dfa
->init_state_nl
1033 = dfa
->init_state_begbuf
= dfa
->init_state
;
1035 re_node_set_free (&init_nodes
);
1039 #ifdef RE_ENABLE_I18N
1040 /* If it is possible to do searching in single byte encoding instead of UTF-8
1041 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1042 DFA nodes where needed. */
1045 optimize_utf8 (re_dfa_t
*dfa
)
1047 int node
, i
, mb_chars
= 0, has_period
= 0;
1049 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1050 switch (dfa
->nodes
[node
].type
)
1053 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1057 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1065 /* Word anchors etc. cannot be handled. It's okay to test
1066 opr.ctx_type since constraints (for all DFA nodes) are
1067 created by ORing one or more opr.ctx_type values. */
1077 case OP_DUP_ASTERISK
:
1078 case OP_OPEN_SUBEXP
:
1079 case OP_CLOSE_SUBEXP
:
1081 case COMPLEX_BRACKET
:
1083 case SIMPLE_BRACKET
:
1084 /* Just double check. The non-ASCII range starts at 0x80. */
1085 assert (0x80 % BITSET_WORD_BITS
== 0);
1086 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1087 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1094 if (mb_chars
|| has_period
)
1095 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1097 if (dfa
->nodes
[node
].type
== CHARACTER
1098 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1099 dfa
->nodes
[node
].mb_partial
= 0;
1100 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1101 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1104 /* The search can be in single byte locale. */
1105 dfa
->mb_cur_max
= 1;
1107 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112 "eclosure", and "inveclosure". */
1114 static reg_errcode_t
1115 analyze (regex_t
*preg
)
1117 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1120 /* Allocate arrays. */
1121 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1122 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1123 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1124 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1125 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1126 || dfa
->eclosures
== NULL
, 0))
1129 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1130 if (dfa
->subexp_map
!= NULL
)
1133 for (i
= 0; i
< preg
->re_nsub
; i
++)
1134 dfa
->subexp_map
[i
] = i
;
1135 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1136 for (i
= 0; i
< preg
->re_nsub
; i
++)
1137 if (dfa
->subexp_map
[i
] != i
)
1139 if (i
== preg
->re_nsub
)
1141 free (dfa
->subexp_map
);
1142 dfa
->subexp_map
= NULL
;
1146 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1147 if (BE (ret
!= REG_NOERROR
, 0))
1149 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1150 if (BE (ret
!= REG_NOERROR
, 0))
1152 preorder (dfa
->str_tree
, calc_next
, dfa
);
1153 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1154 if (BE (ret
!= REG_NOERROR
, 0))
1156 ret
= calc_eclosure (dfa
);
1157 if (BE (ret
!= REG_NOERROR
, 0))
1160 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1162 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1165 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1166 if (BE (dfa
->inveclosures
== NULL
, 0))
1168 ret
= calc_inveclosure (dfa
);
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175 implement parse tree visits. Instead, we use parent pointers and
1176 some hairy code in these two functions. */
1177 static reg_errcode_t
1178 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1181 bin_tree_t
*node
, *prev
;
1183 for (node
= root
; ; )
1185 /* Descend down the tree, preferably to the left (or to the right
1186 if that's the only child). */
1187 while (node
->left
|| node
->right
)
1195 reg_errcode_t err
= fn (extra
, node
);
1196 if (BE (err
!= REG_NOERROR
, 0))
1198 if (node
->parent
== NULL
)
1201 node
= node
->parent
;
1203 /* Go up while we have a node that is reached from the right. */
1204 while (node
->right
== prev
|| node
->right
== NULL
);
1209 static reg_errcode_t
1210 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1215 for (node
= root
; ; )
1217 reg_errcode_t err
= fn (extra
, node
);
1218 if (BE (err
!= REG_NOERROR
, 0))
1221 /* Go to the left node, or up and to the right. */
1226 bin_tree_t
*prev
= NULL
;
1227 while (node
->right
== prev
|| node
->right
== NULL
)
1230 node
= node
->parent
;
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1241 backreferences as well. Requires a preorder visit. */
1242 static reg_errcode_t
1243 optimize_subexps (void *extra
, bin_tree_t
*node
)
1245 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1247 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1249 int idx
= node
->token
.opr
.idx
;
1250 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1251 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1254 else if (node
->token
.type
== SUBEXP
1255 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1257 int other_idx
= node
->left
->token
.opr
.idx
;
1259 node
->left
= node
->left
->left
;
1261 node
->left
->parent
= node
;
1263 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1264 if (other_idx
< BITSET_WORD_BITS
)
1265 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1273 static reg_errcode_t
1274 lower_subexps (void *extra
, bin_tree_t
*node
)
1276 regex_t
*preg
= (regex_t
*) extra
;
1277 reg_errcode_t err
= REG_NOERROR
;
1279 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1281 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1283 node
->left
->parent
= node
;
1285 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1287 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1289 node
->right
->parent
= node
;
1296 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1298 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1299 bin_tree_t
*body
= node
->left
;
1300 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1303 /* We do not optimize empty subexpressions, because otherwise we may
1304 have bad CONCAT nodes with NULL children. This is obviously not
1305 very common, so we do not lose much. An example that triggers
1306 this case is the sed "script" /\(\)/x. */
1307 && node
->left
!= NULL
1308 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1309 || !(dfa
->used_bkref_map
1310 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1313 /* Convert the SUBEXP node to the concatenation of an
1314 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1315 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1316 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1317 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1318 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1319 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1325 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1326 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331 nodes. Requires a postorder visit. */
1332 static reg_errcode_t
1333 calc_first (void *extra
, bin_tree_t
*node
)
1335 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1336 if (node
->token
.type
== CONCAT
)
1338 node
->first
= node
->left
->first
;
1339 node
->node_idx
= node
->left
->node_idx
;
1344 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1345 if (BE (node
->node_idx
== -1, 0))
1347 if (node
->token
.type
== ANCHOR
)
1348 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1353 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1354 static reg_errcode_t
1355 calc_next (void *extra
, bin_tree_t
*node
)
1357 switch (node
->token
.type
)
1359 case OP_DUP_ASTERISK
:
1360 node
->left
->next
= node
;
1363 node
->left
->next
= node
->right
->first
;
1364 node
->right
->next
= node
->next
;
1368 node
->left
->next
= node
->next
;
1370 node
->right
->next
= node
->next
;
1376 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1377 static reg_errcode_t
1378 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1380 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1381 int idx
= node
->node_idx
;
1382 reg_errcode_t err
= REG_NOERROR
;
1384 switch (node
->token
.type
)
1390 assert (node
->next
== NULL
);
1393 case OP_DUP_ASTERISK
:
1397 dfa
->has_plural_match
= 1;
1398 if (node
->left
!= NULL
)
1399 left
= node
->left
->first
->node_idx
;
1401 left
= node
->next
->node_idx
;
1402 if (node
->right
!= NULL
)
1403 right
= node
->right
->first
->node_idx
;
1405 right
= node
->next
->node_idx
;
1407 assert (right
> -1);
1408 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1413 case OP_OPEN_SUBEXP
:
1414 case OP_CLOSE_SUBEXP
:
1415 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1419 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1420 if (node
->token
.type
== OP_BACK_REF
)
1421 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1425 assert (!IS_EPSILON_NODE (node
->token
.type
));
1426 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1433 /* Duplicate the epsilon closure of the node ROOT_NODE.
1434 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1435 to their own constraint. */
1437 static reg_errcode_t
1439 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1440 int root_node
, unsigned int init_constraint
)
1442 int org_node
, clone_node
, ret
;
1443 unsigned int constraint
= init_constraint
;
1444 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1446 int org_dest
, clone_dest
;
1447 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1449 /* If the back reference epsilon-transit, its destination must
1450 also have the constraint. Then duplicate the epsilon closure
1451 of the destination of the back reference, and store it in
1452 edests of the back reference. */
1453 org_dest
= dfa
->nexts
[org_node
];
1454 re_node_set_empty (dfa
->edests
+ clone_node
);
1455 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1456 if (BE (clone_dest
== -1, 0))
1458 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1459 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1460 if (BE (ret
< 0, 0))
1463 else if (dfa
->edests
[org_node
].nelem
== 0)
1465 /* In case of the node can't epsilon-transit, don't duplicate the
1466 destination and store the original destination as the
1467 destination of the node. */
1468 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1471 else if (dfa
->edests
[org_node
].nelem
== 1)
1473 /* In case of the node can epsilon-transit, and it has only one
1475 org_dest
= dfa
->edests
[org_node
].elems
[0];
1476 re_node_set_empty (dfa
->edests
+ clone_node
);
1477 /* If the node is root_node itself, it means the epsilon clsoure
1478 has a loop. Then tie it to the destination of the root_node. */
1479 if (org_node
== root_node
&& clone_node
!= org_node
)
1481 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1482 if (BE (ret
< 0, 0))
1486 /* In case of the node has another constraint, add it. */
1487 constraint
|= dfa
->nodes
[org_node
].constraint
;
1488 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1489 if (BE (clone_dest
== -1, 0))
1491 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1492 if (BE (ret
< 0, 0))
1495 else /* dfa->edests[org_node].nelem == 2 */
1497 /* In case of the node can epsilon-transit, and it has two
1498 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1499 org_dest
= dfa
->edests
[org_node
].elems
[0];
1500 re_node_set_empty (dfa
->edests
+ clone_node
);
1501 /* Search for a duplicated node which satisfies the constraint. */
1502 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1503 if (clone_dest
== -1)
1505 /* There is no such duplicated node, create a new one. */
1507 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1508 if (BE (clone_dest
== -1, 0))
1510 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1511 if (BE (ret
< 0, 0))
1513 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1514 root_node
, constraint
);
1515 if (BE (err
!= REG_NOERROR
, 0))
1520 /* There is a duplicated node which satisfies the constraint,
1521 use it to avoid infinite loop. */
1522 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1523 if (BE (ret
< 0, 0))
1527 org_dest
= dfa
->edests
[org_node
].elems
[1];
1528 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1529 if (BE (clone_dest
== -1, 0))
1531 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1532 if (BE (ret
< 0, 0))
1535 org_node
= org_dest
;
1536 clone_node
= clone_dest
;
1541 /* Search for a node which is duplicated from the node ORG_NODE, and
1542 satisfies the constraint CONSTRAINT. */
1545 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1546 unsigned int constraint
)
1549 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1551 if (org_node
== dfa
->org_indices
[idx
]
1552 && constraint
== dfa
->nodes
[idx
].constraint
)
1553 return idx
; /* Found. */
1555 return -1; /* Not found. */
1558 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1559 Return the index of the new node, or -1 if insufficient storage is
1563 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1565 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1566 if (BE (dup_idx
!= -1, 1))
1568 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1569 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1570 dfa
->nodes
[dup_idx
].duplicated
= 1;
1572 /* Store the index of the original node. */
1573 dfa
->org_indices
[dup_idx
] = org_idx
;
1578 static reg_errcode_t
1579 calc_inveclosure (re_dfa_t
*dfa
)
1582 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1583 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1585 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1587 int *elems
= dfa
->eclosures
[src
].elems
;
1588 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1590 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1591 if (BE (ret
== -1, 0))
1599 /* Calculate "eclosure" for all the node in DFA. */
1601 static reg_errcode_t
1602 calc_eclosure (re_dfa_t
*dfa
)
1604 int node_idx
, incomplete
;
1606 assert (dfa
->nodes_len
> 0);
1609 /* For each nodes, calculate epsilon closure. */
1610 for (node_idx
= 0; ; ++node_idx
)
1613 re_node_set eclosure_elem
;
1614 if (node_idx
== dfa
->nodes_len
)
1623 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1626 /* If we have already calculated, skip it. */
1627 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1629 /* Calculate epsilon closure of `node_idx'. */
1630 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1631 if (BE (err
!= REG_NOERROR
, 0))
1634 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1637 re_node_set_free (&eclosure_elem
);
1643 /* Calculate epsilon closure of NODE. */
1645 static reg_errcode_t
1646 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1650 re_node_set eclosure
;
1653 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1654 if (BE (err
!= REG_NOERROR
, 0))
1657 /* This indicates that we are calculating this node now.
1658 We reference this value to avoid infinite loop. */
1659 dfa
->eclosures
[node
].nelem
= -1;
1661 /* If the current node has constraints, duplicate all nodes
1662 since they must inherit the constraints. */
1663 if (dfa
->nodes
[node
].constraint
1664 && dfa
->edests
[node
].nelem
1665 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1667 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1668 dfa
->nodes
[node
].constraint
);
1669 if (BE (err
!= REG_NOERROR
, 0))
1673 /* Expand each epsilon destination nodes. */
1674 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1675 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1677 re_node_set eclosure_elem
;
1678 int edest
= dfa
->edests
[node
].elems
[i
];
1679 /* If calculating the epsilon closure of `edest' is in progress,
1680 return intermediate result. */
1681 if (dfa
->eclosures
[edest
].nelem
== -1)
1686 /* If we haven't calculated the epsilon closure of `edest' yet,
1687 calculate now. Otherwise use calculated epsilon closure. */
1688 if (dfa
->eclosures
[edest
].nelem
== 0)
1690 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1691 if (BE (err
!= REG_NOERROR
, 0))
1695 eclosure_elem
= dfa
->eclosures
[edest
];
1696 /* Merge the epsilon closure of `edest'. */
1697 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1698 if (BE (err
!= REG_NOERROR
, 0))
1700 /* If the epsilon closure of `edest' is incomplete,
1701 the epsilon closure of this node is also incomplete. */
1702 if (dfa
->eclosures
[edest
].nelem
== 0)
1705 re_node_set_free (&eclosure_elem
);
1709 /* An epsilon closure includes itself. */
1710 ret
= re_node_set_insert (&eclosure
, node
);
1711 if (BE (ret
< 0, 0))
1713 if (incomplete
&& !root
)
1714 dfa
->eclosures
[node
].nelem
= 0;
1716 dfa
->eclosures
[node
] = eclosure
;
1717 *new_set
= eclosure
;
1721 /* Functions for token which are used in the parser. */
1723 /* Fetch a token from INPUT.
1724 We must not use this function inside bracket expressions. */
1728 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1730 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1733 /* Peek a token from INPUT, and return the length of the token.
1734 We must not use this function inside bracket expressions. */
1738 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1742 if (re_string_eoi (input
))
1744 token
->type
= END_OF_RE
;
1748 c
= re_string_peek_byte (input
, 0);
1751 token
->word_char
= 0;
1752 #ifdef RE_ENABLE_I18N
1753 token
->mb_partial
= 0;
1754 if (input
->mb_cur_max
> 1 &&
1755 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1757 token
->type
= CHARACTER
;
1758 token
->mb_partial
= 1;
1765 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1767 token
->type
= BACK_SLASH
;
1771 c2
= re_string_peek_byte_case (input
, 1);
1773 token
->type
= CHARACTER
;
1774 #ifdef RE_ENABLE_I18N
1775 if (input
->mb_cur_max
> 1)
1777 wint_t wc
= re_string_wchar_at (input
,
1778 re_string_cur_idx (input
) + 1);
1779 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1783 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1788 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1789 token
->type
= OP_ALT
;
1791 case '1': case '2': case '3': case '4': case '5':
1792 case '6': case '7': case '8': case '9':
1793 if (!(syntax
& RE_NO_BK_REFS
))
1795 token
->type
= OP_BACK_REF
;
1796 token
->opr
.idx
= c2
- '1';
1800 if (!(syntax
& RE_NO_GNU_OPS
))
1802 token
->type
= ANCHOR
;
1803 token
->opr
.ctx_type
= WORD_FIRST
;
1807 if (!(syntax
& RE_NO_GNU_OPS
))
1809 token
->type
= ANCHOR
;
1810 token
->opr
.ctx_type
= WORD_LAST
;
1814 if (!(syntax
& RE_NO_GNU_OPS
))
1816 token
->type
= ANCHOR
;
1817 token
->opr
.ctx_type
= WORD_DELIM
;
1821 if (!(syntax
& RE_NO_GNU_OPS
))
1823 token
->type
= ANCHOR
;
1824 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1828 if (!(syntax
& RE_NO_GNU_OPS
))
1829 token
->type
= OP_WORD
;
1832 if (!(syntax
& RE_NO_GNU_OPS
))
1833 token
->type
= OP_NOTWORD
;
1836 if (!(syntax
& RE_NO_GNU_OPS
))
1837 token
->type
= OP_SPACE
;
1840 if (!(syntax
& RE_NO_GNU_OPS
))
1841 token
->type
= OP_NOTSPACE
;
1844 if (!(syntax
& RE_NO_GNU_OPS
))
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= BUF_FIRST
;
1851 if (!(syntax
& RE_NO_GNU_OPS
))
1853 token
->type
= ANCHOR
;
1854 token
->opr
.ctx_type
= BUF_LAST
;
1858 if (!(syntax
& RE_NO_BK_PARENS
))
1859 token
->type
= OP_OPEN_SUBEXP
;
1862 if (!(syntax
& RE_NO_BK_PARENS
))
1863 token
->type
= OP_CLOSE_SUBEXP
;
1866 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1867 token
->type
= OP_DUP_PLUS
;
1870 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1871 token
->type
= OP_DUP_QUESTION
;
1874 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1875 token
->type
= OP_OPEN_DUP_NUM
;
1878 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1879 token
->type
= OP_CLOSE_DUP_NUM
;
1887 token
->type
= CHARACTER
;
1888 #ifdef RE_ENABLE_I18N
1889 if (input
->mb_cur_max
> 1)
1891 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1892 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1896 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1901 if (syntax
& RE_NEWLINE_ALT
)
1902 token
->type
= OP_ALT
;
1905 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1906 token
->type
= OP_ALT
;
1909 token
->type
= OP_DUP_ASTERISK
;
1912 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1913 token
->type
= OP_DUP_PLUS
;
1916 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1917 token
->type
= OP_DUP_QUESTION
;
1920 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1921 token
->type
= OP_OPEN_DUP_NUM
;
1924 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1925 token
->type
= OP_CLOSE_DUP_NUM
;
1928 if (syntax
& RE_NO_BK_PARENS
)
1929 token
->type
= OP_OPEN_SUBEXP
;
1932 if (syntax
& RE_NO_BK_PARENS
)
1933 token
->type
= OP_CLOSE_SUBEXP
;
1936 token
->type
= OP_OPEN_BRACKET
;
1939 token
->type
= OP_PERIOD
;
1942 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1943 re_string_cur_idx (input
) != 0)
1945 char prev
= re_string_peek_byte (input
, -1);
1946 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1949 token
->type
= ANCHOR
;
1950 token
->opr
.ctx_type
= LINE_FIRST
;
1953 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1954 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1957 re_string_skip_bytes (input
, 1);
1958 peek_token (&next
, input
, syntax
);
1959 re_string_skip_bytes (input
, -1);
1960 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1963 token
->type
= ANCHOR
;
1964 token
->opr
.ctx_type
= LINE_LAST
;
1972 /* Peek a token from INPUT, and return the length of the token.
1973 We must not use this function out of bracket expressions. */
1977 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1980 if (re_string_eoi (input
))
1982 token
->type
= END_OF_RE
;
1985 c
= re_string_peek_byte (input
, 0);
1988 #ifdef RE_ENABLE_I18N
1989 if (input
->mb_cur_max
> 1 &&
1990 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1992 token
->type
= CHARACTER
;
1995 #endif /* RE_ENABLE_I18N */
1997 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1998 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2000 /* In this case, '\' escape a character. */
2002 re_string_skip_bytes (input
, 1);
2003 c2
= re_string_peek_byte (input
, 0);
2005 token
->type
= CHARACTER
;
2008 if (c
== '[') /* '[' is a special char in a bracket exps. */
2012 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2013 c2
= re_string_peek_byte (input
, 1);
2021 token
->type
= OP_OPEN_COLL_ELEM
;
2024 token
->type
= OP_OPEN_EQUIV_CLASS
;
2027 if (syntax
& RE_CHAR_CLASSES
)
2029 token
->type
= OP_OPEN_CHAR_CLASS
;
2032 /* else fall through. */
2034 token
->type
= CHARACTER
;
2044 token
->type
= OP_CHARSET_RANGE
;
2047 token
->type
= OP_CLOSE_BRACKET
;
2050 token
->type
= OP_NON_MATCH_LIST
;
2053 token
->type
= CHARACTER
;
2058 /* Functions for parser. */
2060 /* Entry point of the parser.
2061 Parse the regular expression REGEXP and return the structure tree.
2062 If an error is occured, ERR is set by error code, and return NULL.
2063 This function build the following tree, from regular expression <reg_exp>:
2069 CAT means concatenation.
2070 EOR means end of regular expression. */
2073 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2076 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2077 bin_tree_t
*tree
, *eor
, *root
;
2078 re_token_t current_token
;
2079 dfa
->syntax
= syntax
;
2080 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2081 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2082 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2084 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2086 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2089 if (BE (eor
== NULL
|| root
== NULL
, 0))
2097 /* This function build the following tree, from regular expression
2098 <branch1>|<branch2>:
2104 ALT means alternative, which represents the operator `|'. */
2107 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2108 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2110 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2111 bin_tree_t
*tree
, *branch
= NULL
;
2112 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2113 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2116 while (token
->type
== OP_ALT
)
2118 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2119 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2120 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2122 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2123 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2128 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2129 if (BE (tree
== NULL
, 0))
2138 /* This function build the following tree, from regular expression
2145 CAT means concatenation. */
2148 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2149 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2151 bin_tree_t
*tree
, *exp
;
2152 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2153 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2154 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2157 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2158 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2160 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2161 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2165 if (tree
!= NULL
&& exp
!= NULL
)
2167 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2174 else if (tree
== NULL
)
2176 /* Otherwise exp == NULL, we don't need to create new tree. */
2181 /* This function build the following tree, from regular expression a*:
2188 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2189 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2191 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2193 switch (token
->type
)
2196 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2197 if (BE (tree
== NULL
, 0))
2202 #ifdef RE_ENABLE_I18N
2203 if (dfa
->mb_cur_max
> 1)
2205 while (!re_string_eoi (regexp
)
2206 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2208 bin_tree_t
*mbc_remain
;
2209 fetch_token (token
, regexp
, syntax
);
2210 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2211 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2212 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2221 case OP_OPEN_SUBEXP
:
2222 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2223 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2226 case OP_OPEN_BRACKET
:
2227 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2228 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2232 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2237 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2238 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2239 if (BE (tree
== NULL
, 0))
2245 dfa
->has_mb_node
= 1;
2247 case OP_OPEN_DUP_NUM
:
2248 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2254 case OP_DUP_ASTERISK
:
2256 case OP_DUP_QUESTION
:
2257 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2262 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2264 fetch_token (token
, regexp
, syntax
);
2265 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2267 /* else fall through */
2268 case OP_CLOSE_SUBEXP
:
2269 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2270 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2275 /* else fall through */
2276 case OP_CLOSE_DUP_NUM
:
2277 /* We treat it as a normal character. */
2279 /* Then we can these characters as normal characters. */
2280 token
->type
= CHARACTER
;
2281 /* mb_partial and word_char bits should be initialized already
2283 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2284 if (BE (tree
== NULL
, 0))
2291 if ((token
->opr
.ctx_type
2292 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2293 && dfa
->word_ops_used
== 0)
2294 init_word_char (dfa
);
2295 if (token
->opr
.ctx_type
== WORD_DELIM
2296 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2298 bin_tree_t
*tree_first
, *tree_last
;
2299 if (token
->opr
.ctx_type
== WORD_DELIM
)
2301 token
->opr
.ctx_type
= WORD_FIRST
;
2302 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2303 token
->opr
.ctx_type
= WORD_LAST
;
2307 token
->opr
.ctx_type
= INSIDE_WORD
;
2308 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2309 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2311 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2312 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2313 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2321 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2322 if (BE (tree
== NULL
, 0))
2328 /* We must return here, since ANCHORs can't be followed
2329 by repetition operators.
2330 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2331 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2332 fetch_token (token
, regexp
, syntax
);
2335 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2336 if (BE (tree
== NULL
, 0))
2341 if (dfa
->mb_cur_max
> 1)
2342 dfa
->has_mb_node
= 1;
2346 tree
= build_charclass_op (dfa
, regexp
->trans
,
2347 (const unsigned char *) "alnum",
2348 (const unsigned char *) "_",
2349 token
->type
== OP_NOTWORD
, err
);
2350 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2355 tree
= build_charclass_op (dfa
, regexp
->trans
,
2356 (const unsigned char *) "space",
2357 (const unsigned char *) "",
2358 token
->type
== OP_NOTSPACE
, err
);
2359 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2369 /* Must not happen? */
2375 fetch_token (token
, regexp
, syntax
);
2377 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2378 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2380 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2381 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2383 /* In BRE consecutive duplications are not allowed. */
2384 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2385 && (token
->type
== OP_DUP_ASTERISK
2386 || token
->type
== OP_OPEN_DUP_NUM
))
2396 /* This function build the following tree, from regular expression
2404 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2405 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2407 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2410 cur_nsub
= preg
->re_nsub
++;
2412 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2414 /* The subexpression may be a null string. */
2415 if (token
->type
== OP_CLOSE_SUBEXP
)
2419 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2420 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2422 if (BE (*err
!= REG_NOERROR
, 0))
2426 if (cur_nsub
<= '9' - '1')
2427 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2429 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2430 if (BE (tree
== NULL
, 0))
2435 tree
->token
.opr
.idx
= cur_nsub
;
2439 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2442 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2443 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2445 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2446 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2447 re_token_t start_token
= *token
;
2449 if (token
->type
== OP_OPEN_DUP_NUM
)
2452 start
= fetch_number (regexp
, token
, syntax
);
2455 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2456 start
= 0; /* We treat "{,m}" as "{0,m}". */
2459 *err
= REG_BADBR
; /* <re>{} is invalid. */
2463 if (BE (start
!= -2, 1))
2465 /* We treat "{n}" as "{n,n}". */
2466 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2467 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2468 ? fetch_number (regexp
, token
, syntax
) : -2));
2470 if (BE (start
== -2 || end
== -2, 0))
2472 /* Invalid sequence. */
2473 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2475 if (token
->type
== END_OF_RE
)
2483 /* If the syntax bit is set, rollback. */
2484 re_string_set_index (regexp
, start_idx
);
2485 *token
= start_token
;
2486 token
->type
= CHARACTER
;
2487 /* mb_partial and word_char bits should be already initialized by
2492 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2494 /* First number greater than second. */
2501 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2502 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2505 fetch_token (token
, regexp
, syntax
);
2507 if (BE (elem
== NULL
, 0))
2509 if (BE (start
== 0 && end
== 0, 0))
2511 postorder (elem
, free_tree
, NULL
);
2515 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2516 if (BE (start
> 0, 0))
2519 for (i
= 2; i
<= start
; ++i
)
2521 elem
= duplicate_tree (elem
, dfa
);
2522 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2523 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2524 goto parse_dup_op_espace
;
2530 /* Duplicate ELEM before it is marked optional. */
2531 elem
= duplicate_tree (elem
, dfa
);
2537 if (elem
->token
.type
== SUBEXP
)
2538 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2540 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2541 if (BE (tree
== NULL
, 0))
2542 goto parse_dup_op_espace
;
2544 /* This loop is actually executed only when end != -1,
2545 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2546 already created the start+1-th copy. */
2547 for (i
= start
+ 2; i
<= end
; ++i
)
2549 elem
= duplicate_tree (elem
, dfa
);
2550 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2551 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2552 goto parse_dup_op_espace
;
2554 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2555 if (BE (tree
== NULL
, 0))
2556 goto parse_dup_op_espace
;
2560 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2564 parse_dup_op_espace
:
2569 /* Size of the names for collating symbol/equivalence_class/character_class.
2570 I'm not sure, but maybe enough. */
2571 #define BRACKET_NAME_BUF_SIZE 32
2574 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2575 Build the range expression which starts from START_ELEM, and ends
2576 at END_ELEM. The result are written to MBCSET and SBCSET.
2577 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2578 mbcset->range_ends, is a pointer argument sinse we may
2581 static reg_errcode_t
2583 # ifdef RE_ENABLE_I18N
2584 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2585 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2586 # else /* not RE_ENABLE_I18N */
2587 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2588 bracket_elem_t
*end_elem
)
2589 # endif /* not RE_ENABLE_I18N */
2591 unsigned int start_ch
, end_ch
;
2592 /* Equivalence Classes and Character Classes can't be a range start/end. */
2593 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2594 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2598 /* We can handle no multi character collating elements without libc
2600 if (BE ((start_elem
->type
== COLL_SYM
2601 && strlen ((char *) start_elem
->opr
.name
) > 1)
2602 || (end_elem
->type
== COLL_SYM
2603 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2604 return REG_ECOLLATE
;
2606 # ifdef RE_ENABLE_I18N
2611 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2613 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2614 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2616 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2617 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2619 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2620 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2621 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2622 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2623 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2624 return REG_ECOLLATE
;
2625 cmp_buf
[0] = start_wc
;
2626 cmp_buf
[4] = end_wc
;
2627 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2630 /* Got valid collation sequence values, add them as a new entry.
2631 However, for !_LIBC we have no collation elements: if the
2632 character set is single byte, the single byte character set
2633 that we build below suffices. parse_bracket_exp passes
2634 no MBCSET if dfa->mb_cur_max == 1. */
2637 /* Check the space of the arrays. */
2638 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2640 /* There is not enough space, need realloc. */
2641 wchar_t *new_array_start
, *new_array_end
;
2644 /* +1 in case of mbcset->nranges is 0. */
2645 new_nranges
= 2 * mbcset
->nranges
+ 1;
2646 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2647 are NULL if *range_alloc == 0. */
2648 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2650 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2653 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2656 mbcset
->range_starts
= new_array_start
;
2657 mbcset
->range_ends
= new_array_end
;
2658 *range_alloc
= new_nranges
;
2661 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2662 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2665 /* Build the table for single byte characters. */
2666 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2669 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2670 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2671 bitset_set (sbcset
, wc
);
2674 # else /* not RE_ENABLE_I18N */
2677 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2678 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2680 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2681 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2683 if (start_ch
> end_ch
)
2685 /* Build the table for single byte characters. */
2686 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2687 if (start_ch
<= ch
&& ch
<= end_ch
)
2688 bitset_set (sbcset
, ch
);
2690 # endif /* not RE_ENABLE_I18N */
2693 #endif /* not _LIBC */
2696 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2697 Build the collating element which is represented by NAME.
2698 The result are written to MBCSET and SBCSET.
2699 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2700 pointer argument since we may update it. */
2702 static reg_errcode_t
2704 # ifdef RE_ENABLE_I18N
2705 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2706 int *coll_sym_alloc
, const unsigned char *name
)
2707 # else /* not RE_ENABLE_I18N */
2708 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2709 # endif /* not RE_ENABLE_I18N */
2711 size_t name_len
= strlen ((const char *) name
);
2712 if (BE (name_len
!= 1, 0))
2713 return REG_ECOLLATE
;
2716 bitset_set (sbcset
, name
[0]);
2720 #endif /* not _LIBC */
2722 /* This function parse bracket expression like "[abc]", "[a-c]",
2726 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2727 reg_syntax_t syntax
, reg_errcode_t
*err
)
2730 const unsigned char *collseqmb
;
2731 const char *collseqwc
;
2734 const int32_t *symb_table
;
2735 const unsigned char *extra
;
2737 /* Local function for parse_bracket_exp used in _LIBC environement.
2738 Seek the collating symbol entry correspondings to NAME.
2739 Return the index of the symbol in the SYMB_TABLE. */
2742 __attribute ((always_inline
))
2743 seek_collating_symbol_entry (name
, name_len
)
2744 const unsigned char *name
;
2747 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2748 int32_t elem
= hash
% table_size
;
2749 if (symb_table
[2 * elem
] != 0)
2751 int32_t second
= hash
% (table_size
- 2) + 1;
2755 /* First compare the hashing value. */
2756 if (symb_table
[2 * elem
] == hash
2757 /* Compare the length of the name. */
2758 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2759 /* Compare the name. */
2760 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2763 /* Yep, this is the entry. */
2770 while (symb_table
[2 * elem
] != 0);
2775 /* Local function for parse_bracket_exp used in _LIBC environment.
2776 Look up the collation sequence value of BR_ELEM.
2777 Return the value if succeeded, UINT_MAX otherwise. */
2779 auto inline unsigned int
2780 __attribute ((always_inline
))
2781 lookup_collation_sequence_value (br_elem
)
2782 bracket_elem_t
*br_elem
;
2784 if (br_elem
->type
== SB_CHAR
)
2787 if (MB_CUR_MAX == 1)
2790 return collseqmb
[br_elem
->opr
.ch
];
2793 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2794 return __collseq_table_lookup (collseqwc
, wc
);
2797 else if (br_elem
->type
== MB_CHAR
)
2800 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2802 else if (br_elem
->type
== COLL_SYM
)
2804 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2808 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2810 if (symb_table
[2 * elem
] != 0)
2812 /* We found the entry. */
2813 idx
= symb_table
[2 * elem
+ 1];
2814 /* Skip the name of collating element name. */
2815 idx
+= 1 + extra
[idx
];
2816 /* Skip the byte sequence of the collating element. */
2817 idx
+= 1 + extra
[idx
];
2818 /* Adjust for the alignment. */
2819 idx
= (idx
+ 3) & ~3;
2820 /* Skip the multibyte collation sequence value. */
2821 idx
+= sizeof (unsigned int);
2822 /* Skip the wide char sequence of the collating element. */
2823 idx
+= sizeof (unsigned int) *
2824 (1 + *(unsigned int *) (extra
+ idx
));
2825 /* Return the collation sequence value. */
2826 return *(unsigned int *) (extra
+ idx
);
2828 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2830 /* No valid character. Match it as a single byte
2832 return collseqmb
[br_elem
->opr
.name
[0]];
2835 else if (sym_name_len
== 1)
2836 return collseqmb
[br_elem
->opr
.name
[0]];
2841 /* Local function for parse_bracket_exp used in _LIBC environement.
2842 Build the range expression which starts from START_ELEM, and ends
2843 at END_ELEM. The result are written to MBCSET and SBCSET.
2844 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2845 mbcset->range_ends, is a pointer argument sinse we may
2848 auto inline reg_errcode_t
2849 __attribute ((always_inline
))
2850 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2851 re_charset_t
*mbcset
;
2854 bracket_elem_t
*start_elem
, *end_elem
;
2857 uint32_t start_collseq
;
2858 uint32_t end_collseq
;
2860 /* Equivalence Classes and Character Classes can't be a range
2862 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2863 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2867 start_collseq
= lookup_collation_sequence_value (start_elem
);
2868 end_collseq
= lookup_collation_sequence_value (end_elem
);
2869 /* Check start/end collation sequence values. */
2870 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2871 return REG_ECOLLATE
;
2872 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2875 /* Got valid collation sequence values, add them as a new entry.
2876 However, if we have no collation elements, and the character set
2877 is single byte, the single byte character set that we
2878 build below suffices. */
2879 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2881 /* Check the space of the arrays. */
2882 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2884 /* There is not enough space, need realloc. */
2885 uint32_t *new_array_start
;
2886 uint32_t *new_array_end
;
2889 /* +1 in case of mbcset->nranges is 0. */
2890 new_nranges
= 2 * mbcset
->nranges
+ 1;
2891 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2893 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2896 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2899 mbcset
->range_starts
= new_array_start
;
2900 mbcset
->range_ends
= new_array_end
;
2901 *range_alloc
= new_nranges
;
2904 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2905 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2908 /* Build the table for single byte characters. */
2909 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2911 uint32_t ch_collseq
;
2913 if (MB_CUR_MAX == 1)
2916 ch_collseq
= collseqmb
[ch
];
2918 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2919 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2920 bitset_set (sbcset
, ch
);
2925 /* Local function for parse_bracket_exp used in _LIBC environement.
2926 Build the collating element which is represented by NAME.
2927 The result are written to MBCSET and SBCSET.
2928 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2929 pointer argument sinse we may update it. */
2931 auto inline reg_errcode_t
2932 __attribute ((always_inline
))
2933 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2934 re_charset_t
*mbcset
;
2935 int *coll_sym_alloc
;
2937 const unsigned char *name
;
2940 size_t name_len
= strlen ((const char *) name
);
2943 elem
= seek_collating_symbol_entry (name
, name_len
);
2944 if (symb_table
[2 * elem
] != 0)
2946 /* We found the entry. */
2947 idx
= symb_table
[2 * elem
+ 1];
2948 /* Skip the name of collating element name. */
2949 idx
+= 1 + extra
[idx
];
2951 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2953 /* No valid character, treat it as a normal
2955 bitset_set (sbcset
, name
[0]);
2959 return REG_ECOLLATE
;
2961 /* Got valid collation sequence, add it as a new entry. */
2962 /* Check the space of the arrays. */
2963 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2965 /* Not enough, realloc it. */
2966 /* +1 in case of mbcset->ncoll_syms is 0. */
2967 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2968 /* Use realloc since mbcset->coll_syms is NULL
2970 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2971 new_coll_sym_alloc
);
2972 if (BE (new_coll_syms
== NULL
, 0))
2974 mbcset
->coll_syms
= new_coll_syms
;
2975 *coll_sym_alloc
= new_coll_sym_alloc
;
2977 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2982 if (BE (name_len
!= 1, 0))
2983 return REG_ECOLLATE
;
2986 bitset_set (sbcset
, name
[0]);
2993 re_token_t br_token
;
2994 re_bitset_ptr_t sbcset
;
2995 #ifdef RE_ENABLE_I18N
2996 re_charset_t
*mbcset
;
2997 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2998 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2999 #endif /* not RE_ENABLE_I18N */
3001 bin_tree_t
*work_tree
;
3003 int first_round
= 1;
3005 collseqmb
= (const unsigned char *)
3006 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3007 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3013 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3014 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3015 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3016 _NL_COLLATE_SYMB_TABLEMB
);
3017 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3018 _NL_COLLATE_SYMB_EXTRAMB
);
3021 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3022 #ifdef RE_ENABLE_I18N
3023 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3024 #endif /* RE_ENABLE_I18N */
3025 #ifdef RE_ENABLE_I18N
3026 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3028 if (BE (sbcset
== NULL
, 0))
3029 #endif /* RE_ENABLE_I18N */
3035 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3036 if (BE (token
->type
== END_OF_RE
, 0))
3039 goto parse_bracket_exp_free_return
;
3041 if (token
->type
== OP_NON_MATCH_LIST
)
3043 #ifdef RE_ENABLE_I18N
3044 mbcset
->non_match
= 1;
3045 #endif /* not RE_ENABLE_I18N */
3047 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3048 bitset_set (sbcset
, '\n');
3049 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3050 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3051 if (BE (token
->type
== END_OF_RE
, 0))
3054 goto parse_bracket_exp_free_return
;
3058 /* We treat the first ']' as a normal character. */
3059 if (token
->type
== OP_CLOSE_BRACKET
)
3060 token
->type
= CHARACTER
;
3064 bracket_elem_t start_elem
, end_elem
;
3065 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3066 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3068 int token_len2
= 0, is_range_exp
= 0;
3071 start_elem
.opr
.name
= start_name_buf
;
3072 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3073 syntax
, first_round
);
3074 if (BE (ret
!= REG_NOERROR
, 0))
3077 goto parse_bracket_exp_free_return
;
3081 /* Get information about the next token. We need it in any case. */
3082 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3084 /* Do not check for ranges if we know they are not allowed. */
3085 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3087 if (BE (token
->type
== END_OF_RE
, 0))
3090 goto parse_bracket_exp_free_return
;
3092 if (token
->type
== OP_CHARSET_RANGE
)
3094 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3095 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3096 if (BE (token2
.type
== END_OF_RE
, 0))
3099 goto parse_bracket_exp_free_return
;
3101 if (token2
.type
== OP_CLOSE_BRACKET
)
3103 /* We treat the last '-' as a normal character. */
3104 re_string_skip_bytes (regexp
, -token_len
);
3105 token
->type
= CHARACTER
;
3112 if (is_range_exp
== 1)
3114 end_elem
.opr
.name
= end_name_buf
;
3115 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3117 if (BE (ret
!= REG_NOERROR
, 0))
3120 goto parse_bracket_exp_free_return
;
3123 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3126 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3127 &start_elem
, &end_elem
);
3129 # ifdef RE_ENABLE_I18N
3130 *err
= build_range_exp (sbcset
,
3131 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3132 &range_alloc
, &start_elem
, &end_elem
);
3134 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3136 #endif /* RE_ENABLE_I18N */
3137 if (BE (*err
!= REG_NOERROR
, 0))
3138 goto parse_bracket_exp_free_return
;
3142 switch (start_elem
.type
)
3145 bitset_set (sbcset
, start_elem
.opr
.ch
);
3147 #ifdef RE_ENABLE_I18N
3149 /* Check whether the array has enough space. */
3150 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3152 wchar_t *new_mbchars
;
3153 /* Not enough, realloc it. */
3154 /* +1 in case of mbcset->nmbchars is 0. */
3155 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3156 /* Use realloc since array is NULL if *alloc == 0. */
3157 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3159 if (BE (new_mbchars
== NULL
, 0))
3160 goto parse_bracket_exp_espace
;
3161 mbcset
->mbchars
= new_mbchars
;
3163 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3165 #endif /* RE_ENABLE_I18N */
3167 *err
= build_equiv_class (sbcset
,
3168 #ifdef RE_ENABLE_I18N
3169 mbcset
, &equiv_class_alloc
,
3170 #endif /* RE_ENABLE_I18N */
3171 start_elem
.opr
.name
);
3172 if (BE (*err
!= REG_NOERROR
, 0))
3173 goto parse_bracket_exp_free_return
;
3176 *err
= build_collating_symbol (sbcset
,
3177 #ifdef RE_ENABLE_I18N
3178 mbcset
, &coll_sym_alloc
,
3179 #endif /* RE_ENABLE_I18N */
3180 start_elem
.opr
.name
);
3181 if (BE (*err
!= REG_NOERROR
, 0))
3182 goto parse_bracket_exp_free_return
;
3185 *err
= build_charclass (regexp
->trans
, sbcset
,
3186 #ifdef RE_ENABLE_I18N
3187 mbcset
, &char_class_alloc
,
3188 #endif /* RE_ENABLE_I18N */
3189 start_elem
.opr
.name
, syntax
);
3190 if (BE (*err
!= REG_NOERROR
, 0))
3191 goto parse_bracket_exp_free_return
;
3198 if (BE (token
->type
== END_OF_RE
, 0))
3201 goto parse_bracket_exp_free_return
;
3203 if (token
->type
== OP_CLOSE_BRACKET
)
3207 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3209 /* If it is non-matching list. */
3211 bitset_not (sbcset
);
3213 #ifdef RE_ENABLE_I18N
3214 /* Ensure only single byte characters are set. */
3215 if (dfa
->mb_cur_max
> 1)
3216 bitset_mask (sbcset
, dfa
->sb_char
);
3218 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3219 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3220 || mbcset
->non_match
)))
3222 bin_tree_t
*mbc_tree
;
3224 /* Build a tree for complex bracket. */
3225 dfa
->has_mb_node
= 1;
3226 br_token
.type
= COMPLEX_BRACKET
;
3227 br_token
.opr
.mbcset
= mbcset
;
3228 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3229 if (BE (mbc_tree
== NULL
, 0))
3230 goto parse_bracket_exp_espace
;
3231 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3232 if (sbcset
[sbc_idx
])
3234 /* If there are no bits set in sbcset, there is no point
3235 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3236 if (sbc_idx
< BITSET_WORDS
)
3238 /* Build a tree for simple bracket. */
3239 br_token
.type
= SIMPLE_BRACKET
;
3240 br_token
.opr
.sbcset
= sbcset
;
3241 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3242 if (BE (work_tree
== NULL
, 0))
3243 goto parse_bracket_exp_espace
;
3245 /* Then join them by ALT node. */
3246 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3247 if (BE (work_tree
== NULL
, 0))
3248 goto parse_bracket_exp_espace
;
3253 work_tree
= mbc_tree
;
3257 #endif /* not RE_ENABLE_I18N */
3259 #ifdef RE_ENABLE_I18N
3260 free_charset (mbcset
);
3262 /* Build a tree for simple bracket. */
3263 br_token
.type
= SIMPLE_BRACKET
;
3264 br_token
.opr
.sbcset
= sbcset
;
3265 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3266 if (BE (work_tree
== NULL
, 0))
3267 goto parse_bracket_exp_espace
;
3271 parse_bracket_exp_espace
:
3273 parse_bracket_exp_free_return
:
3275 #ifdef RE_ENABLE_I18N
3276 free_charset (mbcset
);
3277 #endif /* RE_ENABLE_I18N */
3281 /* Parse an element in the bracket expression. */
3283 static reg_errcode_t
3284 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3285 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3286 reg_syntax_t syntax
, int accept_hyphen
)
3288 #ifdef RE_ENABLE_I18N
3290 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3291 if (cur_char_size
> 1)
3293 elem
->type
= MB_CHAR
;
3294 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3295 re_string_skip_bytes (regexp
, cur_char_size
);
3298 #endif /* RE_ENABLE_I18N */
3299 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3300 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3301 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3302 return parse_bracket_symbol (elem
, regexp
, token
);
3303 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3305 /* A '-' must only appear as anything but a range indicator before
3306 the closing bracket. Everything else is an error. */
3308 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3309 if (token2
.type
!= OP_CLOSE_BRACKET
)
3310 /* The actual error value is not standardized since this whole
3311 case is undefined. But ERANGE makes good sense. */
3314 elem
->type
= SB_CHAR
;
3315 elem
->opr
.ch
= token
->opr
.c
;
3319 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3320 such as [:<character_class>:], [.<collating_element>.], and
3321 [=<equivalent_class>=]. */
3323 static reg_errcode_t
3324 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3327 unsigned char ch
, delim
= token
->opr
.c
;
3329 if (re_string_eoi(regexp
))
3333 if (i
>= BRACKET_NAME_BUF_SIZE
)
3335 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3336 ch
= re_string_fetch_byte_case (regexp
);
3338 ch
= re_string_fetch_byte (regexp
);
3339 if (re_string_eoi(regexp
))
3341 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3343 elem
->opr
.name
[i
] = ch
;
3345 re_string_skip_bytes (regexp
, 1);
3346 elem
->opr
.name
[i
] = '\0';
3347 switch (token
->type
)
3349 case OP_OPEN_COLL_ELEM
:
3350 elem
->type
= COLL_SYM
;
3352 case OP_OPEN_EQUIV_CLASS
:
3353 elem
->type
= EQUIV_CLASS
;
3355 case OP_OPEN_CHAR_CLASS
:
3356 elem
->type
= CHAR_CLASS
;
3364 /* Helper function for parse_bracket_exp.
3365 Build the equivalence class which is represented by NAME.
3366 The result are written to MBCSET and SBCSET.
3367 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3368 is a pointer argument sinse we may update it. */
3370 static reg_errcode_t
3371 #ifdef RE_ENABLE_I18N
3372 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3373 int *equiv_class_alloc
, const unsigned char *name
)
3374 #else /* not RE_ENABLE_I18N */
3375 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3376 #endif /* not RE_ENABLE_I18N */
3379 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3382 const int32_t *table
, *indirect
;
3383 const unsigned char *weights
, *extra
, *cp
;
3384 unsigned char char_buf
[2];
3388 /* This #include defines a local function! */
3389 # include <locale/weight.h>
3390 /* Calculate the index for equivalence class. */
3392 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3393 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3394 _NL_COLLATE_WEIGHTMB
);
3395 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3396 _NL_COLLATE_EXTRAMB
);
3397 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3398 _NL_COLLATE_INDIRECTMB
);
3399 idx1
= findidx (&cp
);
3400 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3401 /* This isn't a valid character. */
3402 return REG_ECOLLATE
;
3404 /* Build single byte matcing table for this equivalence class. */
3405 char_buf
[1] = (unsigned char) '\0';
3406 len
= weights
[idx1
& 0xffffff];
3407 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3411 idx2
= findidx (&cp
);
3416 /* This isn't a valid character. */
3418 /* Compare only if the length matches and the collation rule
3419 index is the same. */
3420 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3424 while (cnt
<= len
&&
3425 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3426 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3430 bitset_set (sbcset
, ch
);
3433 /* Check whether the array has enough space. */
3434 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3436 /* Not enough, realloc it. */
3437 /* +1 in case of mbcset->nequiv_classes is 0. */
3438 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3439 /* Use realloc since the array is NULL if *alloc == 0. */
3440 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3442 new_equiv_class_alloc
);
3443 if (BE (new_equiv_classes
== NULL
, 0))
3445 mbcset
->equiv_classes
= new_equiv_classes
;
3446 *equiv_class_alloc
= new_equiv_class_alloc
;
3448 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3453 if (BE (strlen ((const char *) name
) != 1, 0))
3454 return REG_ECOLLATE
;
3455 bitset_set (sbcset
, *name
);
3460 /* Helper function for parse_bracket_exp.
3461 Build the character class which is represented by NAME.
3462 The result are written to MBCSET and SBCSET.
3463 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3464 is a pointer argument sinse we may update it. */
3466 static reg_errcode_t
3467 #ifdef RE_ENABLE_I18N
3468 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3469 re_charset_t
*mbcset
, int *char_class_alloc
,
3470 const unsigned char *class_name
, reg_syntax_t syntax
)
3471 #else /* not RE_ENABLE_I18N */
3472 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3473 const unsigned char *class_name
, reg_syntax_t syntax
)
3474 #endif /* not RE_ENABLE_I18N */
3477 const char *name
= (const char *) class_name
;
3479 /* In case of REG_ICASE "upper" and "lower" match the both of
3480 upper and lower cases. */
3481 if ((syntax
& RE_ICASE
)
3482 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3485 #ifdef RE_ENABLE_I18N
3486 /* Check the space of the arrays. */
3487 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3489 /* Not enough, realloc it. */
3490 /* +1 in case of mbcset->nchar_classes is 0. */
3491 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3492 /* Use realloc since array is NULL if *alloc == 0. */
3493 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3494 new_char_class_alloc
);
3495 if (BE (new_char_classes
== NULL
, 0))
3497 mbcset
->char_classes
= new_char_classes
;
3498 *char_class_alloc
= new_char_class_alloc
;
3500 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3501 #endif /* RE_ENABLE_I18N */
3503 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3505 if (BE (trans != NULL, 0)) \
3507 for (i = 0; i < SBC_MAX; ++i) \
3508 if (ctype_func (i)) \
3509 bitset_set (sbcset, trans[i]); \
3513 for (i = 0; i < SBC_MAX; ++i) \
3514 if (ctype_func (i)) \
3515 bitset_set (sbcset, i); \
3519 if (strcmp (name
, "alnum") == 0)
3520 BUILD_CHARCLASS_LOOP (isalnum
);
3521 else if (strcmp (name
, "cntrl") == 0)
3522 BUILD_CHARCLASS_LOOP (iscntrl
);
3523 else if (strcmp (name
, "lower") == 0)
3524 BUILD_CHARCLASS_LOOP (islower
);
3525 else if (strcmp (name
, "space") == 0)
3526 BUILD_CHARCLASS_LOOP (isspace
);
3527 else if (strcmp (name
, "alpha") == 0)
3528 BUILD_CHARCLASS_LOOP (isalpha
);
3529 else if (strcmp (name
, "digit") == 0)
3530 BUILD_CHARCLASS_LOOP (isdigit
);
3531 else if (strcmp (name
, "print") == 0)
3532 BUILD_CHARCLASS_LOOP (isprint
);
3533 else if (strcmp (name
, "upper") == 0)
3534 BUILD_CHARCLASS_LOOP (isupper
);
3535 else if (strcmp (name
, "blank") == 0)
3536 BUILD_CHARCLASS_LOOP (isblank
);
3537 else if (strcmp (name
, "graph") == 0)
3538 BUILD_CHARCLASS_LOOP (isgraph
);
3539 else if (strcmp (name
, "punct") == 0)
3540 BUILD_CHARCLASS_LOOP (ispunct
);
3541 else if (strcmp (name
, "xdigit") == 0)
3542 BUILD_CHARCLASS_LOOP (isxdigit
);
3550 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3551 const unsigned char *class_name
,
3552 const unsigned char *extra
, int non_match
,
3555 re_bitset_ptr_t sbcset
;
3556 #ifdef RE_ENABLE_I18N
3557 re_charset_t
*mbcset
;
3559 #endif /* not RE_ENABLE_I18N */
3561 re_token_t br_token
;
3564 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3565 #ifdef RE_ENABLE_I18N
3566 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3567 #endif /* RE_ENABLE_I18N */
3569 #ifdef RE_ENABLE_I18N
3570 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3571 #else /* not RE_ENABLE_I18N */
3572 if (BE (sbcset
== NULL
, 0))
3573 #endif /* not RE_ENABLE_I18N */
3581 #ifdef RE_ENABLE_I18N
3582 mbcset
->non_match
= 1;
3583 #endif /* not RE_ENABLE_I18N */
3586 /* We don't care the syntax in this case. */
3587 ret
= build_charclass (trans
, sbcset
,
3588 #ifdef RE_ENABLE_I18N
3590 #endif /* RE_ENABLE_I18N */
3593 if (BE (ret
!= REG_NOERROR
, 0))
3596 #ifdef RE_ENABLE_I18N
3597 free_charset (mbcset
);
3598 #endif /* RE_ENABLE_I18N */
3602 /* \w match '_' also. */
3603 for (; *extra
; extra
++)
3604 bitset_set (sbcset
, *extra
);
3606 /* If it is non-matching list. */
3608 bitset_not (sbcset
);
3610 #ifdef RE_ENABLE_I18N
3611 /* Ensure only single byte characters are set. */
3612 if (dfa
->mb_cur_max
> 1)
3613 bitset_mask (sbcset
, dfa
->sb_char
);
3616 /* Build a tree for simple bracket. */
3617 br_token
.type
= SIMPLE_BRACKET
;
3618 br_token
.opr
.sbcset
= sbcset
;
3619 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3620 if (BE (tree
== NULL
, 0))
3621 goto build_word_op_espace
;
3623 #ifdef RE_ENABLE_I18N
3624 if (dfa
->mb_cur_max
> 1)
3626 bin_tree_t
*mbc_tree
;
3627 /* Build a tree for complex bracket. */
3628 br_token
.type
= COMPLEX_BRACKET
;
3629 br_token
.opr
.mbcset
= mbcset
;
3630 dfa
->has_mb_node
= 1;
3631 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3632 if (BE (mbc_tree
== NULL
, 0))
3633 goto build_word_op_espace
;
3634 /* Then join them by ALT node. */
3635 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3636 if (BE (mbc_tree
!= NULL
, 1))
3641 free_charset (mbcset
);
3644 #else /* not RE_ENABLE_I18N */
3646 #endif /* not RE_ENABLE_I18N */
3648 build_word_op_espace
:
3650 #ifdef RE_ENABLE_I18N
3651 free_charset (mbcset
);
3652 #endif /* RE_ENABLE_I18N */
3657 /* This is intended for the expressions like "a{1,3}".
3658 Fetch a number from `input', and return the number.
3659 Return -1, if the number field is empty like "{,1}".
3660 Return -2, If an error is occured. */
3663 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3669 fetch_token (token
, input
, syntax
);
3671 if (BE (token
->type
== END_OF_RE
, 0))
3673 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3675 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3676 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3677 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3682 #ifdef RE_ENABLE_I18N
3684 free_charset (re_charset_t
*cset
)
3686 re_free (cset
->mbchars
);
3688 re_free (cset
->coll_syms
);
3689 re_free (cset
->equiv_classes
);
3690 re_free (cset
->range_starts
);
3691 re_free (cset
->range_ends
);
3693 re_free (cset
->char_classes
);
3696 #endif /* RE_ENABLE_I18N */
3698 /* Functions for binary tree operation. */
3700 /* Create a tree node. */
3703 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3704 re_token_type_t type
)
3708 return create_token_tree (dfa
, left
, right
, &t
);
3712 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3713 const re_token_t
*token
)
3716 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3718 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3720 if (storage
== NULL
)
3722 storage
->next
= dfa
->str_tree_storage
;
3723 dfa
->str_tree_storage
= storage
;
3724 dfa
->str_tree_storage_idx
= 0;
3726 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3728 tree
->parent
= NULL
;
3730 tree
->right
= right
;
3731 tree
->token
= *token
;
3732 tree
->token
.duplicated
= 0;
3733 tree
->token
.opt_subexp
= 0;
3736 tree
->node_idx
= -1;
3739 left
->parent
= tree
;
3741 right
->parent
= tree
;
3745 /* Mark the tree SRC as an optional subexpression.
3746 To be called from preorder or postorder. */
3748 static reg_errcode_t
3749 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3751 int idx
= (int) (long) extra
;
3752 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3753 node
->token
.opt_subexp
= 1;
3758 /* Free the allocated memory inside NODE. */
3761 free_token (re_token_t
*node
)
3763 #ifdef RE_ENABLE_I18N
3764 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3765 free_charset (node
->opr
.mbcset
);
3767 #endif /* RE_ENABLE_I18N */
3768 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3769 re_free (node
->opr
.sbcset
);
3772 /* Worker function for tree walking. Free the allocated memory inside NODE
3773 and its children. */
3775 static reg_errcode_t
3776 free_tree (void *extra
, bin_tree_t
*node
)
3778 free_token (&node
->token
);
3783 /* Duplicate the node SRC, and return new node. This is a preorder
3784 visit similar to the one implemented by the generic visitor, but
3785 we need more infrastructure to maintain two parallel trees --- so,
3786 it's easier to duplicate. */
3789 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3791 const bin_tree_t
*node
;
3792 bin_tree_t
*dup_root
;
3793 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3795 for (node
= root
; ; )
3797 /* Create a new tree and link it back to the current parent. */
3798 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3801 (*p_new
)->parent
= dup_node
;
3802 (*p_new
)->token
.duplicated
= 1;
3805 /* Go to the left node, or up and to the right. */
3809 p_new
= &dup_node
->left
;
3813 const bin_tree_t
*prev
= NULL
;
3814 while (node
->right
== prev
|| node
->right
== NULL
)
3817 node
= node
->parent
;
3818 dup_node
= dup_node
->parent
;
3823 p_new
= &dup_node
->right
;