1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
50 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 Idx nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 Idx
*equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 Idx
*char_class_alloc
,
98 const unsigned char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const unsigned char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 const unsigned char *extra
,
112 bool non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid
[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx
[] =
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
215 re_compile_pattern (pattern
, length
, bufp
)
218 struct re_pattern_buffer
*bufp
;
219 #else /* size_t might promote */
221 re_compile_pattern (const char *pattern
, size_t length
,
222 struct re_pattern_buffer
*bufp
)
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
232 /* Match anchors at newline. */
233 bufp
->newline_anchor
= 1;
235 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
239 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
242 weak_alias (__re_compile_pattern
, re_compile_pattern
)
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options
;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
261 re_set_syntax (syntax
)
264 reg_syntax_t ret
= re_syntax_options
;
266 re_syntax_options
= syntax
;
270 weak_alias (__re_set_syntax
, re_set_syntax
)
274 re_compile_fastmap (bufp
)
275 struct re_pattern_buffer
*bufp
;
277 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
278 char *fastmap
= bufp
->fastmap
;
280 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
281 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
282 if (dfa
->init_state
!= dfa
->init_state_word
)
283 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
284 if (dfa
->init_state
!= dfa
->init_state_nl
)
285 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
286 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
287 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
288 bufp
->fastmap_accurate
= 1;
292 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
296 __attribute ((always_inline
))
297 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
301 fastmap
[tolower (ch
)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
308 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
311 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
313 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
314 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
316 Idx node
= init_state
->nodes
.elems
[node_cnt
];
317 re_token_type_t type
= dfa
->nodes
[node
].type
;
319 if (type
== CHARACTER
)
321 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
325 unsigned char buf
[MB_LEN_MAX
];
331 *p
++ = dfa
->nodes
[node
].opr
.c
;
332 while (++node
< dfa
->nodes_len
333 && dfa
->nodes
[node
].type
== CHARACTER
334 && dfa
->nodes
[node
].mb_partial
)
335 *p
++ = dfa
->nodes
[node
].opr
.c
;
336 memset (&state
, '\0', sizeof (state
));
337 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
339 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
341 re_set_fastmap (fastmap
, false, buf
[0]);
345 else if (type
== SIMPLE_BRACKET
)
348 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
351 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
352 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
353 if (w
& ((bitset_word_t
) 1 << j
))
354 re_set_fastmap (fastmap
, icase
, ch
);
357 #ifdef RE_ENABLE_I18N
358 else if (type
== COMPLEX_BRACKET
)
360 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
364 /* See if we have to try all bytes which start multiple collation
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
371 && (cset
->ncoll_syms
|| cset
->nranges
))
373 const int32_t *table
= (const int32_t *)
374 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
375 for (i
= 0; i
< SBC_MAX
; ++i
)
377 re_set_fastmap (fastmap
, icase
, i
);
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa
->mb_cur_max
> 1
386 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
388 || cset
->nequiv_classes
396 memset (&mbs
, 0, sizeof (mbs
));
397 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
398 re_set_fastmap (fastmap
, false, (int) c
);
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i
= 0; i
< cset
->nmbchars
; ++i
)
410 memset (&state
, '\0', sizeof (state
));
411 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
412 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
413 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
415 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
417 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
422 #endif /* RE_ENABLE_I18N */
423 else if (type
== OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type
== OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type
== END_OF_RE
)
429 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
430 if (type
== END_OF_RE
)
431 bufp
->can_be_null
= 1;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg
, pattern
, cflags
)
475 regex_t
*_Restrict_ preg
;
476 const char *_Restrict_ pattern
;
480 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC
);
487 /* Try to allocate space for the fastmap. */
488 preg
->fastmap
= re_malloc (char, SBC_MAX
);
489 if (BE (preg
->fastmap
== NULL
, 0))
492 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags
& REG_NEWLINE
)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax
&= ~RE_DOT_NEWLINE
;
498 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
499 /* It also changes the matching behavior. */
500 preg
->newline_anchor
= 1;
503 preg
->newline_anchor
= 0;
504 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
505 preg
->translate
= NULL
;
507 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret
== REG_ERPAREN
)
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret
== REG_NOERROR
, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg
);
521 /* Some error occurred while compiling the expression. */
522 re_free (preg
->fastmap
);
523 preg
->fastmap
= NULL
;
529 weak_alias (__regcomp
, regcomp
)
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode
, preg
, errbuf
, errbuf_size
)
539 const regex_t
*_Restrict_ preg
;
540 char *_Restrict_ errbuf
;
542 #else /* size_t might promote */
544 regerror (int errcode
, const regex_t
*_Restrict_ preg
,
545 char *_Restrict_ errbuf
, size_t errbuf_size
)
552 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
553 / sizeof (__re_error_msgid_idx
[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
560 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
562 msg_size
= strlen (msg
) + 1; /* Includes the null. */
564 if (BE (errbuf_size
!= 0, 1))
566 size_t cpy_size
= msg_size
;
567 if (BE (msg_size
> errbuf_size
, 0))
569 cpy_size
= errbuf_size
- 1;
570 errbuf
[cpy_size
] = '\0';
572 memcpy (errbuf
, msg
, cpy_size
);
578 weak_alias (__regerror
, regerror
)
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map
=
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
602 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
608 free_dfa_content (re_dfa_t
*dfa
)
613 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
614 free_token (dfa
->nodes
+ i
);
615 re_free (dfa
->nexts
);
616 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
618 if (dfa
->eclosures
!= NULL
)
619 re_node_set_free (dfa
->eclosures
+ i
);
620 if (dfa
->inveclosures
!= NULL
)
621 re_node_set_free (dfa
->inveclosures
+ i
);
622 if (dfa
->edests
!= NULL
)
623 re_node_set_free (dfa
->edests
+ i
);
625 re_free (dfa
->edests
);
626 re_free (dfa
->eclosures
);
627 re_free (dfa
->inveclosures
);
628 re_free (dfa
->nodes
);
630 if (dfa
->state_table
)
631 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
633 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
634 for (j
= 0; j
< entry
->num
; ++j
)
636 re_dfastate_t
*state
= entry
->array
[j
];
639 re_free (entry
->array
);
641 re_free (dfa
->state_table
);
642 #ifdef RE_ENABLE_I18N
643 if (dfa
->sb_char
!= utf8_sb_map
)
644 re_free (dfa
->sb_char
);
646 re_free (dfa
->subexp_map
);
648 re_free (dfa
->re_str
);
655 /* Free dynamically allocated space used by PREG. */
661 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
662 if (BE (dfa
!= NULL
, 1))
663 free_dfa_content (dfa
);
667 re_free (preg
->fastmap
);
668 preg
->fastmap
= NULL
;
670 re_free (preg
->translate
);
671 preg
->translate
= NULL
;
674 weak_alias (__regfree
, regfree
)
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf
;
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
700 if (!re_comp_buf
.buffer
)
701 return gettext ("No previous regular expression");
705 if (re_comp_buf
.buffer
)
707 fastmap
= re_comp_buf
.fastmap
;
708 re_comp_buf
.fastmap
= NULL
;
709 __regfree (&re_comp_buf
);
710 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
711 re_comp_buf
.fastmap
= fastmap
;
714 if (re_comp_buf
.fastmap
== NULL
)
716 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
717 if (re_comp_buf
.fastmap
== NULL
)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf
.newline_anchor
= 1;
728 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
738 libc_freeres_fn (free_mem
)
740 __regfree (&re_comp_buf
);
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
751 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
754 reg_errcode_t err
= REG_NOERROR
;
758 /* Initialize the pattern buffer. */
759 preg
->fastmap_accurate
= 0;
760 preg
->syntax
= syntax
;
761 preg
->not_bol
= preg
->not_eol
= 0;
764 preg
->can_be_null
= 0;
765 preg
->regs_allocated
= REGS_UNALLOCATED
;
767 /* Initialize the dfa. */
768 dfa
= (re_dfa_t
*) preg
->buffer
;
769 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
778 preg
->allocated
= sizeof (re_dfa_t
);
779 preg
->buffer
= (unsigned char *) dfa
;
781 preg
->used
= sizeof (re_dfa_t
);
783 err
= init_dfa (dfa
, length
);
784 if (BE (err
!= REG_NOERROR
, 0))
786 free_dfa_content (dfa
);
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa
->re_str
= re_malloc (char, length
+ 1);
794 strncpy (dfa
->re_str
, pattern
, length
+ 1);
797 __libc_lock_init (dfa
->lock
);
799 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
800 (syntax
& RE_ICASE
) != 0, dfa
);
801 if (BE (err
!= REG_NOERROR
, 0))
803 re_compile_internal_free_return
:
804 free_workarea_compile (preg
);
805 re_string_destruct (®exp
);
806 free_dfa_content (dfa
);
812 /* Parse the regular expression, and build a structure tree. */
814 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
815 if (BE (dfa
->str_tree
== NULL
, 0))
816 goto re_compile_internal_free_return
;
818 /* Analyze the tree and create the nfa. */
819 err
= analyze (preg
);
820 if (BE (err
!= REG_NOERROR
, 0))
821 goto re_compile_internal_free_return
;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
829 /* Then create the initial state of the dfa. */
830 err
= create_initial_state (dfa
);
832 /* Release work areas. */
833 free_workarea_compile (preg
);
834 re_string_destruct (®exp
);
836 if (BE (err
!= REG_NOERROR
, 0))
838 free_dfa_content (dfa
);
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
850 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
852 __re_size_t table_size
;
856 #ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
859 size_t max_i18n_object_size
= 0;
861 size_t max_object_size
=
862 MAX (sizeof (struct re_state_table_entry
),
863 MAX (sizeof (re_token_t
),
864 MAX (sizeof (re_node_set
),
865 MAX (sizeof (regmatch_t
),
866 max_i18n_object_size
))));
868 memset (dfa
, '\0', sizeof (re_dfa_t
));
870 /* Force allocation of str_tree_storage the first time. */
871 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
873 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
874 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX
/ max_object_size
/ 2 <= pat_len
, 0))
880 dfa
->nodes_alloc
= pat_len
+ 1;
881 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
883 /* table_size = 2 ^ ceil(log pat_len) */
884 for (table_size
= 1; ; table_size
<<= 1)
885 if (table_size
> pat_len
)
888 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
889 dfa
->state_hash_mask
= table_size
- 1;
891 dfa
->mb_cur_max
= MB_CUR_MAX
;
893 if (dfa
->mb_cur_max
== 6
894 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
896 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
899 codeset_name
= nl_langinfo (CODESET
);
900 if (strcasecmp (codeset_name
, "UTF-8") == 0
901 || strcasecmp (codeset_name
, "UTF8") == 0)
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa
->map_notascii
= 0;
909 #ifdef RE_ENABLE_I18N
910 if (dfa
->mb_cur_max
> 1)
913 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
918 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
919 if (BE (dfa
->sb_char
== NULL
, 0))
922 /* Set the bits corresponding to single byte chars. */
923 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
924 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
926 wint_t wch
= __btowc (ch
);
928 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
930 if (isascii (ch
) && wch
!= ch
)
931 dfa
->map_notascii
= 1;
938 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
943 /* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
949 init_word_char (re_dfa_t
*dfa
)
952 dfa
->word_ops_used
= 1;
953 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
954 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
955 if (isalnum (ch
) || ch
== '_')
956 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
959 /* Free the work area which are only used while compiling. */
962 free_workarea_compile (regex_t
*preg
)
964 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
965 bin_tree_storage_t
*storage
, *next
;
966 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
968 next
= storage
->next
;
971 dfa
->str_tree_storage
= NULL
;
972 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
973 dfa
->str_tree
= NULL
;
974 re_free (dfa
->org_indices
);
975 dfa
->org_indices
= NULL
;
978 /* Create initial states for all contexts. */
981 create_initial_state (re_dfa_t
*dfa
)
985 re_node_set init_nodes
;
987 /* Initial states have the epsilon closure of the node which is
988 the first node of the regular expression. */
989 first
= dfa
->str_tree
->first
->node_idx
;
990 dfa
->init_node
= first
;
991 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
992 if (BE (err
!= REG_NOERROR
, 0))
995 /* The back-references which are in initial states can epsilon transit,
996 since in this case all of the subexpressions can be null.
997 Then we add epsilon closures of the nodes which are the next nodes of
998 the back-references. */
999 if (dfa
->nbackref
> 0)
1000 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1002 Idx node_idx
= init_nodes
.elems
[i
];
1003 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1006 if (type
!= OP_BACK_REF
)
1008 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1010 re_token_t
*clexp_node
;
1011 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1012 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1013 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1016 if (clexp_idx
== init_nodes
.nelem
)
1019 if (type
== OP_BACK_REF
)
1021 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1022 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1024 reg_errcode_t merge_err
1025 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1026 if (merge_err
!= REG_NOERROR
)
1033 /* It must be the first time to invoke acquire_state. */
1034 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1035 /* We don't check ERR here, since the initial state must not be NULL. */
1036 if (BE (dfa
->init_state
== NULL
, 0))
1038 if (dfa
->init_state
->has_constraint
)
1040 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1042 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1044 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1048 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1049 || dfa
->init_state_begbuf
== NULL
, 0))
1053 dfa
->init_state_word
= dfa
->init_state_nl
1054 = dfa
->init_state_begbuf
= dfa
->init_state
;
1056 re_node_set_free (&init_nodes
);
1060 #ifdef RE_ENABLE_I18N
1061 /* If it is possible to do searching in single byte encoding instead of UTF-8
1062 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063 DFA nodes where needed. */
1066 optimize_utf8 (re_dfa_t
*dfa
)
1070 bool mb_chars
= false;
1071 bool has_period
= false;
1073 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1074 switch (dfa
->nodes
[node
].type
)
1077 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1081 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1089 /* Word anchors etc. cannot be handled. It's okay to test
1090 opr.ctx_type since constraints (for all DFA nodes) are
1091 created by ORing one or more opr.ctx_type values. */
1101 case OP_DUP_ASTERISK
:
1102 case OP_OPEN_SUBEXP
:
1103 case OP_CLOSE_SUBEXP
:
1105 case COMPLEX_BRACKET
:
1107 case SIMPLE_BRACKET
:
1108 /* Just double check. */
1110 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1112 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1113 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1115 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1125 if (mb_chars
|| has_period
)
1126 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1128 if (dfa
->nodes
[node
].type
== CHARACTER
1129 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1130 dfa
->nodes
[node
].mb_partial
= 0;
1131 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1132 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1135 /* The search can be in single byte locale. */
1136 dfa
->mb_cur_max
= 1;
1138 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143 "eclosure", and "inveclosure". */
1145 static reg_errcode_t
1146 analyze (regex_t
*preg
)
1148 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1151 /* Allocate arrays. */
1152 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1153 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1154 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1155 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1156 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1157 || dfa
->eclosures
== NULL
, 0))
1160 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1161 if (dfa
->subexp_map
!= NULL
)
1164 for (i
= 0; i
< preg
->re_nsub
; i
++)
1165 dfa
->subexp_map
[i
] = i
;
1166 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1167 for (i
= 0; i
< preg
->re_nsub
; i
++)
1168 if (dfa
->subexp_map
[i
] != i
)
1170 if (i
== preg
->re_nsub
)
1172 free (dfa
->subexp_map
);
1173 dfa
->subexp_map
= NULL
;
1177 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1178 if (BE (ret
!= REG_NOERROR
, 0))
1180 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1181 if (BE (ret
!= REG_NOERROR
, 0))
1183 preorder (dfa
->str_tree
, calc_next
, dfa
);
1184 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1185 if (BE (ret
!= REG_NOERROR
, 0))
1187 ret
= calc_eclosure (dfa
);
1188 if (BE (ret
!= REG_NOERROR
, 0))
1191 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1196 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1197 if (BE (dfa
->inveclosures
== NULL
, 0))
1199 ret
= calc_inveclosure (dfa
);
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206 implement parse tree visits. Instead, we use parent pointers and
1207 some hairy code in these two functions. */
1208 static reg_errcode_t
1209 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1212 bin_tree_t
*node
, *prev
;
1214 for (node
= root
; ; )
1216 /* Descend down the tree, preferably to the left (or to the right
1217 if that's the only child). */
1218 while (node
->left
|| node
->right
)
1226 reg_errcode_t err
= fn (extra
, node
);
1227 if (BE (err
!= REG_NOERROR
, 0))
1229 if (node
->parent
== NULL
)
1232 node
= node
->parent
;
1234 /* Go up while we have a node that is reached from the right. */
1235 while (node
->right
== prev
|| node
->right
== NULL
);
1240 static reg_errcode_t
1241 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1246 for (node
= root
; ; )
1248 reg_errcode_t err
= fn (extra
, node
);
1249 if (BE (err
!= REG_NOERROR
, 0))
1252 /* Go to the left node, or up and to the right. */
1257 bin_tree_t
*prev
= NULL
;
1258 while (node
->right
== prev
|| node
->right
== NULL
)
1261 node
= node
->parent
;
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272 backreferences as well. Requires a preorder visit. */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra
, bin_tree_t
*node
)
1276 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1278 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1280 int idx
= node
->token
.opr
.idx
;
1281 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1282 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1285 else if (node
->token
.type
== SUBEXP
1286 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1288 Idx other_idx
= node
->left
->token
.opr
.idx
;
1290 node
->left
= node
->left
->left
;
1292 node
->left
->parent
= node
;
1294 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1295 if (other_idx
< BITSET_WORD_BITS
)
1296 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304 static reg_errcode_t
1305 lower_subexps (void *extra
, bin_tree_t
*node
)
1307 regex_t
*preg
= (regex_t
*) extra
;
1308 reg_errcode_t err
= REG_NOERROR
;
1310 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1312 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1314 node
->left
->parent
= node
;
1316 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1318 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1320 node
->right
->parent
= node
;
1327 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1329 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1330 bin_tree_t
*body
= node
->left
;
1331 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1334 /* We do not optimize empty subexpressions, because otherwise we may
1335 have bad CONCAT nodes with NULL children. This is obviously not
1336 very common, so we do not lose much. An example that triggers
1337 this case is the sed "script" /\(\)/x. */
1338 && node
->left
!= NULL
1339 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1340 || !(dfa
->used_bkref_map
1341 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1347 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1348 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1349 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1350 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1356 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1357 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363 static reg_errcode_t
1364 calc_first (void *extra
, bin_tree_t
*node
)
1366 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1367 if (node
->token
.type
== CONCAT
)
1369 node
->first
= node
->left
->first
;
1370 node
->node_idx
= node
->left
->node_idx
;
1375 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1376 if (BE (node
->node_idx
== REG_MISSING
, 0))
1378 if (node
->token
.type
== ANCHOR
)
1379 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1384 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1385 static reg_errcode_t
1386 calc_next (void *extra
, bin_tree_t
*node
)
1388 switch (node
->token
.type
)
1390 case OP_DUP_ASTERISK
:
1391 node
->left
->next
= node
;
1394 node
->left
->next
= node
->right
->first
;
1395 node
->right
->next
= node
->next
;
1399 node
->left
->next
= node
->next
;
1401 node
->right
->next
= node
->next
;
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1411 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1412 Idx idx
= node
->node_idx
;
1413 reg_errcode_t err
= REG_NOERROR
;
1415 switch (node
->token
.type
)
1421 assert (node
->next
== NULL
);
1424 case OP_DUP_ASTERISK
:
1428 dfa
->has_plural_match
= 1;
1429 if (node
->left
!= NULL
)
1430 left
= node
->left
->first
->node_idx
;
1432 left
= node
->next
->node_idx
;
1433 if (node
->right
!= NULL
)
1434 right
= node
->right
->first
->node_idx
;
1436 right
= node
->next
->node_idx
;
1437 assert (REG_VALID_INDEX (left
));
1438 assert (REG_VALID_INDEX (right
));
1439 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1444 case OP_OPEN_SUBEXP
:
1445 case OP_CLOSE_SUBEXP
:
1446 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1450 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1451 if (node
->token
.type
== OP_BACK_REF
)
1452 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1456 assert (!IS_EPSILON_NODE (node
->token
.type
));
1457 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466 to their own constraint. */
1468 static reg_errcode_t
1470 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1471 Idx root_node
, unsigned int init_constraint
)
1473 Idx org_node
, clone_node
;
1475 unsigned int constraint
= init_constraint
;
1476 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1478 Idx org_dest
, clone_dest
;
1479 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1481 /* If the back reference epsilon-transit, its destination must
1482 also have the constraint. Then duplicate the epsilon closure
1483 of the destination of the back reference, and store it in
1484 edests of the back reference. */
1485 org_dest
= dfa
->nexts
[org_node
];
1486 re_node_set_empty (dfa
->edests
+ clone_node
);
1487 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1488 if (BE (clone_dest
== REG_MISSING
, 0))
1490 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1491 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1495 else if (dfa
->edests
[org_node
].nelem
== 0)
1497 /* In case of the node can't epsilon-transit, don't duplicate the
1498 destination and store the original destination as the
1499 destination of the node. */
1500 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1503 else if (dfa
->edests
[org_node
].nelem
== 1)
1505 /* In case of the node can epsilon-transit, and it has only one
1507 org_dest
= dfa
->edests
[org_node
].elems
[0];
1508 re_node_set_empty (dfa
->edests
+ clone_node
);
1509 /* If the node is root_node itself, it means the epsilon closure
1510 has a loop. Then tie it to the destination of the root_node. */
1511 if (org_node
== root_node
&& clone_node
!= org_node
)
1513 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1518 /* In case the node has another constraint, append it. */
1519 constraint
|= dfa
->nodes
[org_node
].constraint
;
1520 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1521 if (BE (clone_dest
== REG_MISSING
, 0))
1523 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1527 else /* dfa->edests[org_node].nelem == 2 */
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest
= dfa
->edests
[org_node
].elems
[0];
1532 re_node_set_empty (dfa
->edests
+ clone_node
);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1535 if (clone_dest
== REG_MISSING
)
1537 /* There is no such duplicated node, create a new one. */
1539 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1540 if (BE (clone_dest
== REG_MISSING
, 0))
1542 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1545 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1546 root_node
, constraint
);
1547 if (BE (err
!= REG_NOERROR
, 0))
1552 /* There is a duplicated node which satisfies the constraint,
1553 use it to avoid infinite loop. */
1554 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1559 org_dest
= dfa
->edests
[org_node
].elems
[1];
1560 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1561 if (BE (clone_dest
== REG_MISSING
, 0))
1563 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1567 org_node
= org_dest
;
1568 clone_node
= clone_dest
;
1573 /* Search for a node which is duplicated from the node ORG_NODE, and
1574 satisfies the constraint CONSTRAINT. */
1577 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1578 unsigned int constraint
)
1581 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1583 if (org_node
== dfa
->org_indices
[idx
]
1584 && constraint
== dfa
->nodes
[idx
].constraint
)
1585 return idx
; /* Found. */
1587 return REG_MISSING
; /* Not found. */
1590 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591 Return the index of the new node, or REG_MISSING if insufficient storage is
1595 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1597 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1598 if (BE (dup_idx
!= REG_MISSING
, 1))
1600 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1601 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1602 dfa
->nodes
[dup_idx
].duplicated
= 1;
1604 /* Store the index of the original node. */
1605 dfa
->org_indices
[dup_idx
] = org_idx
;
1610 static reg_errcode_t
1611 calc_inveclosure (re_dfa_t
*dfa
)
1615 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1616 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1618 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1620 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1621 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1623 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t
*dfa
)
1640 assert (dfa
->nodes_len
> 0);
1643 /* For each nodes, calculate epsilon closure. */
1644 for (node_idx
= 0; ; ++node_idx
)
1647 re_node_set eclosure_elem
;
1648 if (node_idx
== dfa
->nodes_len
)
1657 assert (dfa
->eclosures
[node_idx
].nelem
!= REG_MISSING
);
1660 /* If we have already calculated, skip it. */
1661 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1665 if (BE (err
!= REG_NOERROR
, 0))
1668 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1671 re_node_set_free (&eclosure_elem
);
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1684 re_node_set eclosure
;
1686 bool incomplete
= false;
1687 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1688 if (BE (err
!= REG_NOERROR
, 0))
1691 /* This indicates that we are calculating this node now.
1692 We reference this value to avoid infinite loop. */
1693 dfa
->eclosures
[node
].nelem
= REG_MISSING
;
1695 /* If the current node has constraints, duplicate all nodes
1696 since they must inherit the constraints. */
1697 if (dfa
->nodes
[node
].constraint
1698 && dfa
->edests
[node
].nelem
1699 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1701 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1702 dfa
->nodes
[node
].constraint
);
1703 if (BE (err
!= REG_NOERROR
, 0))
1707 /* Expand each epsilon destination nodes. */
1708 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1709 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1711 re_node_set eclosure_elem
;
1712 Idx edest
= dfa
->edests
[node
].elems
[i
];
1713 /* If calculating the epsilon closure of `edest' is in progress,
1714 return intermediate result. */
1715 if (dfa
->eclosures
[edest
].nelem
== REG_MISSING
)
1720 /* If we haven't calculated the epsilon closure of `edest' yet,
1721 calculate now. Otherwise use calculated epsilon closure. */
1722 if (dfa
->eclosures
[edest
].nelem
== 0)
1724 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1725 if (BE (err
!= REG_NOERROR
, 0))
1729 eclosure_elem
= dfa
->eclosures
[edest
];
1730 /* Merge the epsilon closure of `edest'. */
1731 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1732 if (BE (err
!= REG_NOERROR
, 0))
1734 /* If the epsilon closure of `edest' is incomplete,
1735 the epsilon closure of this node is also incomplete. */
1736 if (dfa
->eclosures
[edest
].nelem
== 0)
1739 re_node_set_free (&eclosure_elem
);
1743 /* An epsilon closure includes itself. */
1744 ok
= re_node_set_insert (&eclosure
, node
);
1747 if (incomplete
&& !root
)
1748 dfa
->eclosures
[node
].nelem
= 0;
1750 dfa
->eclosures
[node
] = eclosure
;
1751 *new_set
= eclosure
;
1755 /* Functions for token which are used in the parser. */
1757 /* Fetch a token from INPUT.
1758 We must not use this function inside bracket expressions. */
1762 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1764 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1767 /* Peek a token from INPUT, and return the length of the token.
1768 We must not use this function inside bracket expressions. */
1772 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1776 if (re_string_eoi (input
))
1778 token
->type
= END_OF_RE
;
1782 c
= re_string_peek_byte (input
, 0);
1785 token
->word_char
= 0;
1786 #ifdef RE_ENABLE_I18N
1787 token
->mb_partial
= 0;
1788 if (input
->mb_cur_max
> 1 &&
1789 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1791 token
->type
= CHARACTER
;
1792 token
->mb_partial
= 1;
1799 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1801 token
->type
= BACK_SLASH
;
1805 c2
= re_string_peek_byte_case (input
, 1);
1807 token
->type
= CHARACTER
;
1808 #ifdef RE_ENABLE_I18N
1809 if (input
->mb_cur_max
> 1)
1811 wint_t wc
= re_string_wchar_at (input
,
1812 re_string_cur_idx (input
) + 1);
1813 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1817 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1822 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1823 token
->type
= OP_ALT
;
1825 case '1': case '2': case '3': case '4': case '5':
1826 case '6': case '7': case '8': case '9':
1827 if (!(syntax
& RE_NO_BK_REFS
))
1829 token
->type
= OP_BACK_REF
;
1830 token
->opr
.idx
= c2
- '1';
1834 if (!(syntax
& RE_NO_GNU_OPS
))
1836 token
->type
= ANCHOR
;
1837 token
->opr
.ctx_type
= WORD_FIRST
;
1841 if (!(syntax
& RE_NO_GNU_OPS
))
1843 token
->type
= ANCHOR
;
1844 token
->opr
.ctx_type
= WORD_LAST
;
1848 if (!(syntax
& RE_NO_GNU_OPS
))
1850 token
->type
= ANCHOR
;
1851 token
->opr
.ctx_type
= WORD_DELIM
;
1855 if (!(syntax
& RE_NO_GNU_OPS
))
1857 token
->type
= ANCHOR
;
1858 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1862 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= OP_WORD
;
1866 if (!(syntax
& RE_NO_GNU_OPS
))
1867 token
->type
= OP_NOTWORD
;
1870 if (!(syntax
& RE_NO_GNU_OPS
))
1871 token
->type
= OP_SPACE
;
1874 if (!(syntax
& RE_NO_GNU_OPS
))
1875 token
->type
= OP_NOTSPACE
;
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1880 token
->type
= ANCHOR
;
1881 token
->opr
.ctx_type
= BUF_FIRST
;
1885 if (!(syntax
& RE_NO_GNU_OPS
))
1887 token
->type
= ANCHOR
;
1888 token
->opr
.ctx_type
= BUF_LAST
;
1892 if (!(syntax
& RE_NO_BK_PARENS
))
1893 token
->type
= OP_OPEN_SUBEXP
;
1896 if (!(syntax
& RE_NO_BK_PARENS
))
1897 token
->type
= OP_CLOSE_SUBEXP
;
1900 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1901 token
->type
= OP_DUP_PLUS
;
1904 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1905 token
->type
= OP_DUP_QUESTION
;
1908 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1909 token
->type
= OP_OPEN_DUP_NUM
;
1912 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1913 token
->type
= OP_CLOSE_DUP_NUM
;
1921 token
->type
= CHARACTER
;
1922 #ifdef RE_ENABLE_I18N
1923 if (input
->mb_cur_max
> 1)
1925 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1926 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1930 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1935 if (syntax
& RE_NEWLINE_ALT
)
1936 token
->type
= OP_ALT
;
1939 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1940 token
->type
= OP_ALT
;
1943 token
->type
= OP_DUP_ASTERISK
;
1946 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1947 token
->type
= OP_DUP_PLUS
;
1950 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1951 token
->type
= OP_DUP_QUESTION
;
1954 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1955 token
->type
= OP_OPEN_DUP_NUM
;
1958 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1959 token
->type
= OP_CLOSE_DUP_NUM
;
1962 if (syntax
& RE_NO_BK_PARENS
)
1963 token
->type
= OP_OPEN_SUBEXP
;
1966 if (syntax
& RE_NO_BK_PARENS
)
1967 token
->type
= OP_CLOSE_SUBEXP
;
1970 token
->type
= OP_OPEN_BRACKET
;
1973 token
->type
= OP_PERIOD
;
1976 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1977 re_string_cur_idx (input
) != 0)
1979 char prev
= re_string_peek_byte (input
, -1);
1980 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1983 token
->type
= ANCHOR
;
1984 token
->opr
.ctx_type
= LINE_FIRST
;
1987 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1988 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1991 re_string_skip_bytes (input
, 1);
1992 peek_token (&next
, input
, syntax
);
1993 re_string_skip_bytes (input
, -1);
1994 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1997 token
->type
= ANCHOR
;
1998 token
->opr
.ctx_type
= LINE_LAST
;
2006 /* Peek a token from INPUT, and return the length of the token.
2007 We must not use this function out of bracket expressions. */
2011 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2014 if (re_string_eoi (input
))
2016 token
->type
= END_OF_RE
;
2019 c
= re_string_peek_byte (input
, 0);
2022 #ifdef RE_ENABLE_I18N
2023 if (input
->mb_cur_max
> 1 &&
2024 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2026 token
->type
= CHARACTER
;
2029 #endif /* RE_ENABLE_I18N */
2031 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2032 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2034 /* In this case, '\' escape a character. */
2036 re_string_skip_bytes (input
, 1);
2037 c2
= re_string_peek_byte (input
, 0);
2039 token
->type
= CHARACTER
;
2042 if (c
== '[') /* '[' is a special char in a bracket exps. */
2046 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2047 c2
= re_string_peek_byte (input
, 1);
2055 token
->type
= OP_OPEN_COLL_ELEM
;
2058 token
->type
= OP_OPEN_EQUIV_CLASS
;
2061 if (syntax
& RE_CHAR_CLASSES
)
2063 token
->type
= OP_OPEN_CHAR_CLASS
;
2066 /* else fall through. */
2068 token
->type
= CHARACTER
;
2078 token
->type
= OP_CHARSET_RANGE
;
2081 token
->type
= OP_CLOSE_BRACKET
;
2084 token
->type
= OP_NON_MATCH_LIST
;
2087 token
->type
= CHARACTER
;
2092 /* Functions for parser. */
2094 /* Entry point of the parser.
2095 Parse the regular expression REGEXP and return the structure tree.
2096 If an error is occured, ERR is set by error code, and return NULL.
2097 This function build the following tree, from regular expression <reg_exp>:
2103 CAT means concatenation.
2104 EOR means end of regular expression. */
2107 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2110 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2111 bin_tree_t
*tree
, *eor
, *root
;
2112 re_token_t current_token
;
2113 dfa
->syntax
= syntax
;
2114 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2115 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2116 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2118 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2120 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2123 if (BE (eor
== NULL
|| root
== NULL
, 0))
2131 /* This function build the following tree, from regular expression
2132 <branch1>|<branch2>:
2138 ALT means alternative, which represents the operator `|'. */
2141 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2142 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2144 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2145 bin_tree_t
*tree
, *branch
= NULL
;
2146 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2147 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2150 while (token
->type
== OP_ALT
)
2152 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2153 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2154 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2156 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2157 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2162 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2163 if (BE (tree
== NULL
, 0))
2172 /* This function build the following tree, from regular expression
2179 CAT means concatenation. */
2182 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2183 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2185 bin_tree_t
*tree
, *expr
;
2186 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2187 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2188 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2191 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2192 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2194 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2195 if (BE (*err
!= REG_NOERROR
&& expr
== NULL
, 0))
2199 if (tree
!= NULL
&& expr
!= NULL
)
2201 tree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2208 else if (tree
== NULL
)
2210 /* Otherwise expr == NULL, we don't need to create new tree. */
2215 /* This function build the following tree, from regular expression a*:
2222 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2223 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2225 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2227 switch (token
->type
)
2230 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2231 if (BE (tree
== NULL
, 0))
2236 #ifdef RE_ENABLE_I18N
2237 if (dfa
->mb_cur_max
> 1)
2239 while (!re_string_eoi (regexp
)
2240 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2242 bin_tree_t
*mbc_remain
;
2243 fetch_token (token
, regexp
, syntax
);
2244 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2245 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2246 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2255 case OP_OPEN_SUBEXP
:
2256 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2257 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2260 case OP_OPEN_BRACKET
:
2261 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2262 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2266 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2271 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2272 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2273 if (BE (tree
== NULL
, 0))
2279 dfa
->has_mb_node
= 1;
2281 case OP_OPEN_DUP_NUM
:
2282 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2288 case OP_DUP_ASTERISK
:
2290 case OP_DUP_QUESTION
:
2291 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2296 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2298 fetch_token (token
, regexp
, syntax
);
2299 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2301 /* else fall through */
2302 case OP_CLOSE_SUBEXP
:
2303 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2304 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2309 /* else fall through */
2310 case OP_CLOSE_DUP_NUM
:
2311 /* We treat it as a normal character. */
2313 /* Then we can these characters as normal characters. */
2314 token
->type
= CHARACTER
;
2315 /* mb_partial and word_char bits should be initialized already
2317 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2318 if (BE (tree
== NULL
, 0))
2325 if ((token
->opr
.ctx_type
2326 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2327 && dfa
->word_ops_used
== 0)
2328 init_word_char (dfa
);
2329 if (token
->opr
.ctx_type
== WORD_DELIM
2330 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2332 bin_tree_t
*tree_first
, *tree_last
;
2333 if (token
->opr
.ctx_type
== WORD_DELIM
)
2335 token
->opr
.ctx_type
= WORD_FIRST
;
2336 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2337 token
->opr
.ctx_type
= WORD_LAST
;
2341 token
->opr
.ctx_type
= INSIDE_WORD
;
2342 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2343 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2345 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2346 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2347 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2355 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2356 if (BE (tree
== NULL
, 0))
2362 /* We must return here, since ANCHORs can't be followed
2363 by repetition operators.
2364 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366 fetch_token (token
, regexp
, syntax
);
2369 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2370 if (BE (tree
== NULL
, 0))
2375 if (dfa
->mb_cur_max
> 1)
2376 dfa
->has_mb_node
= 1;
2380 tree
= build_charclass_op (dfa
, regexp
->trans
,
2381 (const unsigned char *) "alnum",
2382 (const unsigned char *) "_",
2383 token
->type
== OP_NOTWORD
, err
);
2384 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2389 tree
= build_charclass_op (dfa
, regexp
->trans
,
2390 (const unsigned char *) "space",
2391 (const unsigned char *) "",
2392 token
->type
== OP_NOTSPACE
, err
);
2393 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2403 /* Must not happen? */
2409 fetch_token (token
, regexp
, syntax
);
2411 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2412 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2414 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2415 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2417 /* In BRE consecutive duplications are not allowed. */
2418 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2419 && (token
->type
== OP_DUP_ASTERISK
2420 || token
->type
== OP_OPEN_DUP_NUM
))
2430 /* This function build the following tree, from regular expression
2438 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2439 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2441 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2444 cur_nsub
= preg
->re_nsub
++;
2446 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2448 /* The subexpression may be a null string. */
2449 if (token
->type
== OP_CLOSE_SUBEXP
)
2453 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2454 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2456 if (BE (*err
!= REG_NOERROR
, 0))
2460 if (cur_nsub
<= '9' - '1')
2461 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2463 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2464 if (BE (tree
== NULL
, 0))
2469 tree
->token
.opr
.idx
= cur_nsub
;
2473 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2476 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2477 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2479 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2480 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2481 re_token_t start_token
= *token
;
2483 if (token
->type
== OP_OPEN_DUP_NUM
)
2486 start
= fetch_number (regexp
, token
, syntax
);
2487 if (start
== REG_MISSING
)
2489 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2490 start
= 0; /* We treat "{,m}" as "{0,m}". */
2493 *err
= REG_BADBR
; /* <re>{} is invalid. */
2497 if (BE (start
!= REG_ERROR
, 1))
2499 /* We treat "{n}" as "{n,n}". */
2500 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2501 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2502 ? fetch_number (regexp
, token
, syntax
) : REG_ERROR
));
2504 if (BE (start
== REG_ERROR
|| end
== REG_ERROR
, 0))
2506 /* Invalid sequence. */
2507 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2509 if (token
->type
== END_OF_RE
)
2517 /* If the syntax bit is set, rollback. */
2518 re_string_set_index (regexp
, start_idx
);
2519 *token
= start_token
;
2520 token
->type
= CHARACTER
;
2521 /* mb_partial and word_char bits should be already initialized by
2526 if (BE ((end
!= REG_MISSING
&& start
> end
)
2527 || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2529 /* First number greater than second. */
2536 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2537 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : REG_MISSING
;
2540 fetch_token (token
, regexp
, syntax
);
2542 if (BE (elem
== NULL
, 0))
2544 if (BE (start
== 0 && end
== 0, 0))
2546 postorder (elem
, free_tree
, NULL
);
2550 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2551 if (BE (start
> 0, 0))
2554 for (i
= 2; i
<= start
; ++i
)
2556 elem
= duplicate_tree (elem
, dfa
);
2557 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2558 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2559 goto parse_dup_op_espace
;
2565 /* Duplicate ELEM before it is marked optional. */
2566 elem
= duplicate_tree (elem
, dfa
);
2572 if (elem
->token
.type
== SUBEXP
)
2573 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2575 tree
= create_tree (dfa
, elem
, NULL
,
2576 (end
== REG_MISSING
? OP_DUP_ASTERISK
: OP_ALT
));
2577 if (BE (tree
== NULL
, 0))
2578 goto parse_dup_op_espace
;
2580 /* From gnulib's "intprops.h":
2581 True if the arithmetic type T is signed. */
2582 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584 /* This loop is actually executed only when end != REG_MISSING,
2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586 already created the start+1-th copy. */
2587 if (TYPE_SIGNED (Idx
) || end
!= REG_MISSING
)
2588 for (i
= start
+ 2; i
<= end
; ++i
)
2590 elem
= duplicate_tree (elem
, dfa
);
2591 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2592 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2593 goto parse_dup_op_espace
;
2595 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2596 if (BE (tree
== NULL
, 0))
2597 goto parse_dup_op_espace
;
2601 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2605 parse_dup_op_espace
:
2610 /* Size of the names for collating symbol/equivalence_class/character_class.
2611 I'm not sure, but maybe enough. */
2612 #define BRACKET_NAME_BUF_SIZE 32
2615 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616 Build the range expression which starts from START_ELEM, and ends
2617 at END_ELEM. The result are written to MBCSET and SBCSET.
2618 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619 mbcset->range_ends, is a pointer argument sinse we may
2622 static reg_errcode_t
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (const reg_syntax_t syntax
,
2627 re_charset_t
*mbcset
,
2629 const bracket_elem_t
*start_elem
,
2630 const bracket_elem_t
*end_elem
)
2631 # else /* not RE_ENABLE_I18N */
2632 build_range_exp (const reg_syntax_t syntax
,
2634 const bracket_elem_t
*start_elem
,
2635 const bracket_elem_t
*end_elem
)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch
, end_ch
;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2641 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2645 /* We can handle no multi character collating elements without libc
2647 if (BE ((start_elem
->type
== COLL_SYM
2648 && strlen ((char *) start_elem
->opr
.name
) > 1)
2649 || (end_elem
->type
== COLL_SYM
2650 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2651 return REG_ECOLLATE
;
2653 # ifdef RE_ENABLE_I18N
2658 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2660 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2661 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2663 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2664 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2666 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2667 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2668 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2669 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2670 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2671 return REG_ECOLLATE
;
2672 cmp_buf
[0] = start_wc
;
2673 cmp_buf
[4] = end_wc
;
2675 if (BE ((syntax
& RE_NO_EMPTY_RANGES
)
2676 && wcscoll (cmp_buf
, cmp_buf
+ 4) > 0, 0))
2679 /* Got valid collation sequence values, add them as a new entry.
2680 However, for !_LIBC we have no collation elements: if the
2681 character set is single byte, the single byte character set
2682 that we build below suffices. parse_bracket_exp passes
2683 no MBCSET if dfa->mb_cur_max == 1. */
2686 /* Check the space of the arrays. */
2687 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2689 /* There is not enough space, need realloc. */
2690 wchar_t *new_array_start
, *new_array_end
;
2693 /* +1 in case of mbcset->nranges is 0. */
2694 new_nranges
= 2 * mbcset
->nranges
+ 1;
2695 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696 are NULL if *range_alloc == 0. */
2697 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2699 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2702 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2705 mbcset
->range_starts
= new_array_start
;
2706 mbcset
->range_ends
= new_array_end
;
2707 *range_alloc
= new_nranges
;
2710 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2711 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2714 /* Build the table for single byte characters. */
2715 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2718 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2719 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2720 bitset_set (sbcset
, wc
);
2723 # else /* not RE_ENABLE_I18N */
2726 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2727 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2729 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2730 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2732 if (start_ch
> end_ch
)
2734 /* Build the table for single byte characters. */
2735 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2736 if (start_ch
<= ch
&& ch
<= end_ch
)
2737 bitset_set (sbcset
, ch
);
2739 # endif /* not RE_ENABLE_I18N */
2742 #endif /* not _LIBC */
2745 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2746 Build the collating element which is represented by NAME.
2747 The result are written to MBCSET and SBCSET.
2748 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2749 pointer argument since we may update it. */
2751 static reg_errcode_t
2753 build_collating_symbol (bitset_t sbcset
,
2754 # ifdef RE_ENABLE_I18N
2755 re_charset_t
*mbcset
, Idx
*coll_sym_alloc
,
2757 const unsigned char *name
)
2759 size_t name_len
= strlen ((const char *) name
);
2760 if (BE (name_len
!= 1, 0))
2761 return REG_ECOLLATE
;
2764 bitset_set (sbcset
, name
[0]);
2768 #endif /* not _LIBC */
2770 /* This function parse bracket expression like "[abc]", "[a-c]",
2774 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2775 reg_syntax_t syntax
, reg_errcode_t
*err
)
2778 const unsigned char *collseqmb
;
2779 const char *collseqwc
;
2782 const int32_t *symb_table
;
2783 const unsigned char *extra
;
2785 /* Local function for parse_bracket_exp used in _LIBC environement.
2786 Seek the collating symbol entry correspondings to NAME.
2787 Return the index of the symbol in the SYMB_TABLE. */
2790 __attribute ((always_inline
))
2791 seek_collating_symbol_entry (name
, name_len
)
2792 const unsigned char *name
;
2795 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2796 int32_t elem
= hash
% table_size
;
2797 if (symb_table
[2 * elem
] != 0)
2799 int32_t second
= hash
% (table_size
- 2) + 1;
2803 /* First compare the hashing value. */
2804 if (symb_table
[2 * elem
] == hash
2805 /* Compare the length of the name. */
2806 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2807 /* Compare the name. */
2808 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2811 /* Yep, this is the entry. */
2818 while (symb_table
[2 * elem
] != 0);
2823 /* Local function for parse_bracket_exp used in _LIBC environment.
2824 Look up the collation sequence value of BR_ELEM.
2825 Return the value if succeeded, UINT_MAX otherwise. */
2827 auto inline unsigned int
2828 __attribute ((always_inline
))
2829 lookup_collation_sequence_value (br_elem
)
2830 bracket_elem_t
*br_elem
;
2832 if (br_elem
->type
== SB_CHAR
)
2835 if (MB_CUR_MAX == 1)
2838 return collseqmb
[br_elem
->opr
.ch
];
2841 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2842 return __collseq_table_lookup (collseqwc
, wc
);
2845 else if (br_elem
->type
== MB_CHAR
)
2848 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2850 else if (br_elem
->type
== COLL_SYM
)
2852 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2856 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2858 if (symb_table
[2 * elem
] != 0)
2860 /* We found the entry. */
2861 idx
= symb_table
[2 * elem
+ 1];
2862 /* Skip the name of collating element name. */
2863 idx
+= 1 + extra
[idx
];
2864 /* Skip the byte sequence of the collating element. */
2865 idx
+= 1 + extra
[idx
];
2866 /* Adjust for the alignment. */
2867 idx
= (idx
+ 3) & ~3;
2868 /* Skip the multibyte collation sequence value. */
2869 idx
+= sizeof (unsigned int);
2870 /* Skip the wide char sequence of the collating element. */
2871 idx
+= sizeof (unsigned int) *
2872 (1 + *(unsigned int *) (extra
+ idx
));
2873 /* Return the collation sequence value. */
2874 return *(unsigned int *) (extra
+ idx
);
2876 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2878 /* No valid character. Match it as a single byte
2880 return collseqmb
[br_elem
->opr
.name
[0]];
2883 else if (sym_name_len
== 1)
2884 return collseqmb
[br_elem
->opr
.name
[0]];
2889 /* Local function for parse_bracket_exp used in _LIBC environement.
2890 Build the range expression which starts from START_ELEM, and ends
2891 at END_ELEM. The result are written to MBCSET and SBCSET.
2892 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2893 mbcset->range_ends, is a pointer argument sinse we may
2896 auto inline reg_errcode_t
2897 __attribute ((always_inline
))
2898 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2899 re_charset_t
*mbcset
;
2902 bracket_elem_t
*start_elem
, *end_elem
;
2905 uint32_t start_collseq
;
2906 uint32_t end_collseq
;
2908 /* Equivalence Classes and Character Classes can't be a range
2910 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2911 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2915 start_collseq
= lookup_collation_sequence_value (start_elem
);
2916 end_collseq
= lookup_collation_sequence_value (end_elem
);
2917 /* Check start/end collation sequence values. */
2918 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2919 return REG_ECOLLATE
;
2920 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2923 /* Got valid collation sequence values, add them as a new entry.
2924 However, if we have no collation elements, and the character set
2925 is single byte, the single byte character set that we
2926 build below suffices. */
2927 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2929 /* Check the space of the arrays. */
2930 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2932 /* There is not enough space, need realloc. */
2933 uint32_t *new_array_start
;
2934 uint32_t *new_array_end
;
2937 /* +1 in case of mbcset->nranges is 0. */
2938 new_nranges
= 2 * mbcset
->nranges
+ 1;
2939 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2941 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2944 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2947 mbcset
->range_starts
= new_array_start
;
2948 mbcset
->range_ends
= new_array_end
;
2949 *range_alloc
= new_nranges
;
2952 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2953 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2956 /* Build the table for single byte characters. */
2957 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2959 uint32_t ch_collseq
;
2961 if (MB_CUR_MAX == 1)
2964 ch_collseq
= collseqmb
[ch
];
2966 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2967 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2968 bitset_set (sbcset
, ch
);
2973 /* Local function for parse_bracket_exp used in _LIBC environement.
2974 Build the collating element which is represented by NAME.
2975 The result are written to MBCSET and SBCSET.
2976 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2977 pointer argument sinse we may update it. */
2979 auto inline reg_errcode_t
2980 __attribute ((always_inline
))
2981 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2982 re_charset_t
*mbcset
;
2983 Idx
*coll_sym_alloc
;
2985 const unsigned char *name
;
2988 size_t name_len
= strlen ((const char *) name
);
2991 elem
= seek_collating_symbol_entry (name
, name_len
);
2992 if (symb_table
[2 * elem
] != 0)
2994 /* We found the entry. */
2995 idx
= symb_table
[2 * elem
+ 1];
2996 /* Skip the name of collating element name. */
2997 idx
+= 1 + extra
[idx
];
2999 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3001 /* No valid character, treat it as a normal
3003 bitset_set (sbcset
, name
[0]);
3007 return REG_ECOLLATE
;
3009 /* Got valid collation sequence, add it as a new entry. */
3010 /* Check the space of the arrays. */
3011 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3013 /* Not enough, realloc it. */
3014 /* +1 in case of mbcset->ncoll_syms is 0. */
3015 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3016 /* Use realloc since mbcset->coll_syms is NULL
3018 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3019 new_coll_sym_alloc
);
3020 if (BE (new_coll_syms
== NULL
, 0))
3022 mbcset
->coll_syms
= new_coll_syms
;
3023 *coll_sym_alloc
= new_coll_sym_alloc
;
3025 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3030 if (BE (name_len
!= 1, 0))
3031 return REG_ECOLLATE
;
3034 bitset_set (sbcset
, name
[0]);
3041 re_token_t br_token
;
3042 re_bitset_ptr_t sbcset
;
3043 #ifdef RE_ENABLE_I18N
3044 re_charset_t
*mbcset
;
3045 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3046 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3047 #endif /* not RE_ENABLE_I18N */
3048 bool non_match
= false;
3049 bin_tree_t
*work_tree
;
3051 bool first_round
= true;
3053 collseqmb
= (const unsigned char *)
3054 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3055 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3061 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3062 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3063 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3064 _NL_COLLATE_SYMB_TABLEMB
);
3065 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3066 _NL_COLLATE_SYMB_EXTRAMB
);
3069 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3070 #ifdef RE_ENABLE_I18N
3071 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3072 #endif /* RE_ENABLE_I18N */
3073 #ifdef RE_ENABLE_I18N
3074 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3076 if (BE (sbcset
== NULL
, 0))
3077 #endif /* RE_ENABLE_I18N */
3083 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3084 if (BE (token
->type
== END_OF_RE
, 0))
3087 goto parse_bracket_exp_free_return
;
3089 if (token
->type
== OP_NON_MATCH_LIST
)
3091 #ifdef RE_ENABLE_I18N
3092 mbcset
->non_match
= 1;
3093 #endif /* not RE_ENABLE_I18N */
3095 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3096 bitset_set (sbcset
, '\n');
3097 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3098 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3099 if (BE (token
->type
== END_OF_RE
, 0))
3102 goto parse_bracket_exp_free_return
;
3106 /* We treat the first ']' as a normal character. */
3107 if (token
->type
== OP_CLOSE_BRACKET
)
3108 token
->type
= CHARACTER
;
3112 bracket_elem_t start_elem
, end_elem
;
3113 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3114 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3117 bool is_range_exp
= false;
3120 start_elem
.opr
.name
= start_name_buf
;
3121 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3122 syntax
, first_round
);
3123 if (BE (ret
!= REG_NOERROR
, 0))
3126 goto parse_bracket_exp_free_return
;
3128 first_round
= false;
3130 /* Get information about the next token. We need it in any case. */
3131 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3133 /* Do not check for ranges if we know they are not allowed. */
3134 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3136 if (BE (token
->type
== END_OF_RE
, 0))
3139 goto parse_bracket_exp_free_return
;
3141 if (token
->type
== OP_CHARSET_RANGE
)
3143 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3144 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3145 if (BE (token2
.type
== END_OF_RE
, 0))
3148 goto parse_bracket_exp_free_return
;
3150 if (token2
.type
== OP_CLOSE_BRACKET
)
3152 /* We treat the last '-' as a normal character. */
3153 re_string_skip_bytes (regexp
, -token_len
);
3154 token
->type
= CHARACTER
;
3157 is_range_exp
= true;
3161 if (is_range_exp
== true)
3163 end_elem
.opr
.name
= end_name_buf
;
3164 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3166 if (BE (ret
!= REG_NOERROR
, 0))
3169 goto parse_bracket_exp_free_return
;
3172 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3175 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3176 &start_elem
, &end_elem
);
3178 # ifdef RE_ENABLE_I18N
3179 *err
= build_range_exp (syntax
, sbcset
,
3180 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3181 &range_alloc
, &start_elem
, &end_elem
);
3183 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3185 #endif /* RE_ENABLE_I18N */
3186 if (BE (*err
!= REG_NOERROR
, 0))
3187 goto parse_bracket_exp_free_return
;
3191 switch (start_elem
.type
)
3194 bitset_set (sbcset
, start_elem
.opr
.ch
);
3196 #ifdef RE_ENABLE_I18N
3198 /* Check whether the array has enough space. */
3199 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3201 wchar_t *new_mbchars
;
3202 /* Not enough, realloc it. */
3203 /* +1 in case of mbcset->nmbchars is 0. */
3204 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3205 /* Use realloc since array is NULL if *alloc == 0. */
3206 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3208 if (BE (new_mbchars
== NULL
, 0))
3209 goto parse_bracket_exp_espace
;
3210 mbcset
->mbchars
= new_mbchars
;
3212 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3214 #endif /* RE_ENABLE_I18N */
3216 *err
= build_equiv_class (sbcset
,
3217 #ifdef RE_ENABLE_I18N
3218 mbcset
, &equiv_class_alloc
,
3219 #endif /* RE_ENABLE_I18N */
3220 start_elem
.opr
.name
);
3221 if (BE (*err
!= REG_NOERROR
, 0))
3222 goto parse_bracket_exp_free_return
;
3225 *err
= build_collating_symbol (sbcset
,
3226 #ifdef RE_ENABLE_I18N
3227 mbcset
, &coll_sym_alloc
,
3228 #endif /* RE_ENABLE_I18N */
3229 start_elem
.opr
.name
);
3230 if (BE (*err
!= REG_NOERROR
, 0))
3231 goto parse_bracket_exp_free_return
;
3234 *err
= build_charclass (regexp
->trans
, sbcset
,
3235 #ifdef RE_ENABLE_I18N
3236 mbcset
, &char_class_alloc
,
3237 #endif /* RE_ENABLE_I18N */
3238 start_elem
.opr
.name
, syntax
);
3239 if (BE (*err
!= REG_NOERROR
, 0))
3240 goto parse_bracket_exp_free_return
;
3247 if (BE (token
->type
== END_OF_RE
, 0))
3250 goto parse_bracket_exp_free_return
;
3252 if (token
->type
== OP_CLOSE_BRACKET
)
3256 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3258 /* If it is non-matching list. */
3260 bitset_not (sbcset
);
3262 #ifdef RE_ENABLE_I18N
3263 /* Ensure only single byte characters are set. */
3264 if (dfa
->mb_cur_max
> 1)
3265 bitset_mask (sbcset
, dfa
->sb_char
);
3267 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3268 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3269 || mbcset
->non_match
)))
3271 bin_tree_t
*mbc_tree
;
3273 /* Build a tree for complex bracket. */
3274 dfa
->has_mb_node
= 1;
3275 br_token
.type
= COMPLEX_BRACKET
;
3276 br_token
.opr
.mbcset
= mbcset
;
3277 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3278 if (BE (mbc_tree
== NULL
, 0))
3279 goto parse_bracket_exp_espace
;
3280 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3281 if (sbcset
[sbc_idx
])
3283 /* If there are no bits set in sbcset, there is no point
3284 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3285 if (sbc_idx
< BITSET_WORDS
)
3287 /* Build a tree for simple bracket. */
3288 br_token
.type
= SIMPLE_BRACKET
;
3289 br_token
.opr
.sbcset
= sbcset
;
3290 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3291 if (BE (work_tree
== NULL
, 0))
3292 goto parse_bracket_exp_espace
;
3294 /* Then join them by ALT node. */
3295 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3296 if (BE (work_tree
== NULL
, 0))
3297 goto parse_bracket_exp_espace
;
3302 work_tree
= mbc_tree
;
3306 #endif /* not RE_ENABLE_I18N */
3308 #ifdef RE_ENABLE_I18N
3309 free_charset (mbcset
);
3311 /* Build a tree for simple bracket. */
3312 br_token
.type
= SIMPLE_BRACKET
;
3313 br_token
.opr
.sbcset
= sbcset
;
3314 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3315 if (BE (work_tree
== NULL
, 0))
3316 goto parse_bracket_exp_espace
;
3320 parse_bracket_exp_espace
:
3322 parse_bracket_exp_free_return
:
3324 #ifdef RE_ENABLE_I18N
3325 free_charset (mbcset
);
3326 #endif /* RE_ENABLE_I18N */
3330 /* Parse an element in the bracket expression. */
3332 static reg_errcode_t
3333 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3334 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3335 reg_syntax_t syntax
, bool accept_hyphen
)
3337 #ifdef RE_ENABLE_I18N
3339 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3340 if (cur_char_size
> 1)
3342 elem
->type
= MB_CHAR
;
3343 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3344 re_string_skip_bytes (regexp
, cur_char_size
);
3347 #endif /* RE_ENABLE_I18N */
3348 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3349 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3350 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3351 return parse_bracket_symbol (elem
, regexp
, token
);
3352 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3354 /* A '-' must only appear as anything but a range indicator before
3355 the closing bracket. Everything else is an error. */
3357 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3358 if (token2
.type
!= OP_CLOSE_BRACKET
)
3359 /* The actual error value is not standardized since this whole
3360 case is undefined. But ERANGE makes good sense. */
3363 elem
->type
= SB_CHAR
;
3364 elem
->opr
.ch
= token
->opr
.c
;
3368 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3369 such as [:<character_class>:], [.<collating_element>.], and
3370 [=<equivalent_class>=]. */
3372 static reg_errcode_t
3373 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3376 unsigned char ch
, delim
= token
->opr
.c
;
3378 if (re_string_eoi(regexp
))
3382 if (i
>= BRACKET_NAME_BUF_SIZE
)
3384 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3385 ch
= re_string_fetch_byte_case (regexp
);
3387 ch
= re_string_fetch_byte (regexp
);
3388 if (re_string_eoi(regexp
))
3390 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3392 elem
->opr
.name
[i
] = ch
;
3394 re_string_skip_bytes (regexp
, 1);
3395 elem
->opr
.name
[i
] = '\0';
3396 switch (token
->type
)
3398 case OP_OPEN_COLL_ELEM
:
3399 elem
->type
= COLL_SYM
;
3401 case OP_OPEN_EQUIV_CLASS
:
3402 elem
->type
= EQUIV_CLASS
;
3404 case OP_OPEN_CHAR_CLASS
:
3405 elem
->type
= CHAR_CLASS
;
3413 /* Helper function for parse_bracket_exp.
3414 Build the equivalence class which is represented by NAME.
3415 The result are written to MBCSET and SBCSET.
3416 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3417 is a pointer argument sinse we may update it. */
3419 static reg_errcode_t
3420 #ifdef RE_ENABLE_I18N
3421 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3422 Idx
*equiv_class_alloc
, const unsigned char *name
)
3423 #else /* not RE_ENABLE_I18N */
3424 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3425 #endif /* not RE_ENABLE_I18N */
3428 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3431 const int32_t *table
, *indirect
;
3432 const unsigned char *weights
, *extra
, *cp
;
3433 unsigned char char_buf
[2];
3437 /* This #include defines a local function! */
3438 # include <locale/weight.h>
3439 /* Calculate the index for equivalence class. */
3441 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3442 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3443 _NL_COLLATE_WEIGHTMB
);
3444 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3445 _NL_COLLATE_EXTRAMB
);
3446 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3447 _NL_COLLATE_INDIRECTMB
);
3448 idx1
= findidx (&cp
);
3449 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3450 /* This isn't a valid character. */
3451 return REG_ECOLLATE
;
3453 /* Build single byte matcing table for this equivalence class. */
3454 char_buf
[1] = (unsigned char) '\0';
3455 len
= weights
[idx1
& 0xffffff];
3456 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3460 idx2
= findidx (&cp
);
3465 /* This isn't a valid character. */
3467 /* Compare only if the length matches and the collation rule
3468 index is the same. */
3469 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3473 while (cnt
<= len
&&
3474 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3475 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3479 bitset_set (sbcset
, ch
);
3482 /* Check whether the array has enough space. */
3483 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3485 /* Not enough, realloc it. */
3486 /* +1 in case of mbcset->nequiv_classes is 0. */
3487 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3488 /* Use realloc since the array is NULL if *alloc == 0. */
3489 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3491 new_equiv_class_alloc
);
3492 if (BE (new_equiv_classes
== NULL
, 0))
3494 mbcset
->equiv_classes
= new_equiv_classes
;
3495 *equiv_class_alloc
= new_equiv_class_alloc
;
3497 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3502 if (BE (strlen ((const char *) name
) != 1, 0))
3503 return REG_ECOLLATE
;
3504 bitset_set (sbcset
, *name
);
3509 /* Helper function for parse_bracket_exp.
3510 Build the character class which is represented by NAME.
3511 The result are written to MBCSET and SBCSET.
3512 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3513 is a pointer argument sinse we may update it. */
3515 static reg_errcode_t
3516 #ifdef RE_ENABLE_I18N
3517 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3518 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3519 const unsigned char *class_name
, reg_syntax_t syntax
)
3520 #else /* not RE_ENABLE_I18N */
3521 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3522 const unsigned char *class_name
, reg_syntax_t syntax
)
3523 #endif /* not RE_ENABLE_I18N */
3526 const char *name
= (const char *) class_name
;
3528 /* In case of REG_ICASE "upper" and "lower" match the both of
3529 upper and lower cases. */
3530 if ((syntax
& RE_ICASE
)
3531 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3534 #ifdef RE_ENABLE_I18N
3535 /* Check the space of the arrays. */
3536 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3538 /* Not enough, realloc it. */
3539 /* +1 in case of mbcset->nchar_classes is 0. */
3540 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3541 /* Use realloc since array is NULL if *alloc == 0. */
3542 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3543 new_char_class_alloc
);
3544 if (BE (new_char_classes
== NULL
, 0))
3546 mbcset
->char_classes
= new_char_classes
;
3547 *char_class_alloc
= new_char_class_alloc
;
3549 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3550 #endif /* RE_ENABLE_I18N */
3552 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3554 if (BE (trans != NULL, 0)) \
3556 for (i = 0; i < SBC_MAX; ++i) \
3557 if (ctype_func (i)) \
3558 bitset_set (sbcset, trans[i]); \
3562 for (i = 0; i < SBC_MAX; ++i) \
3563 if (ctype_func (i)) \
3564 bitset_set (sbcset, i); \
3568 if (strcmp (name
, "alnum") == 0)
3569 BUILD_CHARCLASS_LOOP (isalnum
);
3570 else if (strcmp (name
, "cntrl") == 0)
3571 BUILD_CHARCLASS_LOOP (iscntrl
);
3572 else if (strcmp (name
, "lower") == 0)
3573 BUILD_CHARCLASS_LOOP (islower
);
3574 else if (strcmp (name
, "space") == 0)
3575 BUILD_CHARCLASS_LOOP (isspace
);
3576 else if (strcmp (name
, "alpha") == 0)
3577 BUILD_CHARCLASS_LOOP (isalpha
);
3578 else if (strcmp (name
, "digit") == 0)
3579 BUILD_CHARCLASS_LOOP (isdigit
);
3580 else if (strcmp (name
, "print") == 0)
3581 BUILD_CHARCLASS_LOOP (isprint
);
3582 else if (strcmp (name
, "upper") == 0)
3583 BUILD_CHARCLASS_LOOP (isupper
);
3584 else if (strcmp (name
, "blank") == 0)
3585 BUILD_CHARCLASS_LOOP (isblank
);
3586 else if (strcmp (name
, "graph") == 0)
3587 BUILD_CHARCLASS_LOOP (isgraph
);
3588 else if (strcmp (name
, "punct") == 0)
3589 BUILD_CHARCLASS_LOOP (ispunct
);
3590 else if (strcmp (name
, "xdigit") == 0)
3591 BUILD_CHARCLASS_LOOP (isxdigit
);
3599 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3600 const unsigned char *class_name
,
3601 const unsigned char *extra
, bool non_match
,
3604 re_bitset_ptr_t sbcset
;
3605 #ifdef RE_ENABLE_I18N
3606 re_charset_t
*mbcset
;
3608 #endif /* not RE_ENABLE_I18N */
3610 re_token_t br_token
;
3613 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3614 #ifdef RE_ENABLE_I18N
3615 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3616 #endif /* RE_ENABLE_I18N */
3618 #ifdef RE_ENABLE_I18N
3619 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3620 #else /* not RE_ENABLE_I18N */
3621 if (BE (sbcset
== NULL
, 0))
3622 #endif /* not RE_ENABLE_I18N */
3630 #ifdef RE_ENABLE_I18N
3631 mbcset
->non_match
= 1;
3632 #endif /* not RE_ENABLE_I18N */
3635 /* We don't care the syntax in this case. */
3636 ret
= build_charclass (trans
, sbcset
,
3637 #ifdef RE_ENABLE_I18N
3639 #endif /* RE_ENABLE_I18N */
3642 if (BE (ret
!= REG_NOERROR
, 0))
3645 #ifdef RE_ENABLE_I18N
3646 free_charset (mbcset
);
3647 #endif /* RE_ENABLE_I18N */
3651 /* \w match '_' also. */
3652 for (; *extra
; extra
++)
3653 bitset_set (sbcset
, *extra
);
3655 /* If it is non-matching list. */
3657 bitset_not (sbcset
);
3659 #ifdef RE_ENABLE_I18N
3660 /* Ensure only single byte characters are set. */
3661 if (dfa
->mb_cur_max
> 1)
3662 bitset_mask (sbcset
, dfa
->sb_char
);
3665 /* Build a tree for simple bracket. */
3666 br_token
.type
= SIMPLE_BRACKET
;
3667 br_token
.opr
.sbcset
= sbcset
;
3668 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3669 if (BE (tree
== NULL
, 0))
3670 goto build_word_op_espace
;
3672 #ifdef RE_ENABLE_I18N
3673 if (dfa
->mb_cur_max
> 1)
3675 bin_tree_t
*mbc_tree
;
3676 /* Build a tree for complex bracket. */
3677 br_token
.type
= COMPLEX_BRACKET
;
3678 br_token
.opr
.mbcset
= mbcset
;
3679 dfa
->has_mb_node
= 1;
3680 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3681 if (BE (mbc_tree
== NULL
, 0))
3682 goto build_word_op_espace
;
3683 /* Then join them by ALT node. */
3684 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3685 if (BE (mbc_tree
!= NULL
, 1))
3690 free_charset (mbcset
);
3693 #else /* not RE_ENABLE_I18N */
3695 #endif /* not RE_ENABLE_I18N */
3697 build_word_op_espace
:
3699 #ifdef RE_ENABLE_I18N
3700 free_charset (mbcset
);
3701 #endif /* RE_ENABLE_I18N */
3706 /* This is intended for the expressions like "a{1,3}".
3707 Fetch a number from `input', and return the number.
3708 Return REG_MISSING if the number field is empty like "{,1}".
3709 Return REG_ERROR if an error occurred. */
3712 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3714 Idx num
= REG_MISSING
;
3718 fetch_token (token
, input
, syntax
);
3720 if (BE (token
->type
== END_OF_RE
, 0))
3722 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3724 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
3725 || num
== REG_ERROR
)
3727 : ((num
== REG_MISSING
) ? c
- '0' : num
* 10 + c
- '0'));
3728 num
= (num
> RE_DUP_MAX
) ? REG_ERROR
: num
;
3733 #ifdef RE_ENABLE_I18N
3735 free_charset (re_charset_t
*cset
)
3737 re_free (cset
->mbchars
);
3739 re_free (cset
->coll_syms
);
3740 re_free (cset
->equiv_classes
);
3741 re_free (cset
->range_starts
);
3742 re_free (cset
->range_ends
);
3744 re_free (cset
->char_classes
);
3747 #endif /* RE_ENABLE_I18N */
3749 /* Functions for binary tree operation. */
3751 /* Create a tree node. */
3754 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3755 re_token_type_t type
)
3759 return create_token_tree (dfa
, left
, right
, &t
);
3763 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3764 const re_token_t
*token
)
3767 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3769 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3771 if (storage
== NULL
)
3773 storage
->next
= dfa
->str_tree_storage
;
3774 dfa
->str_tree_storage
= storage
;
3775 dfa
->str_tree_storage_idx
= 0;
3777 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3779 tree
->parent
= NULL
;
3781 tree
->right
= right
;
3782 tree
->token
= *token
;
3783 tree
->token
.duplicated
= 0;
3784 tree
->token
.opt_subexp
= 0;
3787 tree
->node_idx
= REG_MISSING
;
3790 left
->parent
= tree
;
3792 right
->parent
= tree
;
3796 /* Mark the tree SRC as an optional subexpression.
3797 To be called from preorder or postorder. */
3799 static reg_errcode_t
3800 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3802 Idx idx
= (Idx
) (long) extra
;
3803 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3804 node
->token
.opt_subexp
= 1;
3809 /* Free the allocated memory inside NODE. */
3812 free_token (re_token_t
*node
)
3814 #ifdef RE_ENABLE_I18N
3815 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3816 free_charset (node
->opr
.mbcset
);
3818 #endif /* RE_ENABLE_I18N */
3819 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3820 re_free (node
->opr
.sbcset
);
3823 /* Worker function for tree walking. Free the allocated memory inside NODE
3824 and its children. */
3826 static reg_errcode_t
3827 free_tree (void *extra
, bin_tree_t
*node
)
3829 free_token (&node
->token
);
3834 /* Duplicate the node SRC, and return new node. This is a preorder
3835 visit similar to the one implemented by the generic visitor, but
3836 we need more infrastructure to maintain two parallel trees --- so,
3837 it's easier to duplicate. */
3840 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3842 const bin_tree_t
*node
;
3843 bin_tree_t
*dup_root
;
3844 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3846 for (node
= root
; ; )
3848 /* Create a new tree and link it back to the current parent. */
3849 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3852 (*p_new
)->parent
= dup_node
;
3853 (*p_new
)->token
.duplicated
= 1;
3856 /* Go to the left node, or up and to the right. */
3860 p_new
= &dup_node
->left
;
3864 const bin_tree_t
*prev
= NULL
;
3865 while (node
->right
== prev
|| node
->right
== NULL
)
3868 node
= node
->parent
;
3869 dup_node
= dup_node
->parent
;
3874 p_new
= &dup_node
->right
;