1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
31 static void free_charset (re_charset_t
*cset
);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t
*preg
);
34 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
36 static void optimize_utf8 (re_dfa_t
*dfa
);
38 static reg_errcode_t
analyze (regex_t
*preg
);
39 static reg_errcode_t
preorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
postorder (bin_tree_t
*root
,
43 reg_errcode_t (fn (void *, bin_tree_t
*)),
45 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
47 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
49 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
52 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
53 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
54 unsigned int constraint
);
55 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
56 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
58 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
59 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
61 static int peek_token (re_token_t
*token
, re_string_t
*input
,
63 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
64 reg_syntax_t syntax
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
75 re_token_t
*token
, reg_syntax_t syntax
,
76 Idx nest
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
78 re_dfa_t
*dfa
, re_token_t
*token
,
79 reg_syntax_t syntax
, reg_errcode_t
*err
);
80 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
81 re_token_t
*token
, reg_syntax_t syntax
,
83 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
85 re_token_t
*token
, int token_len
,
89 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
93 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
95 Idx
*equiv_class_alloc
,
96 const unsigned char *name
);
97 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
100 Idx
*char_class_alloc
,
101 const char *class_name
,
102 reg_syntax_t syntax
);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
105 const unsigned char *name
);
106 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 const char *class_name
,
109 reg_syntax_t syntax
);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
112 RE_TRANSLATE_TYPE trans
,
113 const char *class_name
,
115 bool non_match
, reg_errcode_t
*err
);
116 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 re_token_type_t type
);
119 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
120 bin_tree_t
*left
, bin_tree_t
*right
,
121 const re_token_t
*token
);
122 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
123 static void free_token (re_token_t
*node
);
124 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
125 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid
[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx
[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
217 re_compile_pattern (const char *pattern
, size_t length
,
218 struct re_pattern_buffer
*bufp
)
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
236 weak_alias (__re_compile_pattern
, re_compile_pattern
)
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options
;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
254 re_set_syntax (reg_syntax_t syntax
)
256 reg_syntax_t ret
= re_syntax_options
;
258 re_syntax_options
= syntax
;
261 weak_alias (__re_set_syntax
, re_set_syntax
)
264 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
266 re_dfa_t
*dfa
= bufp
->buffer
;
267 char *fastmap
= bufp
->fastmap
;
269 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
270 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
271 if (dfa
->init_state
!= dfa
->init_state_word
)
272 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
273 if (dfa
->init_state
!= dfa
->init_state_nl
)
274 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
277 bufp
->fastmap_accurate
= 1;
280 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
283 __attribute__ ((always_inline
))
284 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
288 fastmap
[tolower (ch
)] = 1;
291 /* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
295 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
298 re_dfa_t
*dfa
= bufp
->buffer
;
300 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
301 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
303 Idx node
= init_state
->nodes
.elems
[node_cnt
];
304 re_token_type_t type
= dfa
->nodes
[node
].type
;
306 if (type
== CHARACTER
)
308 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
312 unsigned char buf
[MB_LEN_MAX
];
318 *p
++ = dfa
->nodes
[node
].opr
.c
;
319 while (++node
< dfa
->nodes_len
320 && dfa
->nodes
[node
].type
== CHARACTER
321 && dfa
->nodes
[node
].mb_partial
)
322 *p
++ = dfa
->nodes
[node
].opr
.c
;
323 memset (&state
, '\0', sizeof (state
));
324 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
326 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
328 re_set_fastmap (fastmap
, false, buf
[0]);
332 else if (type
== SIMPLE_BRACKET
)
335 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
338 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
339 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
340 if (w
& ((bitset_word_t
) 1 << j
))
341 re_set_fastmap (fastmap
, icase
, ch
);
344 #ifdef RE_ENABLE_I18N
345 else if (type
== COMPLEX_BRACKET
)
347 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
351 /* See if we have to try all bytes which start multiple collation
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
358 && (cset
->ncoll_syms
|| cset
->nranges
))
360 const int32_t *table
= (const int32_t *)
361 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
362 for (i
= 0; i
< SBC_MAX
; ++i
)
364 re_set_fastmap (fastmap
, icase
, i
);
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa
->mb_cur_max
> 1
373 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
375 || cset
->nequiv_classes
383 memset (&mbs
, 0, sizeof (mbs
));
384 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
385 re_set_fastmap (fastmap
, false, (int) c
);
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i
= 0; i
< cset
->nmbchars
; ++i
)
397 memset (&state
, '\0', sizeof (state
));
398 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
399 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
400 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
402 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
404 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
409 #endif /* RE_ENABLE_I18N */
410 else if (type
== OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type
== OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 || type
== END_OF_RE
)
416 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
417 if (type
== END_OF_RE
)
418 bufp
->can_be_null
= 1;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (regex_t
*__restrict preg
, const char *__restrict pattern
, int cflags
)
464 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC
);
471 /* Try to allocate space for the fastmap. */
472 preg
->fastmap
= re_malloc (char, SBC_MAX
);
473 if (__glibc_unlikely (preg
->fastmap
== NULL
))
476 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags
& REG_NEWLINE
)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax
&= ~RE_DOT_NEWLINE
;
482 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
483 /* It also changes the matching behavior. */
484 preg
->newline_anchor
= 1;
487 preg
->newline_anchor
= 0;
488 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
489 preg
->translate
= NULL
;
491 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret
== REG_ERPAREN
)
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret
== REG_NOERROR
))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg
);
505 /* Some error occurred while compiling the expression. */
506 re_free (preg
->fastmap
);
507 preg
->fastmap
= NULL
;
512 libc_hidden_def (__regcomp
)
513 weak_alias (__regcomp
, regcomp
)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
519 regerror (int errcode
, const regex_t
*__restrict preg
, char *__restrict errbuf
,
524 int nerrcodes
= sizeof __re_error_msgid_idx
/ sizeof __re_error_msgid_idx
[0];
526 if (__glibc_unlikely (errcode
< 0 || errcode
>= nerrcodes
))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
533 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
535 msg_size
= strlen (msg
) + 1; /* Includes the null. */
537 if (__glibc_likely (errbuf_size
!= 0))
539 size_t cpy_size
= msg_size
;
540 if (__glibc_unlikely (msg_size
> errbuf_size
))
542 cpy_size
= errbuf_size
- 1;
543 errbuf
[cpy_size
] = '\0';
545 memcpy (errbuf
, msg
, cpy_size
);
550 weak_alias (__regerror
, regerror
)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map
=
560 /* Set the first 128 bits. */
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
576 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
583 free_dfa_content (re_dfa_t
*dfa
)
588 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
589 free_token (dfa
->nodes
+ i
);
590 re_free (dfa
->nexts
);
591 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
593 if (dfa
->eclosures
!= NULL
)
594 re_node_set_free (dfa
->eclosures
+ i
);
595 if (dfa
->inveclosures
!= NULL
)
596 re_node_set_free (dfa
->inveclosures
+ i
);
597 if (dfa
->edests
!= NULL
)
598 re_node_set_free (dfa
->edests
+ i
);
600 re_free (dfa
->edests
);
601 re_free (dfa
->eclosures
);
602 re_free (dfa
->inveclosures
);
603 re_free (dfa
->nodes
);
605 if (dfa
->state_table
)
606 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
608 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
609 for (j
= 0; j
< entry
->num
; ++j
)
611 re_dfastate_t
*state
= entry
->array
[j
];
614 re_free (entry
->array
);
616 re_free (dfa
->state_table
);
617 #ifdef RE_ENABLE_I18N
618 if (dfa
->sb_char
!= utf8_sb_map
)
619 re_free (dfa
->sb_char
);
621 re_free (dfa
->subexp_map
);
623 re_free (dfa
->re_str
);
630 /* Free dynamically allocated space used by PREG. */
633 regfree (regex_t
*preg
)
635 re_dfa_t
*dfa
= preg
->buffer
;
636 if (__glibc_likely (dfa
!= NULL
))
638 lock_fini (dfa
->lock
);
639 free_dfa_content (dfa
);
644 re_free (preg
->fastmap
);
645 preg
->fastmap
= NULL
;
647 re_free (preg
->translate
);
648 preg
->translate
= NULL
;
650 libc_hidden_def (__regfree
)
651 weak_alias (__regfree
, regfree
)
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
656 #if defined _REGEX_RE_COMP || defined _LIBC
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf
;
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
668 re_comp (const char *s
)
675 if (!re_comp_buf
.buffer
)
676 return gettext ("No previous regular expression");
680 if (re_comp_buf
.buffer
)
682 fastmap
= re_comp_buf
.fastmap
;
683 re_comp_buf
.fastmap
= NULL
;
684 __regfree (&re_comp_buf
);
685 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
686 re_comp_buf
.fastmap
= fastmap
;
689 if (re_comp_buf
.fastmap
== NULL
)
691 re_comp_buf
.fastmap
= re_malloc (char, SBC_MAX
);
692 if (re_comp_buf
.fastmap
== NULL
)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf
.newline_anchor
= 1;
703 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
714 __libc_regcomp_freemem (void)
716 __regfree (&re_comp_buf
);
720 #endif /* _REGEX_RE_COMP */
722 /* Internal entry point.
723 Compile the regular expression PATTERN, whose length is LENGTH.
724 SYNTAX indicate regular expression's syntax. */
727 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
730 reg_errcode_t err
= REG_NOERROR
;
734 /* Initialize the pattern buffer. */
735 preg
->fastmap_accurate
= 0;
736 preg
->syntax
= syntax
;
737 preg
->not_bol
= preg
->not_eol
= 0;
740 preg
->can_be_null
= 0;
741 preg
->regs_allocated
= REGS_UNALLOCATED
;
743 /* Initialize the dfa. */
745 if (__glibc_unlikely (preg
->allocated
< sizeof (re_dfa_t
)))
747 /* If zero allocated, but buffer is non-null, try to realloc
748 enough space. This loses if buffer's address is bogus, but
749 that is the user's responsibility. If ->buffer is NULL this
750 is a simple allocation. */
751 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
754 preg
->allocated
= sizeof (re_dfa_t
);
757 preg
->used
= sizeof (re_dfa_t
);
759 err
= init_dfa (dfa
, length
);
760 if (__glibc_unlikely (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0))
762 if (__glibc_unlikely (err
!= REG_NOERROR
))
764 free_dfa_content (dfa
);
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa
->re_str
= re_malloc (char, length
+ 1);
772 strncpy (dfa
->re_str
, pattern
, length
+ 1);
775 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
776 (syntax
& RE_ICASE
) != 0, dfa
);
777 if (__glibc_unlikely (err
!= REG_NOERROR
))
779 re_compile_internal_free_return
:
780 free_workarea_compile (preg
);
781 re_string_destruct (®exp
);
782 lock_fini (dfa
->lock
);
783 free_dfa_content (dfa
);
789 /* Parse the regular expression, and build a structure tree. */
791 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
792 if (__glibc_unlikely (dfa
->str_tree
== NULL
))
793 goto re_compile_internal_free_return
;
795 /* Analyze the tree and create the nfa. */
796 err
= analyze (preg
);
797 if (__glibc_unlikely (err
!= REG_NOERROR
))
798 goto re_compile_internal_free_return
;
800 #ifdef RE_ENABLE_I18N
801 /* If possible, do searching in single byte encoding to speed things up. */
802 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
806 /* Then create the initial state of the dfa. */
807 err
= create_initial_state (dfa
);
809 /* Release work areas. */
810 free_workarea_compile (preg
);
811 re_string_destruct (®exp
);
813 if (__glibc_unlikely (err
!= REG_NOERROR
))
815 lock_fini (dfa
->lock
);
816 free_dfa_content (dfa
);
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
828 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
830 __re_size_t table_size
;
832 const char *codeset_name
;
834 #ifdef RE_ENABLE_I18N
835 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
837 size_t max_i18n_object_size
= 0;
839 size_t max_object_size
=
840 MAX (sizeof (struct re_state_table_entry
),
841 MAX (sizeof (re_token_t
),
842 MAX (sizeof (re_node_set
),
843 MAX (sizeof (regmatch_t
),
844 max_i18n_object_size
))));
846 memset (dfa
, '\0', sizeof (re_dfa_t
));
848 /* Force allocation of str_tree_storage the first time. */
849 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
851 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
852 calculation below, and for similar doubling calculations
853 elsewhere. And it's <= rather than <, because some of the
854 doubling calculations add 1 afterwards. */
855 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2
859 dfa
->nodes_alloc
= pat_len
+ 1;
860 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
862 /* table_size = 2 ^ ceil(log pat_len) */
863 for (table_size
= 1; ; table_size
<<= 1)
864 if (table_size
> pat_len
)
867 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
868 dfa
->state_hash_mask
= table_size
- 1;
870 dfa
->mb_cur_max
= MB_CUR_MAX
;
872 if (dfa
->mb_cur_max
== 6
873 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
875 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
878 codeset_name
= nl_langinfo (CODESET
);
879 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
880 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
881 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
882 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
885 /* We check exhaustively in the loop below if this charset is a
886 superset of ASCII. */
887 dfa
->map_notascii
= 0;
890 #ifdef RE_ENABLE_I18N
891 if (dfa
->mb_cur_max
> 1)
894 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
899 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
900 if (__glibc_unlikely (dfa
->sb_char
== NULL
))
903 /* Set the bits corresponding to single byte chars. */
904 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
905 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
907 wint_t wch
= __btowc (ch
);
909 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
911 if (isascii (ch
) && wch
!= ch
)
912 dfa
->map_notascii
= 1;
919 if (__glibc_unlikely (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
))
924 /* Initialize WORD_CHAR table, which indicate which character is
925 "word". In this case "word" means that it is the word construction
926 character used by some operators like "\<", "\>", etc. */
929 init_word_char (re_dfa_t
*dfa
)
934 dfa
->word_ops_used
= 1;
935 if (__glibc_likely (dfa
->map_notascii
== 0))
937 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
938 them, an issue when this code is used in Gnulib. */
939 bitset_word_t bits0
= 0x00000000;
940 bitset_word_t bits1
= 0x03ff0000;
941 bitset_word_t bits2
= 0x87fffffe;
942 bitset_word_t bits3
= 0x07fffffe;
943 if (BITSET_WORD_BITS
== 64)
945 /* Pacify gcc -Woverflow on 32-bit platformns. */
946 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
947 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
950 else if (BITSET_WORD_BITS
== 32)
952 dfa
->word_char
[0] = bits0
;
953 dfa
->word_char
[1] = bits1
;
954 dfa
->word_char
[2] = bits2
;
955 dfa
->word_char
[3] = bits3
;
962 if (__glibc_likely (dfa
->is_utf8
))
964 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
970 for (; i
< BITSET_WORDS
; ++i
)
971 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
972 if (isalnum (ch
) || ch
== '_')
973 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
976 /* Free the work area which are only used while compiling. */
979 free_workarea_compile (regex_t
*preg
)
981 re_dfa_t
*dfa
= preg
->buffer
;
982 bin_tree_storage_t
*storage
, *next
;
983 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
985 next
= storage
->next
;
988 dfa
->str_tree_storage
= NULL
;
989 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
990 dfa
->str_tree
= NULL
;
991 re_free (dfa
->org_indices
);
992 dfa
->org_indices
= NULL
;
995 /* Create initial states for all contexts. */
998 create_initial_state (re_dfa_t
*dfa
)
1002 re_node_set init_nodes
;
1004 /* Initial states have the epsilon closure of the node which is
1005 the first node of the regular expression. */
1006 first
= dfa
->str_tree
->first
->node_idx
;
1007 dfa
->init_node
= first
;
1008 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1009 if (__glibc_unlikely (err
!= REG_NOERROR
))
1012 /* The back-references which are in initial states can epsilon transit,
1013 since in this case all of the subexpressions can be null.
1014 Then we add epsilon closures of the nodes which are the next nodes of
1015 the back-references. */
1016 if (dfa
->nbackref
> 0)
1017 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1019 Idx node_idx
= init_nodes
.elems
[i
];
1020 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1023 if (type
!= OP_BACK_REF
)
1025 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1027 re_token_t
*clexp_node
;
1028 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1029 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1030 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1033 if (clexp_idx
== init_nodes
.nelem
)
1036 if (type
== OP_BACK_REF
)
1038 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1039 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1041 reg_errcode_t merge_err
1042 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1043 if (merge_err
!= REG_NOERROR
)
1050 /* It must be the first time to invoke acquire_state. */
1051 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1052 /* We don't check ERR here, since the initial state must not be NULL. */
1053 if (__glibc_unlikely (dfa
->init_state
== NULL
))
1055 if (dfa
->init_state
->has_constraint
)
1057 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1059 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1061 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1065 if (__glibc_unlikely (dfa
->init_state_word
== NULL
1066 || dfa
->init_state_nl
== NULL
1067 || dfa
->init_state_begbuf
== NULL
))
1071 dfa
->init_state_word
= dfa
->init_state_nl
1072 = dfa
->init_state_begbuf
= dfa
->init_state
;
1074 re_node_set_free (&init_nodes
);
1078 #ifdef RE_ENABLE_I18N
1079 /* If it is possible to do searching in single byte encoding instead of UTF-8
1080 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081 DFA nodes where needed. */
1084 optimize_utf8 (re_dfa_t
*dfa
)
1088 bool mb_chars
= false;
1089 bool has_period
= false;
1091 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1092 switch (dfa
->nodes
[node
].type
)
1095 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1099 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1107 /* Word anchors etc. cannot be handled. It's okay to test
1108 opr.ctx_type since constraints (for all DFA nodes) are
1109 created by ORing one or more opr.ctx_type values. */
1119 case OP_DUP_ASTERISK
:
1120 case OP_OPEN_SUBEXP
:
1121 case OP_CLOSE_SUBEXP
:
1123 case COMPLEX_BRACKET
:
1125 case SIMPLE_BRACKET
:
1126 /* Just double check. */
1128 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1130 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1131 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1133 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1143 if (mb_chars
|| has_period
)
1144 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1146 if (dfa
->nodes
[node
].type
== CHARACTER
1147 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1148 dfa
->nodes
[node
].mb_partial
= 0;
1149 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1150 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1153 /* The search can be in single byte locale. */
1154 dfa
->mb_cur_max
= 1;
1156 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1160 /* Analyze the structure tree, and calculate "first", "next", "edest",
1161 "eclosure", and "inveclosure". */
1163 static reg_errcode_t
1164 analyze (regex_t
*preg
)
1166 re_dfa_t
*dfa
= preg
->buffer
;
1169 /* Allocate arrays. */
1170 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1171 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1172 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1173 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1174 if (__glibc_unlikely (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
1175 || dfa
->edests
== NULL
|| dfa
->eclosures
== NULL
))
1178 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1179 if (dfa
->subexp_map
!= NULL
)
1182 for (i
= 0; i
< preg
->re_nsub
; i
++)
1183 dfa
->subexp_map
[i
] = i
;
1184 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1185 for (i
= 0; i
< preg
->re_nsub
; i
++)
1186 if (dfa
->subexp_map
[i
] != i
)
1188 if (i
== preg
->re_nsub
)
1190 re_free (dfa
->subexp_map
);
1191 dfa
->subexp_map
= NULL
;
1195 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1196 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1198 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1199 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1201 preorder (dfa
->str_tree
, calc_next
, dfa
);
1202 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1203 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1205 ret
= calc_eclosure (dfa
);
1206 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1209 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1210 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1211 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1214 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1215 if (__glibc_unlikely (dfa
->inveclosures
== NULL
))
1217 ret
= calc_inveclosure (dfa
);
1223 /* Our parse trees are very unbalanced, so we cannot use a stack to
1224 implement parse tree visits. Instead, we use parent pointers and
1225 some hairy code in these two functions. */
1226 static reg_errcode_t
1227 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1230 bin_tree_t
*node
, *prev
;
1232 for (node
= root
; ; )
1234 /* Descend down the tree, preferably to the left (or to the right
1235 if that's the only child). */
1236 while (node
->left
|| node
->right
)
1244 reg_errcode_t err
= fn (extra
, node
);
1245 if (__glibc_unlikely (err
!= REG_NOERROR
))
1247 if (node
->parent
== NULL
)
1250 node
= node
->parent
;
1252 /* Go up while we have a node that is reached from the right. */
1253 while (node
->right
== prev
|| node
->right
== NULL
);
1258 static reg_errcode_t
1259 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1264 for (node
= root
; ; )
1266 reg_errcode_t err
= fn (extra
, node
);
1267 if (__glibc_unlikely (err
!= REG_NOERROR
))
1270 /* Go to the left node, or up and to the right. */
1275 bin_tree_t
*prev
= NULL
;
1276 while (node
->right
== prev
|| node
->right
== NULL
)
1279 node
= node
->parent
;
1288 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1289 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1290 backreferences as well. Requires a preorder visit. */
1291 static reg_errcode_t
1292 optimize_subexps (void *extra
, bin_tree_t
*node
)
1294 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1296 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1298 int idx
= node
->token
.opr
.idx
;
1299 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1300 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1303 else if (node
->token
.type
== SUBEXP
1304 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1306 Idx other_idx
= node
->left
->token
.opr
.idx
;
1308 node
->left
= node
->left
->left
;
1310 node
->left
->parent
= node
;
1312 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1313 if (other_idx
< BITSET_WORD_BITS
)
1314 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1320 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1321 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1322 static reg_errcode_t
1323 lower_subexps (void *extra
, bin_tree_t
*node
)
1325 regex_t
*preg
= (regex_t
*) extra
;
1326 reg_errcode_t err
= REG_NOERROR
;
1328 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1330 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1332 node
->left
->parent
= node
;
1334 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1336 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1338 node
->right
->parent
= node
;
1345 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1347 re_dfa_t
*dfa
= preg
->buffer
;
1348 bin_tree_t
*body
= node
->left
;
1349 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1352 /* We do not optimize empty subexpressions, because otherwise we may
1353 have bad CONCAT nodes with NULL children. This is obviously not
1354 very common, so we do not lose much. An example that triggers
1355 this case is the sed "script" /\(\)/x. */
1356 && node
->left
!= NULL
1357 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1358 || !(dfa
->used_bkref_map
1359 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1362 /* Convert the SUBEXP node to the concatenation of an
1363 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1364 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1365 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1366 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1367 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1368 if (__glibc_unlikely (tree
== NULL
|| tree1
== NULL
1369 || op
== NULL
|| cls
== NULL
))
1375 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1376 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1380 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1381 nodes. Requires a postorder visit. */
1382 static reg_errcode_t
1383 calc_first (void *extra
, bin_tree_t
*node
)
1385 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1386 if (node
->token
.type
== CONCAT
)
1388 node
->first
= node
->left
->first
;
1389 node
->node_idx
= node
->left
->node_idx
;
1394 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1395 if (__glibc_unlikely (node
->node_idx
== -1))
1397 if (node
->token
.type
== ANCHOR
)
1398 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1403 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1404 static reg_errcode_t
1405 calc_next (void *extra
, bin_tree_t
*node
)
1407 switch (node
->token
.type
)
1409 case OP_DUP_ASTERISK
:
1410 node
->left
->next
= node
;
1413 node
->left
->next
= node
->right
->first
;
1414 node
->right
->next
= node
->next
;
1418 node
->left
->next
= node
->next
;
1420 node
->right
->next
= node
->next
;
1426 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1427 static reg_errcode_t
1428 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1430 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1431 Idx idx
= node
->node_idx
;
1432 reg_errcode_t err
= REG_NOERROR
;
1434 switch (node
->token
.type
)
1440 DEBUG_ASSERT (node
->next
== NULL
);
1443 case OP_DUP_ASTERISK
:
1447 dfa
->has_plural_match
= 1;
1448 if (node
->left
!= NULL
)
1449 left
= node
->left
->first
->node_idx
;
1451 left
= node
->next
->node_idx
;
1452 if (node
->right
!= NULL
)
1453 right
= node
->right
->first
->node_idx
;
1455 right
= node
->next
->node_idx
;
1456 DEBUG_ASSERT (left
> -1);
1457 DEBUG_ASSERT (right
> -1);
1458 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1463 case OP_OPEN_SUBEXP
:
1464 case OP_CLOSE_SUBEXP
:
1465 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1469 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1470 if (node
->token
.type
== OP_BACK_REF
)
1471 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1475 DEBUG_ASSERT (!IS_EPSILON_NODE (node
->token
.type
));
1476 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1483 /* Duplicate the epsilon closure of the node ROOT_NODE.
1484 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1485 to their own constraint. */
1487 static reg_errcode_t
1488 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1489 Idx root_node
, unsigned int init_constraint
)
1491 Idx org_node
, clone_node
;
1493 unsigned int constraint
= init_constraint
;
1494 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1496 Idx org_dest
, clone_dest
;
1497 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1499 /* If the back reference epsilon-transit, its destination must
1500 also have the constraint. Then duplicate the epsilon closure
1501 of the destination of the back reference, and store it in
1502 edests of the back reference. */
1503 org_dest
= dfa
->nexts
[org_node
];
1504 re_node_set_empty (dfa
->edests
+ clone_node
);
1505 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1506 if (__glibc_unlikely (clone_dest
== -1))
1508 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1509 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1510 if (__glibc_unlikely (! ok
))
1513 else if (dfa
->edests
[org_node
].nelem
== 0)
1515 /* In case of the node can't epsilon-transit, don't duplicate the
1516 destination and store the original destination as the
1517 destination of the node. */
1518 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1521 else if (dfa
->edests
[org_node
].nelem
== 1)
1523 /* In case of the node can epsilon-transit, and it has only one
1525 org_dest
= dfa
->edests
[org_node
].elems
[0];
1526 re_node_set_empty (dfa
->edests
+ clone_node
);
1527 /* If the node is root_node itself, it means the epsilon closure
1528 has a loop. Then tie it to the destination of the root_node. */
1529 if (org_node
== root_node
&& clone_node
!= org_node
)
1531 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1532 if (__glibc_unlikely (! ok
))
1536 /* In case the node has another constraint, append it. */
1537 constraint
|= dfa
->nodes
[org_node
].constraint
;
1538 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1539 if (__glibc_unlikely (clone_dest
== -1))
1541 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1542 if (__glibc_unlikely (! ok
))
1545 else /* dfa->edests[org_node].nelem == 2 */
1547 /* In case of the node can epsilon-transit, and it has two
1548 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1549 org_dest
= dfa
->edests
[org_node
].elems
[0];
1550 re_node_set_empty (dfa
->edests
+ clone_node
);
1551 /* Search for a duplicated node which satisfies the constraint. */
1552 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1553 if (clone_dest
== -1)
1555 /* There is no such duplicated node, create a new one. */
1557 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1558 if (__glibc_unlikely (clone_dest
== -1))
1560 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1561 if (__glibc_unlikely (! ok
))
1563 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1564 root_node
, constraint
);
1565 if (__glibc_unlikely (err
!= REG_NOERROR
))
1570 /* There is a duplicated node which satisfies the constraint,
1571 use it to avoid infinite loop. */
1572 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1573 if (__glibc_unlikely (! ok
))
1577 org_dest
= dfa
->edests
[org_node
].elems
[1];
1578 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1579 if (__glibc_unlikely (clone_dest
== -1))
1581 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1582 if (__glibc_unlikely (! ok
))
1585 org_node
= org_dest
;
1586 clone_node
= clone_dest
;
1591 /* Search for a node which is duplicated from the node ORG_NODE, and
1592 satisfies the constraint CONSTRAINT. */
1595 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1596 unsigned int constraint
)
1599 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1601 if (org_node
== dfa
->org_indices
[idx
]
1602 && constraint
== dfa
->nodes
[idx
].constraint
)
1603 return idx
; /* Found. */
1605 return -1; /* Not found. */
1608 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1609 Return the index of the new node, or -1 if insufficient storage is
1613 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1615 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1616 if (__glibc_likely (dup_idx
!= -1))
1618 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1619 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1620 dfa
->nodes
[dup_idx
].duplicated
= 1;
1622 /* Store the index of the original node. */
1623 dfa
->org_indices
[dup_idx
] = org_idx
;
1628 static reg_errcode_t
1629 calc_inveclosure (re_dfa_t
*dfa
)
1633 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1634 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1636 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1638 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1639 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1641 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1642 if (__glibc_unlikely (! ok
))
1650 /* Calculate "eclosure" for all the node in DFA. */
1652 static reg_errcode_t
1653 calc_eclosure (re_dfa_t
*dfa
)
1657 DEBUG_ASSERT (dfa
->nodes_len
> 0);
1659 /* For each nodes, calculate epsilon closure. */
1660 for (node_idx
= 0; ; ++node_idx
)
1663 re_node_set eclosure_elem
;
1664 if (node_idx
== dfa
->nodes_len
)
1672 DEBUG_ASSERT (dfa
->eclosures
[node_idx
].nelem
!= -1);
1674 /* If we have already calculated, skip it. */
1675 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1677 /* Calculate epsilon closure of 'node_idx'. */
1678 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1679 if (__glibc_unlikely (err
!= REG_NOERROR
))
1682 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1685 re_node_set_free (&eclosure_elem
);
1691 /* Calculate epsilon closure of NODE. */
1693 static reg_errcode_t
1694 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1698 re_node_set eclosure
;
1699 bool incomplete
= false;
1700 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1701 if (__glibc_unlikely (err
!= REG_NOERROR
))
1704 /* An epsilon closure includes itself. */
1705 eclosure
.elems
[eclosure
.nelem
++] = node
;
1707 /* This indicates that we are calculating this node now.
1708 We reference this value to avoid infinite loop. */
1709 dfa
->eclosures
[node
].nelem
= -1;
1711 /* If the current node has constraints, duplicate all nodes
1712 since they must inherit the constraints. */
1713 if (dfa
->nodes
[node
].constraint
1714 && dfa
->edests
[node
].nelem
1715 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1717 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1718 dfa
->nodes
[node
].constraint
);
1719 if (__glibc_unlikely (err
!= REG_NOERROR
))
1723 /* Expand each epsilon destination nodes. */
1724 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1725 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1727 re_node_set eclosure_elem
;
1728 Idx edest
= dfa
->edests
[node
].elems
[i
];
1729 /* If calculating the epsilon closure of 'edest' is in progress,
1730 return intermediate result. */
1731 if (dfa
->eclosures
[edest
].nelem
== -1)
1736 /* If we haven't calculated the epsilon closure of 'edest' yet,
1737 calculate now. Otherwise use calculated epsilon closure. */
1738 if (dfa
->eclosures
[edest
].nelem
== 0)
1740 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1741 if (__glibc_unlikely (err
!= REG_NOERROR
))
1745 eclosure_elem
= dfa
->eclosures
[edest
];
1746 /* Merge the epsilon closure of 'edest'. */
1747 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1748 if (__glibc_unlikely (err
!= REG_NOERROR
))
1750 /* If the epsilon closure of 'edest' is incomplete,
1751 the epsilon closure of this node is also incomplete. */
1752 if (dfa
->eclosures
[edest
].nelem
== 0)
1755 re_node_set_free (&eclosure_elem
);
1759 if (incomplete
&& !root
)
1760 dfa
->eclosures
[node
].nelem
= 0;
1762 dfa
->eclosures
[node
] = eclosure
;
1763 *new_set
= eclosure
;
1767 /* Functions for token which are used in the parser. */
1769 /* Fetch a token from INPUT.
1770 We must not use this function inside bracket expressions. */
1773 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1775 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1778 /* Peek a token from INPUT, and return the length of the token.
1779 We must not use this function inside bracket expressions. */
1782 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1786 if (re_string_eoi (input
))
1788 token
->type
= END_OF_RE
;
1792 c
= re_string_peek_byte (input
, 0);
1795 token
->word_char
= 0;
1796 #ifdef RE_ENABLE_I18N
1797 token
->mb_partial
= 0;
1798 if (input
->mb_cur_max
> 1
1799 && !re_string_first_byte (input
, re_string_cur_idx (input
)))
1801 token
->type
= CHARACTER
;
1802 token
->mb_partial
= 1;
1809 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1811 token
->type
= BACK_SLASH
;
1815 c2
= re_string_peek_byte_case (input
, 1);
1817 token
->type
= CHARACTER
;
1818 #ifdef RE_ENABLE_I18N
1819 if (input
->mb_cur_max
> 1)
1821 wint_t wc
= re_string_wchar_at (input
,
1822 re_string_cur_idx (input
) + 1);
1823 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1827 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1832 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1833 token
->type
= OP_ALT
;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax
& RE_NO_BK_REFS
))
1839 token
->type
= OP_BACK_REF
;
1840 token
->opr
.idx
= c2
- '1';
1844 if (!(syntax
& RE_NO_GNU_OPS
))
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= WORD_FIRST
;
1851 if (!(syntax
& RE_NO_GNU_OPS
))
1853 token
->type
= ANCHOR
;
1854 token
->opr
.ctx_type
= WORD_LAST
;
1858 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= ANCHOR
;
1861 token
->opr
.ctx_type
= WORD_DELIM
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1867 token
->type
= ANCHOR
;
1868 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= OP_WORD
;
1876 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= OP_NOTWORD
;
1880 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= OP_SPACE
;
1884 if (!(syntax
& RE_NO_GNU_OPS
))
1885 token
->type
= OP_NOTSPACE
;
1888 if (!(syntax
& RE_NO_GNU_OPS
))
1890 token
->type
= ANCHOR
;
1891 token
->opr
.ctx_type
= BUF_FIRST
;
1895 if (!(syntax
& RE_NO_GNU_OPS
))
1897 token
->type
= ANCHOR
;
1898 token
->opr
.ctx_type
= BUF_LAST
;
1902 if (!(syntax
& RE_NO_BK_PARENS
))
1903 token
->type
= OP_OPEN_SUBEXP
;
1906 if (!(syntax
& RE_NO_BK_PARENS
))
1907 token
->type
= OP_CLOSE_SUBEXP
;
1910 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1911 token
->type
= OP_DUP_PLUS
;
1914 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1915 token
->type
= OP_DUP_QUESTION
;
1918 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1919 token
->type
= OP_OPEN_DUP_NUM
;
1922 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1923 token
->type
= OP_CLOSE_DUP_NUM
;
1931 token
->type
= CHARACTER
;
1932 #ifdef RE_ENABLE_I18N
1933 if (input
->mb_cur_max
> 1)
1935 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1936 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1940 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1945 if (syntax
& RE_NEWLINE_ALT
)
1946 token
->type
= OP_ALT
;
1949 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1950 token
->type
= OP_ALT
;
1953 token
->type
= OP_DUP_ASTERISK
;
1956 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1957 token
->type
= OP_DUP_PLUS
;
1960 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1961 token
->type
= OP_DUP_QUESTION
;
1964 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1965 token
->type
= OP_OPEN_DUP_NUM
;
1968 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1969 token
->type
= OP_CLOSE_DUP_NUM
;
1972 if (syntax
& RE_NO_BK_PARENS
)
1973 token
->type
= OP_OPEN_SUBEXP
;
1976 if (syntax
& RE_NO_BK_PARENS
)
1977 token
->type
= OP_CLOSE_SUBEXP
;
1980 token
->type
= OP_OPEN_BRACKET
;
1983 token
->type
= OP_PERIOD
;
1986 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
))
1987 && re_string_cur_idx (input
) != 0)
1989 char prev
= re_string_peek_byte (input
, -1);
1990 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1993 token
->type
= ANCHOR
;
1994 token
->opr
.ctx_type
= LINE_FIRST
;
1997 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
)
1998 && re_string_cur_idx (input
) + 1 != re_string_length (input
))
2001 re_string_skip_bytes (input
, 1);
2002 peek_token (&next
, input
, syntax
);
2003 re_string_skip_bytes (input
, -1);
2004 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2007 token
->type
= ANCHOR
;
2008 token
->opr
.ctx_type
= LINE_LAST
;
2016 /* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2020 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2023 if (re_string_eoi (input
))
2025 token
->type
= END_OF_RE
;
2028 c
= re_string_peek_byte (input
, 0);
2031 #ifdef RE_ENABLE_I18N
2032 if (input
->mb_cur_max
> 1
2033 && !re_string_first_byte (input
, re_string_cur_idx (input
)))
2035 token
->type
= CHARACTER
;
2038 #endif /* RE_ENABLE_I18N */
2040 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2041 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2043 /* In this case, '\' escape a character. */
2045 re_string_skip_bytes (input
, 1);
2046 c2
= re_string_peek_byte (input
, 0);
2048 token
->type
= CHARACTER
;
2051 if (c
== '[') /* '[' is a special char in a bracket exps. */
2055 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2056 c2
= re_string_peek_byte (input
, 1);
2064 token
->type
= OP_OPEN_COLL_ELEM
;
2068 token
->type
= OP_OPEN_EQUIV_CLASS
;
2072 if (syntax
& RE_CHAR_CLASSES
)
2074 token
->type
= OP_OPEN_CHAR_CLASS
;
2079 token
->type
= CHARACTER
;
2089 token
->type
= OP_CHARSET_RANGE
;
2092 token
->type
= OP_CLOSE_BRACKET
;
2095 token
->type
= OP_NON_MATCH_LIST
;
2098 token
->type
= CHARACTER
;
2103 /* Functions for parser. */
2105 /* Entry point of the parser.
2106 Parse the regular expression REGEXP and return the structure tree.
2107 If an error occurs, ERR is set by error code, and return NULL.
2108 This function build the following tree, from regular expression <reg_exp>:
2114 CAT means concatenation.
2115 EOR means end of regular expression. */
2118 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2121 re_dfa_t
*dfa
= preg
->buffer
;
2122 bin_tree_t
*tree
, *eor
, *root
;
2123 re_token_t current_token
;
2124 dfa
->syntax
= syntax
;
2125 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2126 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2127 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2129 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2131 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2134 if (__glibc_unlikely (eor
== NULL
|| root
== NULL
))
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2149 ALT means alternative, which represents the operator '|'. */
2152 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2153 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2155 re_dfa_t
*dfa
= preg
->buffer
;
2156 bin_tree_t
*tree
, *branch
= NULL
;
2157 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2158 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2159 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2162 while (token
->type
== OP_ALT
)
2164 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2165 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2166 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2168 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2169 dfa
->completed_bkref_map
= initial_bkref_map
;
2170 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2171 if (__glibc_unlikely (*err
!= REG_NOERROR
&& branch
== NULL
))
2174 postorder (tree
, free_tree
, NULL
);
2177 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2181 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2182 if (__glibc_unlikely (tree
== NULL
))
2191 /* This function build the following tree, from regular expression
2198 CAT means concatenation. */
2201 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2202 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2204 bin_tree_t
*tree
, *expr
;
2205 re_dfa_t
*dfa
= preg
->buffer
;
2206 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2207 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2210 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2211 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2213 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2214 if (__glibc_unlikely (*err
!= REG_NOERROR
&& expr
== NULL
))
2217 postorder (tree
, free_tree
, NULL
);
2220 if (tree
!= NULL
&& expr
!= NULL
)
2222 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2223 if (newtree
== NULL
)
2225 postorder (expr
, free_tree
, NULL
);
2226 postorder (tree
, free_tree
, NULL
);
2232 else if (tree
== NULL
)
2234 /* Otherwise expr == NULL, we don't need to create new tree. */
2239 /* This function build the following tree, from regular expression a*:
2246 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2247 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2249 re_dfa_t
*dfa
= preg
->buffer
;
2251 switch (token
->type
)
2254 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2255 if (__glibc_unlikely (tree
== NULL
))
2260 #ifdef RE_ENABLE_I18N
2261 if (dfa
->mb_cur_max
> 1)
2263 while (!re_string_eoi (regexp
)
2264 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2266 bin_tree_t
*mbc_remain
;
2267 fetch_token (token
, regexp
, syntax
);
2268 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2269 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2270 if (__glibc_unlikely (mbc_remain
== NULL
|| tree
== NULL
))
2280 case OP_OPEN_SUBEXP
:
2281 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2282 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2286 case OP_OPEN_BRACKET
:
2287 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2288 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2293 if (!__glibc_likely (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
)))
2298 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2299 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2300 if (__glibc_unlikely (tree
== NULL
))
2306 dfa
->has_mb_node
= 1;
2309 case OP_OPEN_DUP_NUM
:
2310 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2316 case OP_DUP_ASTERISK
:
2318 case OP_DUP_QUESTION
:
2319 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2324 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2326 fetch_token (token
, regexp
, syntax
);
2327 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2330 case OP_CLOSE_SUBEXP
:
2331 if ((token
->type
== OP_CLOSE_SUBEXP
)
2332 && !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2338 case OP_CLOSE_DUP_NUM
:
2339 /* We treat it as a normal character. */
2341 /* Then we can these characters as normal characters. */
2342 token
->type
= CHARACTER
;
2343 /* mb_partial and word_char bits should be initialized already
2345 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2346 if (__glibc_unlikely (tree
== NULL
))
2354 if ((token
->opr
.ctx_type
2355 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2356 && dfa
->word_ops_used
== 0)
2357 init_word_char (dfa
);
2358 if (token
->opr
.ctx_type
== WORD_DELIM
2359 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2361 bin_tree_t
*tree_first
, *tree_last
;
2362 if (token
->opr
.ctx_type
== WORD_DELIM
)
2364 token
->opr
.ctx_type
= WORD_FIRST
;
2365 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2366 token
->opr
.ctx_type
= WORD_LAST
;
2370 token
->opr
.ctx_type
= INSIDE_WORD
;
2371 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2372 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2374 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2375 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2376 if (__glibc_unlikely (tree_first
== NULL
|| tree_last
== NULL
2385 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2386 if (__glibc_unlikely (tree
== NULL
))
2392 /* We must return here, since ANCHORs can't be followed
2393 by repetition operators.
2394 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2395 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2396 fetch_token (token
, regexp
, syntax
);
2400 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2401 if (__glibc_unlikely (tree
== NULL
))
2406 if (dfa
->mb_cur_max
> 1)
2407 dfa
->has_mb_node
= 1;
2412 tree
= build_charclass_op (dfa
, regexp
->trans
,
2415 token
->type
== OP_NOTWORD
, err
);
2416 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2422 tree
= build_charclass_op (dfa
, regexp
->trans
,
2425 token
->type
== OP_NOTSPACE
, err
);
2426 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2439 /* Must not happen? */
2440 DEBUG_ASSERT (false);
2443 fetch_token (token
, regexp
, syntax
);
2445 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2446 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2448 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2450 if (__glibc_unlikely (*err
!= REG_NOERROR
&& dup_tree
== NULL
))
2453 postorder (tree
, free_tree
, NULL
);
2457 /* In BRE consecutive duplications are not allowed. */
2458 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2459 && (token
->type
== OP_DUP_ASTERISK
2460 || token
->type
== OP_OPEN_DUP_NUM
))
2463 postorder (tree
, free_tree
, NULL
);
2472 /* This function build the following tree, from regular expression
2480 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2481 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2483 re_dfa_t
*dfa
= preg
->buffer
;
2486 cur_nsub
= preg
->re_nsub
++;
2488 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2490 /* The subexpression may be a null string. */
2491 if (token
->type
== OP_CLOSE_SUBEXP
)
2495 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2496 if (__glibc_unlikely (*err
== REG_NOERROR
2497 && token
->type
!= OP_CLOSE_SUBEXP
))
2500 postorder (tree
, free_tree
, NULL
);
2503 if (__glibc_unlikely (*err
!= REG_NOERROR
))
2507 if (cur_nsub
<= '9' - '1')
2508 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2510 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2511 if (__glibc_unlikely (tree
== NULL
))
2516 tree
->token
.opr
.idx
= cur_nsub
;
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2523 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2524 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2526 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2527 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2528 re_token_t start_token
= *token
;
2530 if (token
->type
== OP_OPEN_DUP_NUM
)
2533 start
= fetch_number (regexp
, token
, syntax
);
2536 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2537 start
= 0; /* We treat "{,m}" as "{0,m}". */
2540 *err
= REG_BADBR
; /* <re>{} is invalid. */
2544 if (__glibc_likely (start
!= -2))
2546 /* We treat "{n}" as "{n,n}". */
2547 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2548 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2549 ? fetch_number (regexp
, token
, syntax
) : -2));
2551 if (__glibc_unlikely (start
== -2 || end
== -2))
2553 /* Invalid sequence. */
2554 if (__glibc_unlikely (!(syntax
& RE_INVALID_INTERVAL_ORD
)))
2556 if (token
->type
== END_OF_RE
)
2564 /* If the syntax bit is set, rollback. */
2565 re_string_set_index (regexp
, start_idx
);
2566 *token
= start_token
;
2567 token
->type
= CHARACTER
;
2568 /* mb_partial and word_char bits should be already initialized by
2573 if (__glibc_unlikely ((end
!= -1 && start
> end
)
2574 || token
->type
!= OP_CLOSE_DUP_NUM
))
2576 /* First number greater than second. */
2581 if (__glibc_unlikely (RE_DUP_MAX
< (end
== -1 ? start
: end
)))
2589 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2590 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2593 fetch_token (token
, regexp
, syntax
);
2595 if (__glibc_unlikely (elem
== NULL
))
2597 if (__glibc_unlikely (start
== 0 && end
== 0))
2599 postorder (elem
, free_tree
, NULL
);
2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2604 if (__glibc_unlikely (start
> 0))
2607 for (i
= 2; i
<= start
; ++i
)
2609 elem
= duplicate_tree (elem
, dfa
);
2610 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2611 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2612 goto parse_dup_op_espace
;
2618 /* Duplicate ELEM before it is marked optional. */
2619 elem
= duplicate_tree (elem
, dfa
);
2620 if (__glibc_unlikely (elem
== NULL
))
2621 goto parse_dup_op_espace
;
2627 if (elem
->token
.type
== SUBEXP
)
2629 uintptr_t subidx
= elem
->token
.opr
.idx
;
2630 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2633 tree
= create_tree (dfa
, elem
, NULL
,
2634 (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2635 if (__glibc_unlikely (tree
== NULL
))
2636 goto parse_dup_op_espace
;
2638 /* This loop is actually executed only when end != -1,
2639 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2640 already created the start+1-th copy. */
2641 if (TYPE_SIGNED (Idx
) || end
!= -1)
2642 for (i
= start
+ 2; i
<= end
; ++i
)
2644 elem
= duplicate_tree (elem
, dfa
);
2645 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2646 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2647 goto parse_dup_op_espace
;
2649 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2650 if (__glibc_unlikely (tree
== NULL
))
2651 goto parse_dup_op_espace
;
2655 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2659 parse_dup_op_espace
:
2664 /* Size of the names for collating symbol/equivalence_class/character_class.
2665 I'm not sure, but maybe enough. */
2666 #define BRACKET_NAME_BUF_SIZE 32
2670 # ifdef RE_ENABLE_I18N
2671 /* Convert the byte B to the corresponding wide character. In a
2672 unibyte locale, treat B as itself. In a multibyte locale, return
2673 WEOF if B is an encoding error. */
2675 parse_byte (unsigned char b
, re_charset_t
*mbcset
)
2677 return mbcset
== NULL
? b
: __btowc (b
);
2681 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2682 Build the range expression which starts from START_ELEM, and ends
2683 at END_ELEM. The result are written to MBCSET and SBCSET.
2684 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2685 mbcset->range_ends, is a pointer argument since we may
2688 static reg_errcode_t
2689 # ifdef RE_ENABLE_I18N
2690 build_range_exp (const reg_syntax_t syntax
,
2692 re_charset_t
*mbcset
,
2694 const bracket_elem_t
*start_elem
,
2695 const bracket_elem_t
*end_elem
)
2696 # else /* not RE_ENABLE_I18N */
2697 build_range_exp (const reg_syntax_t syntax
,
2699 const bracket_elem_t
*start_elem
,
2700 const bracket_elem_t
*end_elem
)
2701 # endif /* not RE_ENABLE_I18N */
2703 unsigned int start_ch
, end_ch
;
2704 /* Equivalence Classes and Character Classes can't be a range start/end. */
2705 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2706 || start_elem
->type
== CHAR_CLASS
2707 || end_elem
->type
== EQUIV_CLASS
2708 || end_elem
->type
== CHAR_CLASS
))
2711 /* We can handle no multi character collating elements without libc
2713 if (__glibc_unlikely ((start_elem
->type
== COLL_SYM
2714 && strlen ((char *) start_elem
->opr
.name
) > 1)
2715 || (end_elem
->type
== COLL_SYM
2716 && strlen ((char *) end_elem
->opr
.name
) > 1)))
2717 return REG_ECOLLATE
;
2719 # ifdef RE_ENABLE_I18N
2725 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2726 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2728 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2729 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2731 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2732 ? parse_byte (start_ch
, mbcset
) : start_elem
->opr
.wch
);
2733 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2734 ? parse_byte (end_ch
, mbcset
) : end_elem
->opr
.wch
);
2735 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2736 return REG_ECOLLATE
;
2737 else if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2738 && start_wc
> end_wc
))
2741 /* Got valid collation sequence values, add them as a new entry.
2742 However, for !_LIBC we have no collation elements: if the
2743 character set is single byte, the single byte character set
2744 that we build below suffices. parse_bracket_exp passes
2745 no MBCSET if dfa->mb_cur_max == 1. */
2748 /* Check the space of the arrays. */
2749 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
2751 /* There is not enough space, need realloc. */
2752 wchar_t *new_array_start
, *new_array_end
;
2755 /* +1 in case of mbcset->nranges is 0. */
2756 new_nranges
= 2 * mbcset
->nranges
+ 1;
2757 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2758 are NULL if *range_alloc == 0. */
2759 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2761 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2764 if (__glibc_unlikely (new_array_start
== NULL
2765 || new_array_end
== NULL
))
2767 re_free (new_array_start
);
2768 re_free (new_array_end
);
2772 mbcset
->range_starts
= new_array_start
;
2773 mbcset
->range_ends
= new_array_end
;
2774 *range_alloc
= new_nranges
;
2777 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2778 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2781 /* Build the table for single byte characters. */
2782 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2784 if (start_wc
<= wc
&& wc
<= end_wc
)
2785 bitset_set (sbcset
, wc
);
2788 # else /* not RE_ENABLE_I18N */
2791 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2792 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2794 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2795 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2797 if (start_ch
> end_ch
)
2799 /* Build the table for single byte characters. */
2800 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2801 if (start_ch
<= ch
&& ch
<= end_ch
)
2802 bitset_set (sbcset
, ch
);
2804 # endif /* not RE_ENABLE_I18N */
2807 #endif /* not _LIBC */
2810 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2811 Build the collating element which is represented by NAME.
2812 The result are written to MBCSET and SBCSET.
2813 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2814 pointer argument since we may update it. */
2816 static reg_errcode_t
2817 # ifdef RE_ENABLE_I18N
2818 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2819 Idx
*coll_sym_alloc
, const unsigned char *name
)
2820 # else /* not RE_ENABLE_I18N */
2821 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2822 # endif /* not RE_ENABLE_I18N */
2824 size_t name_len
= strlen ((const char *) name
);
2825 if (__glibc_unlikely (name_len
!= 1))
2826 return REG_ECOLLATE
;
2829 bitset_set (sbcset
, name
[0]);
2833 #endif /* not _LIBC */
2836 /* Local function for parse_bracket_exp used in _LIBC environment.
2837 Seek the collating symbol entry corresponding to NAME.
2838 Return the index of the symbol in the SYMB_TABLE,
2839 or -1 if not found. */
2841 static inline int32_t
2842 __attribute__ ((always_inline
))
2843 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
,
2844 const int32_t *symb_table
, int32_t table_size
,
2845 const unsigned char *extra
)
2849 for (elem
= 0; elem
< table_size
; elem
++)
2850 if (symb_table
[2 * elem
] != 0)
2852 int32_t idx
= symb_table
[2 * elem
+ 1];
2853 /* Skip the name of collating element name. */
2854 idx
+= 1 + extra
[idx
];
2855 if (/* Compare the length of the name. */
2856 name_len
== extra
[idx
]
2857 /* Compare the name. */
2858 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2859 /* Yep, this is the entry. */
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866 Look up the collation sequence value of BR_ELEM.
2867 Return the value if succeeded, UINT_MAX otherwise. */
2869 static inline unsigned int
2870 __attribute__ ((always_inline
))
2871 lookup_collation_sequence_value (bracket_elem_t
*br_elem
, uint32_t nrules
,
2872 const unsigned char *collseqmb
,
2873 const char *collseqwc
, int32_t table_size
,
2874 const int32_t *symb_table
,
2875 const unsigned char *extra
)
2877 if (br_elem
->type
== SB_CHAR
)
2879 /* if (MB_CUR_MAX == 1) */
2881 return collseqmb
[br_elem
->opr
.ch
];
2884 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2885 return __collseq_table_lookup (collseqwc
, wc
);
2888 else if (br_elem
->type
== MB_CHAR
)
2891 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2893 else if (br_elem
->type
== COLL_SYM
)
2895 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2899 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2901 symb_table
, table_size
,
2905 /* We found the entry. */
2906 idx
= symb_table
[2 * elem
+ 1];
2907 /* Skip the name of collating element name. */
2908 idx
+= 1 + extra
[idx
];
2909 /* Skip the byte sequence of the collating element. */
2910 idx
+= 1 + extra
[idx
];
2911 /* Adjust for the alignment. */
2912 idx
= (idx
+ 3) & ~3;
2913 /* Skip the multibyte collation sequence value. */
2914 idx
+= sizeof (unsigned int);
2915 /* Skip the wide char sequence of the collating element. */
2916 idx
+= sizeof (unsigned int) *
2917 (1 + *(unsigned int *) (extra
+ idx
));
2918 /* Return the collation sequence value. */
2919 return *(unsigned int *) (extra
+ idx
);
2921 else if (sym_name_len
== 1)
2923 /* No valid character. Match it as a single byte
2925 return collseqmb
[br_elem
->opr
.name
[0]];
2928 else if (sym_name_len
== 1)
2929 return collseqmb
[br_elem
->opr
.name
[0]];
2934 /* Local function for parse_bracket_exp used in _LIBC environment.
2935 Build the range expression which starts from START_ELEM, and ends
2936 at END_ELEM. The result are written to MBCSET and SBCSET.
2937 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2938 mbcset->range_ends, is a pointer argument since we may
2941 static inline reg_errcode_t
2942 __attribute__ ((always_inline
))
2943 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, Idx
*range_alloc
,
2944 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
,
2945 re_dfa_t
*dfa
, reg_syntax_t syntax
, uint32_t nrules
,
2946 const unsigned char *collseqmb
, const char *collseqwc
,
2947 int32_t table_size
, const int32_t *symb_table
,
2948 const unsigned char *extra
)
2951 uint32_t start_collseq
;
2952 uint32_t end_collseq
;
2954 /* Equivalence Classes and Character Classes can't be a range
2956 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2957 || start_elem
->type
== CHAR_CLASS
2958 || end_elem
->type
== EQUIV_CLASS
2959 || end_elem
->type
== CHAR_CLASS
))
2962 /* FIXME: Implement rational ranges here, too. */
2963 start_collseq
= lookup_collation_sequence_value (start_elem
, nrules
, collseqmb
, collseqwc
,
2964 table_size
, symb_table
, extra
);
2965 end_collseq
= lookup_collation_sequence_value (end_elem
, nrules
, collseqmb
, collseqwc
,
2966 table_size
, symb_table
, extra
);
2967 /* Check start/end collation sequence values. */
2968 if (__glibc_unlikely (start_collseq
== UINT_MAX
2969 || end_collseq
== UINT_MAX
))
2970 return REG_ECOLLATE
;
2971 if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2972 && start_collseq
> end_collseq
))
2975 /* Got valid collation sequence values, add them as a new entry.
2976 However, if we have no collation elements, and the character set
2977 is single byte, the single byte character set that we
2978 build below suffices. */
2979 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2981 /* Check the space of the arrays. */
2982 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
2984 /* There is not enough space, need realloc. */
2985 uint32_t *new_array_start
;
2986 uint32_t *new_array_end
;
2989 /* +1 in case of mbcset->nranges is 0. */
2990 new_nranges
= 2 * mbcset
->nranges
+ 1;
2991 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2993 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2996 if (__glibc_unlikely (new_array_start
== NULL
2997 || new_array_end
== NULL
))
3000 mbcset
->range_starts
= new_array_start
;
3001 mbcset
->range_ends
= new_array_end
;
3002 *range_alloc
= new_nranges
;
3005 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
3006 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
3009 /* Build the table for single byte characters. */
3010 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3012 uint32_t ch_collseq
;
3013 /* if (MB_CUR_MAX == 1) */
3015 ch_collseq
= collseqmb
[ch
];
3017 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3018 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3019 bitset_set (sbcset
, ch
);
3024 /* Local function for parse_bracket_exp used in _LIBC environment.
3025 Build the collating element which is represented by NAME.
3026 The result are written to MBCSET and SBCSET.
3027 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3028 pointer argument since we may update it. */
3030 static inline reg_errcode_t
3031 __attribute__ ((always_inline
))
3032 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3033 Idx
*coll_sym_alloc
, const unsigned char *name
,
3034 uint32_t nrules
, int32_t table_size
,
3035 const int32_t *symb_table
, const unsigned char *extra
)
3038 size_t name_len
= strlen ((const char *) name
);
3041 elem
= seek_collating_symbol_entry (name
, name_len
, symb_table
,
3045 /* We found the entry. */
3046 idx
= symb_table
[2 * elem
+ 1];
3047 /* Skip the name of collating element name. */
3048 idx
+= 1 + extra
[idx
];
3050 else if (name_len
== 1)
3052 /* No valid character, treat it as a normal
3054 bitset_set (sbcset
, name
[0]);
3058 return REG_ECOLLATE
;
3060 /* Got valid collation sequence, add it as a new entry. */
3061 /* Check the space of the arrays. */
3062 if (__glibc_unlikely (*coll_sym_alloc
== mbcset
->ncoll_syms
))
3064 /* Not enough, realloc it. */
3065 /* +1 in case of mbcset->ncoll_syms is 0. */
3066 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3067 /* Use realloc since mbcset->coll_syms is NULL
3069 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3070 new_coll_sym_alloc
);
3071 if (__glibc_unlikely (new_coll_syms
== NULL
))
3073 mbcset
->coll_syms
= new_coll_syms
;
3074 *coll_sym_alloc
= new_coll_sym_alloc
;
3076 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3081 if (__glibc_unlikely (name_len
!= 1))
3082 return REG_ECOLLATE
;
3085 bitset_set (sbcset
, name
[0]);
3092 /* This function parse bracket expression like "[abc]", "[a-c]",
3096 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
3097 reg_syntax_t syntax
, reg_errcode_t
*err
)
3100 const unsigned char *collseqmb
;
3101 const char *collseqwc
= NULL
;
3103 int32_t table_size
= 0;
3104 const int32_t *symb_table
= NULL
;
3105 const unsigned char *extra
= NULL
;
3108 re_token_t br_token
;
3109 re_bitset_ptr_t sbcset
;
3110 #ifdef RE_ENABLE_I18N
3111 re_charset_t
*mbcset
;
3112 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3113 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3114 #endif /* not RE_ENABLE_I18N */
3115 bool non_match
= false;
3116 bin_tree_t
*work_tree
;
3118 bool first_round
= true;
3120 collseqmb
= (const unsigned char *)
3121 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3122 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3128 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3129 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3130 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3131 _NL_COLLATE_SYMB_TABLEMB
);
3132 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3133 _NL_COLLATE_SYMB_EXTRAMB
);
3136 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3137 #ifdef RE_ENABLE_I18N
3138 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3139 #endif /* RE_ENABLE_I18N */
3140 #ifdef RE_ENABLE_I18N
3141 if (__glibc_unlikely (sbcset
== NULL
|| mbcset
== NULL
))
3143 if (__glibc_unlikely (sbcset
== NULL
))
3144 #endif /* RE_ENABLE_I18N */
3147 #ifdef RE_ENABLE_I18N
3154 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3155 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3158 goto parse_bracket_exp_free_return
;
3160 if (token
->type
== OP_NON_MATCH_LIST
)
3162 #ifdef RE_ENABLE_I18N
3163 mbcset
->non_match
= 1;
3164 #endif /* not RE_ENABLE_I18N */
3166 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3167 bitset_set (sbcset
, '\n');
3168 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3169 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3170 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3173 goto parse_bracket_exp_free_return
;
3177 /* We treat the first ']' as a normal character. */
3178 if (token
->type
== OP_CLOSE_BRACKET
)
3179 token
->type
= CHARACTER
;
3183 bracket_elem_t start_elem
, end_elem
;
3184 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3185 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3188 bool is_range_exp
= false;
3191 start_elem
.opr
.name
= start_name_buf
;
3192 start_elem
.type
= COLL_SYM
;
3193 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3194 syntax
, first_round
);
3195 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3198 goto parse_bracket_exp_free_return
;
3200 first_round
= false;
3202 /* Get information about the next token. We need it in any case. */
3203 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3205 /* Do not check for ranges if we know they are not allowed. */
3206 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3208 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3211 goto parse_bracket_exp_free_return
;
3213 if (token
->type
== OP_CHARSET_RANGE
)
3215 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3216 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3217 if (__glibc_unlikely (token2
.type
== END_OF_RE
))
3220 goto parse_bracket_exp_free_return
;
3222 if (token2
.type
== OP_CLOSE_BRACKET
)
3224 /* We treat the last '-' as a normal character. */
3225 re_string_skip_bytes (regexp
, -token_len
);
3226 token
->type
= CHARACTER
;
3229 is_range_exp
= true;
3233 if (is_range_exp
== true)
3235 end_elem
.opr
.name
= end_name_buf
;
3236 end_elem
.type
= COLL_SYM
;
3237 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3239 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3242 goto parse_bracket_exp_free_return
;
3245 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3248 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3249 &start_elem
, &end_elem
,
3250 dfa
, syntax
, nrules
, collseqmb
, collseqwc
,
3251 table_size
, symb_table
, extra
);
3253 # ifdef RE_ENABLE_I18N
3254 *err
= build_range_exp (syntax
, sbcset
,
3255 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3256 &range_alloc
, &start_elem
, &end_elem
);
3258 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3260 #endif /* RE_ENABLE_I18N */
3261 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3262 goto parse_bracket_exp_free_return
;
3266 switch (start_elem
.type
)
3269 bitset_set (sbcset
, start_elem
.opr
.ch
);
3271 #ifdef RE_ENABLE_I18N
3273 /* Check whether the array has enough space. */
3274 if (__glibc_unlikely (mbchar_alloc
== mbcset
->nmbchars
))
3276 wchar_t *new_mbchars
;
3277 /* Not enough, realloc it. */
3278 /* +1 in case of mbcset->nmbchars is 0. */
3279 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3280 /* Use realloc since array is NULL if *alloc == 0. */
3281 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3283 if (__glibc_unlikely (new_mbchars
== NULL
))
3284 goto parse_bracket_exp_espace
;
3285 mbcset
->mbchars
= new_mbchars
;
3287 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3289 #endif /* RE_ENABLE_I18N */
3291 *err
= build_equiv_class (sbcset
,
3292 #ifdef RE_ENABLE_I18N
3293 mbcset
, &equiv_class_alloc
,
3294 #endif /* RE_ENABLE_I18N */
3295 start_elem
.opr
.name
);
3296 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3297 goto parse_bracket_exp_free_return
;
3300 *err
= build_collating_symbol (sbcset
,
3301 #ifdef RE_ENABLE_I18N
3302 mbcset
, &coll_sym_alloc
,
3303 #endif /* RE_ENABLE_I18N */
3304 start_elem
.opr
.name
,
3305 nrules
, table_size
, symb_table
, extra
);
3306 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3307 goto parse_bracket_exp_free_return
;
3310 *err
= build_charclass (regexp
->trans
, sbcset
,
3311 #ifdef RE_ENABLE_I18N
3312 mbcset
, &char_class_alloc
,
3313 #endif /* RE_ENABLE_I18N */
3314 (const char *) start_elem
.opr
.name
,
3316 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3317 goto parse_bracket_exp_free_return
;
3320 DEBUG_ASSERT (false);
3324 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3327 goto parse_bracket_exp_free_return
;
3329 if (token
->type
== OP_CLOSE_BRACKET
)
3333 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3335 /* If it is non-matching list. */
3337 bitset_not (sbcset
);
3339 #ifdef RE_ENABLE_I18N
3340 /* Ensure only single byte characters are set. */
3341 if (dfa
->mb_cur_max
> 1)
3342 bitset_mask (sbcset
, dfa
->sb_char
);
3344 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3345 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3346 || mbcset
->non_match
)))
3348 bin_tree_t
*mbc_tree
;
3350 /* Build a tree for complex bracket. */
3351 dfa
->has_mb_node
= 1;
3352 br_token
.type
= COMPLEX_BRACKET
;
3353 br_token
.opr
.mbcset
= mbcset
;
3354 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3355 if (__glibc_unlikely (mbc_tree
== NULL
))
3356 goto parse_bracket_exp_espace
;
3357 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3358 if (sbcset
[sbc_idx
])
3360 /* If there are no bits set in sbcset, there is no point
3361 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3362 if (sbc_idx
< BITSET_WORDS
)
3364 /* Build a tree for simple bracket. */
3365 br_token
.type
= SIMPLE_BRACKET
;
3366 br_token
.opr
.sbcset
= sbcset
;
3367 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3368 if (__glibc_unlikely (work_tree
== NULL
))
3369 goto parse_bracket_exp_espace
;
3371 /* Then join them by ALT node. */
3372 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3373 if (__glibc_unlikely (work_tree
== NULL
))
3374 goto parse_bracket_exp_espace
;
3379 work_tree
= mbc_tree
;
3383 #endif /* not RE_ENABLE_I18N */
3385 #ifdef RE_ENABLE_I18N
3386 free_charset (mbcset
);
3388 /* Build a tree for simple bracket. */
3389 br_token
.type
= SIMPLE_BRACKET
;
3390 br_token
.opr
.sbcset
= sbcset
;
3391 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3392 if (__glibc_unlikely (work_tree
== NULL
))
3393 goto parse_bracket_exp_espace
;
3397 parse_bracket_exp_espace
:
3399 parse_bracket_exp_free_return
:
3401 #ifdef RE_ENABLE_I18N
3402 free_charset (mbcset
);
3403 #endif /* RE_ENABLE_I18N */
3407 /* Parse an element in the bracket expression. */
3409 static reg_errcode_t
3410 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3411 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3412 reg_syntax_t syntax
, bool accept_hyphen
)
3414 #ifdef RE_ENABLE_I18N
3416 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3417 if (cur_char_size
> 1)
3419 elem
->type
= MB_CHAR
;
3420 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3421 re_string_skip_bytes (regexp
, cur_char_size
);
3424 #endif /* RE_ENABLE_I18N */
3425 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3426 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3427 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3428 return parse_bracket_symbol (elem
, regexp
, token
);
3429 if (__glibc_unlikely (token
->type
== OP_CHARSET_RANGE
) && !accept_hyphen
)
3431 /* A '-' must only appear as anything but a range indicator before
3432 the closing bracket. Everything else is an error. */
3434 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3435 if (token2
.type
!= OP_CLOSE_BRACKET
)
3436 /* The actual error value is not standardized since this whole
3437 case is undefined. But ERANGE makes good sense. */
3440 elem
->type
= SB_CHAR
;
3441 elem
->opr
.ch
= token
->opr
.c
;
3445 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3446 such as [:<character_class>:], [.<collating_element>.], and
3447 [=<equivalent_class>=]. */
3449 static reg_errcode_t
3450 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3453 unsigned char ch
, delim
= token
->opr
.c
;
3455 if (re_string_eoi(regexp
))
3459 if (i
>= BRACKET_NAME_BUF_SIZE
)
3461 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3462 ch
= re_string_fetch_byte_case (regexp
);
3464 ch
= re_string_fetch_byte (regexp
);
3465 if (re_string_eoi(regexp
))
3467 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3469 elem
->opr
.name
[i
] = ch
;
3471 re_string_skip_bytes (regexp
, 1);
3472 elem
->opr
.name
[i
] = '\0';
3473 switch (token
->type
)
3475 case OP_OPEN_COLL_ELEM
:
3476 elem
->type
= COLL_SYM
;
3478 case OP_OPEN_EQUIV_CLASS
:
3479 elem
->type
= EQUIV_CLASS
;
3481 case OP_OPEN_CHAR_CLASS
:
3482 elem
->type
= CHAR_CLASS
;
3490 /* Helper function for parse_bracket_exp.
3491 Build the equivalence class which is represented by NAME.
3492 The result are written to MBCSET and SBCSET.
3493 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3494 is a pointer argument since we may update it. */
3496 static reg_errcode_t
3497 #ifdef RE_ENABLE_I18N
3498 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3499 Idx
*equiv_class_alloc
, const unsigned char *name
)
3500 #else /* not RE_ENABLE_I18N */
3501 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3502 #endif /* not RE_ENABLE_I18N */
3505 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3508 const int32_t *table
, *indirect
;
3509 const unsigned char *weights
, *extra
, *cp
;
3510 unsigned char char_buf
[2];
3514 /* Calculate the index for equivalence class. */
3516 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3517 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3518 _NL_COLLATE_WEIGHTMB
);
3519 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3520 _NL_COLLATE_EXTRAMB
);
3521 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3522 _NL_COLLATE_INDIRECTMB
);
3523 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3524 if (__glibc_unlikely (idx1
== 0 || *cp
!= '\0'))
3525 /* This isn't a valid character. */
3526 return REG_ECOLLATE
;
3528 /* Build single byte matching table for this equivalence class. */
3529 len
= weights
[idx1
& 0xffffff];
3530 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3534 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3539 /* This isn't a valid character. */
3541 /* Compare only if the length matches and the collation rule
3542 index is the same. */
3543 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24)
3544 && memcmp (weights
+ (idx1
& 0xffffff) + 1,
3545 weights
+ (idx2
& 0xffffff) + 1, len
) == 0)
3546 bitset_set (sbcset
, ch
);
3548 /* Check whether the array has enough space. */
3549 if (__glibc_unlikely (*equiv_class_alloc
== mbcset
->nequiv_classes
))
3551 /* Not enough, realloc it. */
3552 /* +1 in case of mbcset->nequiv_classes is 0. */
3553 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3554 /* Use realloc since the array is NULL if *alloc == 0. */
3555 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3557 new_equiv_class_alloc
);
3558 if (__glibc_unlikely (new_equiv_classes
== NULL
))
3560 mbcset
->equiv_classes
= new_equiv_classes
;
3561 *equiv_class_alloc
= new_equiv_class_alloc
;
3563 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3568 if (__glibc_unlikely (strlen ((const char *) name
) != 1))
3569 return REG_ECOLLATE
;
3570 bitset_set (sbcset
, *name
);
3575 /* Helper function for parse_bracket_exp.
3576 Build the character class which is represented by NAME.
3577 The result are written to MBCSET and SBCSET.
3578 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3579 is a pointer argument since we may update it. */
3581 static reg_errcode_t
3582 #ifdef RE_ENABLE_I18N
3583 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3584 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3585 const char *class_name
, reg_syntax_t syntax
)
3586 #else /* not RE_ENABLE_I18N */
3587 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3588 const char *class_name
, reg_syntax_t syntax
)
3589 #endif /* not RE_ENABLE_I18N */
3592 const char *name
= class_name
;
3594 /* In case of REG_ICASE "upper" and "lower" match the both of
3595 upper and lower cases. */
3596 if ((syntax
& RE_ICASE
)
3597 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3600 #ifdef RE_ENABLE_I18N
3601 /* Check the space of the arrays. */
3602 if (__glibc_unlikely (*char_class_alloc
== mbcset
->nchar_classes
))
3604 /* Not enough, realloc it. */
3605 /* +1 in case of mbcset->nchar_classes is 0. */
3606 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3607 /* Use realloc since array is NULL if *alloc == 0. */
3608 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3609 new_char_class_alloc
);
3610 if (__glibc_unlikely (new_char_classes
== NULL
))
3612 mbcset
->char_classes
= new_char_classes
;
3613 *char_class_alloc
= new_char_class_alloc
;
3615 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3616 #endif /* RE_ENABLE_I18N */
3618 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3620 if (__glibc_unlikely (trans != NULL)) \
3622 for (i = 0; i < SBC_MAX; ++i) \
3623 if (ctype_func (i)) \
3624 bitset_set (sbcset, trans[i]); \
3628 for (i = 0; i < SBC_MAX; ++i) \
3629 if (ctype_func (i)) \
3630 bitset_set (sbcset, i); \
3634 if (strcmp (name
, "alnum") == 0)
3635 BUILD_CHARCLASS_LOOP (isalnum
);
3636 else if (strcmp (name
, "cntrl") == 0)
3637 BUILD_CHARCLASS_LOOP (iscntrl
);
3638 else if (strcmp (name
, "lower") == 0)
3639 BUILD_CHARCLASS_LOOP (islower
);
3640 else if (strcmp (name
, "space") == 0)
3641 BUILD_CHARCLASS_LOOP (isspace
);
3642 else if (strcmp (name
, "alpha") == 0)
3643 BUILD_CHARCLASS_LOOP (isalpha
);
3644 else if (strcmp (name
, "digit") == 0)
3645 BUILD_CHARCLASS_LOOP (isdigit
);
3646 else if (strcmp (name
, "print") == 0)
3647 BUILD_CHARCLASS_LOOP (isprint
);
3648 else if (strcmp (name
, "upper") == 0)
3649 BUILD_CHARCLASS_LOOP (isupper
);
3650 else if (strcmp (name
, "blank") == 0)
3651 BUILD_CHARCLASS_LOOP (isblank
);
3652 else if (strcmp (name
, "graph") == 0)
3653 BUILD_CHARCLASS_LOOP (isgraph
);
3654 else if (strcmp (name
, "punct") == 0)
3655 BUILD_CHARCLASS_LOOP (ispunct
);
3656 else if (strcmp (name
, "xdigit") == 0)
3657 BUILD_CHARCLASS_LOOP (isxdigit
);
3665 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3666 const char *class_name
,
3667 const char *extra
, bool non_match
,
3670 re_bitset_ptr_t sbcset
;
3671 #ifdef RE_ENABLE_I18N
3672 re_charset_t
*mbcset
;
3674 #endif /* not RE_ENABLE_I18N */
3678 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3679 if (__glibc_unlikely (sbcset
== NULL
))
3684 #ifdef RE_ENABLE_I18N
3685 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3686 if (__glibc_unlikely (mbcset
== NULL
))
3692 mbcset
->non_match
= non_match
;
3693 #endif /* RE_ENABLE_I18N */
3695 /* We don't care the syntax in this case. */
3696 ret
= build_charclass (trans
, sbcset
,
3697 #ifdef RE_ENABLE_I18N
3699 #endif /* RE_ENABLE_I18N */
3702 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3705 #ifdef RE_ENABLE_I18N
3706 free_charset (mbcset
);
3707 #endif /* RE_ENABLE_I18N */
3711 /* \w match '_' also. */
3712 for (; *extra
; extra
++)
3713 bitset_set (sbcset
, *extra
);
3715 /* If it is non-matching list. */
3717 bitset_not (sbcset
);
3719 #ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa
->mb_cur_max
> 1)
3722 bitset_mask (sbcset
, dfa
->sb_char
);
3725 /* Build a tree for simple bracket. */
3726 re_token_t br_token
= { .type
= SIMPLE_BRACKET
, .opr
.sbcset
= sbcset
};
3727 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3728 if (__glibc_unlikely (tree
== NULL
))
3729 goto build_word_op_espace
;
3731 #ifdef RE_ENABLE_I18N
3732 if (dfa
->mb_cur_max
> 1)
3734 bin_tree_t
*mbc_tree
;
3735 /* Build a tree for complex bracket. */
3736 br_token
.type
= COMPLEX_BRACKET
;
3737 br_token
.opr
.mbcset
= mbcset
;
3738 dfa
->has_mb_node
= 1;
3739 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3740 if (__glibc_unlikely (mbc_tree
== NULL
))
3741 goto build_word_op_espace
;
3742 /* Then join them by ALT node. */
3743 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3744 if (__glibc_likely (mbc_tree
!= NULL
))
3749 free_charset (mbcset
);
3752 #else /* not RE_ENABLE_I18N */
3754 #endif /* not RE_ENABLE_I18N */
3756 build_word_op_espace
:
3758 #ifdef RE_ENABLE_I18N
3759 free_charset (mbcset
);
3760 #endif /* RE_ENABLE_I18N */
3765 /* This is intended for the expressions like "a{1,3}".
3766 Fetch a number from 'input', and return the number.
3767 Return -1 if the number field is empty like "{,1}".
3768 Return RE_DUP_MAX + 1 if the number field is too large.
3769 Return -2 if an error occurred. */
3772 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3778 fetch_token (token
, input
, syntax
);
3780 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3782 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3784 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3788 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3793 #ifdef RE_ENABLE_I18N
3795 free_charset (re_charset_t
*cset
)
3797 re_free (cset
->mbchars
);
3799 re_free (cset
->coll_syms
);
3800 re_free (cset
->equiv_classes
);
3802 re_free (cset
->range_starts
);
3803 re_free (cset
->range_ends
);
3804 re_free (cset
->char_classes
);
3807 #endif /* RE_ENABLE_I18N */
3809 /* Functions for binary tree operation. */
3811 /* Create a tree node. */
3814 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3815 re_token_type_t type
)
3817 re_token_t t
= { .type
= type
};
3818 return create_token_tree (dfa
, left
, right
, &t
);
3822 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3823 const re_token_t
*token
)
3826 if (__glibc_unlikely (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
))
3828 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3830 if (storage
== NULL
)
3832 storage
->next
= dfa
->str_tree_storage
;
3833 dfa
->str_tree_storage
= storage
;
3834 dfa
->str_tree_storage_idx
= 0;
3836 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3838 tree
->parent
= NULL
;
3840 tree
->right
= right
;
3841 tree
->token
= *token
;
3842 tree
->token
.duplicated
= 0;
3843 tree
->token
.opt_subexp
= 0;
3846 tree
->node_idx
= -1;
3849 left
->parent
= tree
;
3851 right
->parent
= tree
;
3855 /* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3858 static reg_errcode_t
3859 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3861 Idx idx
= (uintptr_t) extra
;
3862 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3863 node
->token
.opt_subexp
= 1;
3868 /* Free the allocated memory inside NODE. */
3871 free_token (re_token_t
*node
)
3873 #ifdef RE_ENABLE_I18N
3874 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3875 free_charset (node
->opr
.mbcset
);
3877 #endif /* RE_ENABLE_I18N */
3878 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3879 re_free (node
->opr
.sbcset
);
3882 /* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
3885 static reg_errcode_t
3886 free_tree (void *extra
, bin_tree_t
*node
)
3888 free_token (&node
->token
);
3893 /* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3899 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3901 const bin_tree_t
*node
;
3902 bin_tree_t
*dup_root
;
3903 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3905 for (node
= root
; ; )
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3911 (*p_new
)->parent
= dup_node
;
3912 (*p_new
)->token
.duplicated
= 1;
3915 /* Go to the left node, or up and to the right. */
3919 p_new
= &dup_node
->left
;
3923 const bin_tree_t
*prev
= NULL
;
3924 while (node
->right
== prev
|| node
->right
== NULL
)
3927 node
= node
->parent
;
3928 dup_node
= dup_node
->parent
;
3933 p_new
= &dup_node
->right
;