1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
31 static void free_charset (re_charset_t
*cset
);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t
*preg
);
34 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
36 static void optimize_utf8 (re_dfa_t
*dfa
);
38 static reg_errcode_t
analyze (regex_t
*preg
);
39 static reg_errcode_t
preorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
postorder (bin_tree_t
*root
,
43 reg_errcode_t (fn (void *, bin_tree_t
*)),
45 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
47 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
49 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
52 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
53 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
54 unsigned int constraint
);
55 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
56 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
58 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
59 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
61 static int peek_token (re_token_t
*token
, re_string_t
*input
,
63 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
64 reg_syntax_t syntax
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
75 re_token_t
*token
, reg_syntax_t syntax
,
76 Idx nest
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
78 re_dfa_t
*dfa
, re_token_t
*token
,
79 reg_syntax_t syntax
, reg_errcode_t
*err
);
80 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
81 re_token_t
*token
, reg_syntax_t syntax
,
83 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
85 re_token_t
*token
, int token_len
,
89 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
93 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
95 Idx
*equiv_class_alloc
,
96 const unsigned char *name
);
97 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
100 Idx
*char_class_alloc
,
101 const char *class_name
,
102 reg_syntax_t syntax
);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
105 const unsigned char *name
);
106 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 const char *class_name
,
109 reg_syntax_t syntax
);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
112 RE_TRANSLATE_TYPE trans
,
113 const char *class_name
,
115 bool non_match
, reg_errcode_t
*err
);
116 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 re_token_type_t type
);
119 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
120 bin_tree_t
*left
, bin_tree_t
*right
,
121 const re_token_t
*token
);
122 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
123 static void free_token (re_token_t
*node
);
124 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
125 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid
[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx
[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
217 re_compile_pattern (const char *pattern
, size_t length
,
218 struct re_pattern_buffer
*bufp
)
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
237 weak_alias (__re_compile_pattern
, re_compile_pattern
)
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options
;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (reg_syntax_t syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
270 re_dfa_t
*dfa
= bufp
->buffer
;
271 char *fastmap
= bufp
->fastmap
;
273 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
274 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_word
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_nl
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
281 bufp
->fastmap_accurate
= 1;
285 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
289 __attribute__ ((always_inline
))
290 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
294 fastmap
[tolower (ch
)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
304 re_dfa_t
*dfa
= bufp
->buffer
;
306 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
307 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
309 Idx node
= init_state
->nodes
.elems
[node_cnt
];
310 re_token_type_t type
= dfa
->nodes
[node
].type
;
312 if (type
== CHARACTER
)
314 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
318 unsigned char buf
[MB_LEN_MAX
];
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, false, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
353 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
357 /* See if we have to try all bytes which start multiple collation
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
364 && (cset
->ncoll_syms
|| cset
->nranges
))
366 const int32_t *table
= (const int32_t *)
367 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
368 for (i
= 0; i
< SBC_MAX
; ++i
)
370 re_set_fastmap (fastmap
, icase
, i
);
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa
->mb_cur_max
> 1
379 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
381 || cset
->nequiv_classes
389 memset (&mbs
, 0, sizeof (mbs
));
390 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
391 re_set_fastmap (fastmap
, false, (int) c
);
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 memset (&state
, '\0', sizeof (state
));
404 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
405 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
406 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
408 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
410 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
415 #endif /* RE_ENABLE_I18N */
416 else if (type
== OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type
== OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type
== END_OF_RE
)
422 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
423 if (type
== END_OF_RE
)
424 bufp
->can_be_null
= 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t
*_Restrict_ preg
, const char *_Restrict_ pattern
, int cflags
)
470 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC
);
477 /* Try to allocate space for the fastmap. */
478 preg
->fastmap
= re_malloc (char, SBC_MAX
);
479 if (__glibc_unlikely (preg
->fastmap
== NULL
))
482 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags
& REG_NEWLINE
)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax
&= ~RE_DOT_NEWLINE
;
488 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
489 /* It also changes the matching behavior. */
490 preg
->newline_anchor
= 1;
493 preg
->newline_anchor
= 0;
494 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
495 preg
->translate
= NULL
;
497 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret
== REG_ERPAREN
)
504 /* We have already checked preg->fastmap != NULL. */
505 if (__glibc_likely (ret
== REG_NOERROR
))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg
);
511 /* Some error occurred while compiling the expression. */
512 re_free (preg
->fastmap
);
513 preg
->fastmap
= NULL
;
519 libc_hidden_def (__regcomp
)
520 weak_alias (__regcomp
, regcomp
)
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
527 regerror (int errcode
, const regex_t
*_Restrict_ preg
, char *_Restrict_ errbuf
,
532 int nerrcodes
= sizeof __re_error_msgid_idx
/ sizeof __re_error_msgid_idx
[0];
534 if (__glibc_unlikely (errcode
< 0 || errcode
>= nerrcodes
))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
541 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
543 msg_size
= strlen (msg
) + 1; /* Includes the null. */
545 if (__glibc_likely (errbuf_size
!= 0))
547 size_t cpy_size
= msg_size
;
548 if (__glibc_unlikely (msg_size
> errbuf_size
))
550 cpy_size
= errbuf_size
- 1;
551 errbuf
[cpy_size
] = '\0';
553 memcpy (errbuf
, msg
, cpy_size
);
559 weak_alias (__regerror
, regerror
)
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565 UTF-8 is used. Otherwise we would allocate memory just to initialize
566 it the same all the time. UTF-8 is the preferred encoding so this is
567 a worthwhile optimization. */
568 static const bitset_t utf8_sb_map
=
570 /* Set the first 128 bits. */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
574 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 # error "bitset_word_t is narrower than 32 bits"
576 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
578 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
580 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
584 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
586 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
593 free_dfa_content (re_dfa_t
*dfa
)
598 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
599 free_token (dfa
->nodes
+ i
);
600 re_free (dfa
->nexts
);
601 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 if (dfa
->eclosures
!= NULL
)
604 re_node_set_free (dfa
->eclosures
+ i
);
605 if (dfa
->inveclosures
!= NULL
)
606 re_node_set_free (dfa
->inveclosures
+ i
);
607 if (dfa
->edests
!= NULL
)
608 re_node_set_free (dfa
->edests
+ i
);
610 re_free (dfa
->edests
);
611 re_free (dfa
->eclosures
);
612 re_free (dfa
->inveclosures
);
613 re_free (dfa
->nodes
);
615 if (dfa
->state_table
)
616 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
618 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
619 for (j
= 0; j
< entry
->num
; ++j
)
621 re_dfastate_t
*state
= entry
->array
[j
];
624 re_free (entry
->array
);
626 re_free (dfa
->state_table
);
627 #ifdef RE_ENABLE_I18N
628 if (dfa
->sb_char
!= utf8_sb_map
)
629 re_free (dfa
->sb_char
);
631 re_free (dfa
->subexp_map
);
633 re_free (dfa
->re_str
);
640 /* Free dynamically allocated space used by PREG. */
643 regfree (regex_t
*preg
)
645 re_dfa_t
*dfa
= preg
->buffer
;
646 if (__glibc_likely (dfa
!= NULL
))
648 lock_fini (dfa
->lock
);
649 free_dfa_content (dfa
);
654 re_free (preg
->fastmap
);
655 preg
->fastmap
= NULL
;
657 re_free (preg
->translate
);
658 preg
->translate
= NULL
;
661 libc_hidden_def (__regfree
)
662 weak_alias (__regfree
, regfree
)
665 /* Entry points compatible with 4.2 BSD regex library. We don't define
666 them unless specifically requested. */
668 #if defined _REGEX_RE_COMP || defined _LIBC
670 /* BSD has one and only one pattern buffer. */
671 static struct re_pattern_buffer re_comp_buf
;
675 /* Make these definitions weak in libc, so POSIX programs can redefine
676 these names if they don't use our functions, and still use
677 regcomp/regexec above without link errors. */
680 re_comp (const char *s
)
687 if (!re_comp_buf
.buffer
)
688 return gettext ("No previous regular expression");
692 if (re_comp_buf
.buffer
)
694 fastmap
= re_comp_buf
.fastmap
;
695 re_comp_buf
.fastmap
= NULL
;
696 __regfree (&re_comp_buf
);
697 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
698 re_comp_buf
.fastmap
= fastmap
;
701 if (re_comp_buf
.fastmap
== NULL
)
703 re_comp_buf
.fastmap
= re_malloc (char, SBC_MAX
);
704 if (re_comp_buf
.fastmap
== NULL
)
705 return (char *) gettext (__re_error_msgid
706 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
709 /* Since 're_exec' always passes NULL for the 'regs' argument, we
710 don't need to initialize the pattern buffer fields which affect it. */
712 /* Match anchors at newlines. */
713 re_comp_buf
.newline_anchor
= 1;
715 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
720 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
721 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
725 libc_freeres_fn (free_mem
)
727 __regfree (&re_comp_buf
);
731 #endif /* _REGEX_RE_COMP */
733 /* Internal entry point.
734 Compile the regular expression PATTERN, whose length is LENGTH.
735 SYNTAX indicate regular expression's syntax. */
738 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
741 reg_errcode_t err
= REG_NOERROR
;
745 /* Initialize the pattern buffer. */
746 preg
->fastmap_accurate
= 0;
747 preg
->syntax
= syntax
;
748 preg
->not_bol
= preg
->not_eol
= 0;
751 preg
->can_be_null
= 0;
752 preg
->regs_allocated
= REGS_UNALLOCATED
;
754 /* Initialize the dfa. */
756 if (__glibc_unlikely (preg
->allocated
< sizeof (re_dfa_t
)))
758 /* If zero allocated, but buffer is non-null, try to realloc
759 enough space. This loses if buffer's address is bogus, but
760 that is the user's responsibility. If ->buffer is NULL this
761 is a simple allocation. */
762 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
765 preg
->allocated
= sizeof (re_dfa_t
);
768 preg
->used
= sizeof (re_dfa_t
);
770 err
= init_dfa (dfa
, length
);
771 if (__glibc_unlikely (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0))
773 if (__glibc_unlikely (err
!= REG_NOERROR
))
775 free_dfa_content (dfa
);
781 /* Note: length+1 will not overflow since it is checked in init_dfa. */
782 dfa
->re_str
= re_malloc (char, length
+ 1);
783 strncpy (dfa
->re_str
, pattern
, length
+ 1);
786 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
787 (syntax
& RE_ICASE
) != 0, dfa
);
788 if (__glibc_unlikely (err
!= REG_NOERROR
))
790 re_compile_internal_free_return
:
791 free_workarea_compile (preg
);
792 re_string_destruct (®exp
);
793 lock_fini (dfa
->lock
);
794 free_dfa_content (dfa
);
800 /* Parse the regular expression, and build a structure tree. */
802 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
803 if (__glibc_unlikely (dfa
->str_tree
== NULL
))
804 goto re_compile_internal_free_return
;
806 /* Analyze the tree and create the nfa. */
807 err
= analyze (preg
);
808 if (__glibc_unlikely (err
!= REG_NOERROR
))
809 goto re_compile_internal_free_return
;
811 #ifdef RE_ENABLE_I18N
812 /* If possible, do searching in single byte encoding to speed things up. */
813 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
817 /* Then create the initial state of the dfa. */
818 err
= create_initial_state (dfa
);
820 /* Release work areas. */
821 free_workarea_compile (preg
);
822 re_string_destruct (®exp
);
824 if (__glibc_unlikely (err
!= REG_NOERROR
))
826 lock_fini (dfa
->lock
);
827 free_dfa_content (dfa
);
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
839 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
841 __re_size_t table_size
;
843 const char *codeset_name
;
845 #ifdef RE_ENABLE_I18N
846 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
848 size_t max_i18n_object_size
= 0;
850 size_t max_object_size
=
851 MAX (sizeof (struct re_state_table_entry
),
852 MAX (sizeof (re_token_t
),
853 MAX (sizeof (re_node_set
),
854 MAX (sizeof (regmatch_t
),
855 max_i18n_object_size
))));
857 memset (dfa
, '\0', sizeof (re_dfa_t
));
859 /* Force allocation of str_tree_storage the first time. */
860 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
862 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
863 calculation below, and for similar doubling calculations
864 elsewhere. And it's <= rather than <, because some of the
865 doubling calculations add 1 afterwards. */
866 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2
870 dfa
->nodes_alloc
= pat_len
+ 1;
871 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size
= 1; ; table_size
<<= 1)
875 if (table_size
> pat_len
)
878 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
879 dfa
->state_hash_mask
= table_size
- 1;
881 dfa
->mb_cur_max
= MB_CUR_MAX
;
883 if (dfa
->mb_cur_max
== 6
884 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
886 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
889 codeset_name
= nl_langinfo (CODESET
);
890 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
891 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
892 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
893 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa
->map_notascii
= 0;
901 #ifdef RE_ENABLE_I18N
902 if (dfa
->mb_cur_max
> 1)
905 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
910 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
911 if (__glibc_unlikely (dfa
->sb_char
== NULL
))
914 /* Set the bits corresponding to single byte chars. */
915 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
916 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
918 wint_t wch
= __btowc (ch
);
920 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
922 if (isascii (ch
) && wch
!= ch
)
923 dfa
->map_notascii
= 1;
930 if (__glibc_unlikely (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
))
935 /* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
940 init_word_char (re_dfa_t
*dfa
)
945 dfa
->word_ops_used
= 1;
946 if (__glibc_likely (dfa
->map_notascii
== 0))
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
950 bitset_word_t bits0
= 0x00000000;
951 bitset_word_t bits1
= 0x03ff0000;
952 bitset_word_t bits2
= 0x87fffffe;
953 bitset_word_t bits3
= 0x07fffffe;
954 if (BITSET_WORD_BITS
== 64)
956 /* Pacify gcc -Woverflow on 32-bit platformns. */
957 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
958 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
961 else if (BITSET_WORD_BITS
== 32)
963 dfa
->word_char
[0] = bits0
;
964 dfa
->word_char
[1] = bits1
;
965 dfa
->word_char
[2] = bits2
;
966 dfa
->word_char
[3] = bits3
;
973 if (__glibc_likely (dfa
->is_utf8
))
975 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
981 for (; i
< BITSET_WORDS
; ++i
)
982 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
983 if (isalnum (ch
) || ch
== '_')
984 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
987 /* Free the work area which are only used while compiling. */
990 free_workarea_compile (regex_t
*preg
)
992 re_dfa_t
*dfa
= preg
->buffer
;
993 bin_tree_storage_t
*storage
, *next
;
994 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
996 next
= storage
->next
;
999 dfa
->str_tree_storage
= NULL
;
1000 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
1001 dfa
->str_tree
= NULL
;
1002 re_free (dfa
->org_indices
);
1003 dfa
->org_indices
= NULL
;
1006 /* Create initial states for all contexts. */
1008 static reg_errcode_t
1009 create_initial_state (re_dfa_t
*dfa
)
1013 re_node_set init_nodes
;
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
1017 first
= dfa
->str_tree
->first
->node_idx
;
1018 dfa
->init_node
= first
;
1019 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1020 if (__glibc_unlikely (err
!= REG_NOERROR
))
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa
->nbackref
> 0)
1028 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1030 Idx node_idx
= init_nodes
.elems
[i
];
1031 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1034 if (type
!= OP_BACK_REF
)
1036 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1038 re_token_t
*clexp_node
;
1039 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1040 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1041 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1044 if (clexp_idx
== init_nodes
.nelem
)
1047 if (type
== OP_BACK_REF
)
1049 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1050 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1054 if (merge_err
!= REG_NOERROR
)
1061 /* It must be the first time to invoke acquire_state. */
1062 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
1064 if (__glibc_unlikely (dfa
->init_state
== NULL
))
1066 if (dfa
->init_state
->has_constraint
)
1068 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1070 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1072 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1076 if (__glibc_unlikely (dfa
->init_state_word
== NULL
1077 || dfa
->init_state_nl
== NULL
1078 || dfa
->init_state_begbuf
== NULL
))
1082 dfa
->init_state_word
= dfa
->init_state_nl
1083 = dfa
->init_state_begbuf
= dfa
->init_state
;
1085 re_node_set_free (&init_nodes
);
1089 #ifdef RE_ENABLE_I18N
1090 /* If it is possible to do searching in single byte encoding instead of UTF-8
1091 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1092 DFA nodes where needed. */
1095 optimize_utf8 (re_dfa_t
*dfa
)
1099 bool mb_chars
= false;
1100 bool has_period
= false;
1102 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1103 switch (dfa
->nodes
[node
].type
)
1106 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1110 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1118 /* Word anchors etc. cannot be handled. It's okay to test
1119 opr.ctx_type since constraints (for all DFA nodes) are
1120 created by ORing one or more opr.ctx_type values. */
1130 case OP_DUP_ASTERISK
:
1131 case OP_OPEN_SUBEXP
:
1132 case OP_CLOSE_SUBEXP
:
1134 case COMPLEX_BRACKET
:
1136 case SIMPLE_BRACKET
:
1137 /* Just double check. */
1139 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1141 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1142 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1144 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1154 if (mb_chars
|| has_period
)
1155 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1157 if (dfa
->nodes
[node
].type
== CHARACTER
1158 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1159 dfa
->nodes
[node
].mb_partial
= 0;
1160 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1161 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1164 /* The search can be in single byte locale. */
1165 dfa
->mb_cur_max
= 1;
1167 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1171 /* Analyze the structure tree, and calculate "first", "next", "edest",
1172 "eclosure", and "inveclosure". */
1174 static reg_errcode_t
1175 analyze (regex_t
*preg
)
1177 re_dfa_t
*dfa
= preg
->buffer
;
1180 /* Allocate arrays. */
1181 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1182 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1183 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1184 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1185 if (__glibc_unlikely (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
1186 || dfa
->edests
== NULL
|| dfa
->eclosures
== NULL
))
1189 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1190 if (dfa
->subexp_map
!= NULL
)
1193 for (i
= 0; i
< preg
->re_nsub
; i
++)
1194 dfa
->subexp_map
[i
] = i
;
1195 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1196 for (i
= 0; i
< preg
->re_nsub
; i
++)
1197 if (dfa
->subexp_map
[i
] != i
)
1199 if (i
== preg
->re_nsub
)
1201 re_free (dfa
->subexp_map
);
1202 dfa
->subexp_map
= NULL
;
1206 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1207 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1209 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1210 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1212 preorder (dfa
->str_tree
, calc_next
, dfa
);
1213 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1214 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1216 ret
= calc_eclosure (dfa
);
1217 if (__glibc_unlikely (ret
!= REG_NOERROR
))
1220 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1221 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1222 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1225 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1226 if (__glibc_unlikely (dfa
->inveclosures
== NULL
))
1228 ret
= calc_inveclosure (dfa
);
1234 /* Our parse trees are very unbalanced, so we cannot use a stack to
1235 implement parse tree visits. Instead, we use parent pointers and
1236 some hairy code in these two functions. */
1237 static reg_errcode_t
1238 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1241 bin_tree_t
*node
, *prev
;
1243 for (node
= root
; ; )
1245 /* Descend down the tree, preferably to the left (or to the right
1246 if that's the only child). */
1247 while (node
->left
|| node
->right
)
1255 reg_errcode_t err
= fn (extra
, node
);
1256 if (__glibc_unlikely (err
!= REG_NOERROR
))
1258 if (node
->parent
== NULL
)
1261 node
= node
->parent
;
1263 /* Go up while we have a node that is reached from the right. */
1264 while (node
->right
== prev
|| node
->right
== NULL
);
1269 static reg_errcode_t
1270 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1275 for (node
= root
; ; )
1277 reg_errcode_t err
= fn (extra
, node
);
1278 if (__glibc_unlikely (err
!= REG_NOERROR
))
1281 /* Go to the left node, or up and to the right. */
1286 bin_tree_t
*prev
= NULL
;
1287 while (node
->right
== prev
|| node
->right
== NULL
)
1290 node
= node
->parent
;
1299 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1300 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1301 backreferences as well. Requires a preorder visit. */
1302 static reg_errcode_t
1303 optimize_subexps (void *extra
, bin_tree_t
*node
)
1305 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1307 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1309 int idx
= node
->token
.opr
.idx
;
1310 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1311 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1314 else if (node
->token
.type
== SUBEXP
1315 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1317 Idx other_idx
= node
->left
->token
.opr
.idx
;
1319 node
->left
= node
->left
->left
;
1321 node
->left
->parent
= node
;
1323 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1324 if (other_idx
< BITSET_WORD_BITS
)
1325 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1331 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1332 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1333 static reg_errcode_t
1334 lower_subexps (void *extra
, bin_tree_t
*node
)
1336 regex_t
*preg
= (regex_t
*) extra
;
1337 reg_errcode_t err
= REG_NOERROR
;
1339 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1341 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1343 node
->left
->parent
= node
;
1345 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1347 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1349 node
->right
->parent
= node
;
1356 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1358 re_dfa_t
*dfa
= preg
->buffer
;
1359 bin_tree_t
*body
= node
->left
;
1360 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1363 /* We do not optimize empty subexpressions, because otherwise we may
1364 have bad CONCAT nodes with NULL children. This is obviously not
1365 very common, so we do not lose much. An example that triggers
1366 this case is the sed "script" /\(\)/x. */
1367 && node
->left
!= NULL
1368 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1369 || !(dfa
->used_bkref_map
1370 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1373 /* Convert the SUBEXP node to the concatenation of an
1374 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1375 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1376 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1377 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1378 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1379 if (__glibc_unlikely (tree
== NULL
|| tree1
== NULL
1380 || op
== NULL
|| cls
== NULL
))
1386 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1387 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392 nodes. Requires a postorder visit. */
1393 static reg_errcode_t
1394 calc_first (void *extra
, bin_tree_t
*node
)
1396 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1397 if (node
->token
.type
== CONCAT
)
1399 node
->first
= node
->left
->first
;
1400 node
->node_idx
= node
->left
->node_idx
;
1405 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1406 if (__glibc_unlikely (node
->node_idx
== -1))
1408 if (node
->token
.type
== ANCHOR
)
1409 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1414 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1415 static reg_errcode_t
1416 calc_next (void *extra
, bin_tree_t
*node
)
1418 switch (node
->token
.type
)
1420 case OP_DUP_ASTERISK
:
1421 node
->left
->next
= node
;
1424 node
->left
->next
= node
->right
->first
;
1425 node
->right
->next
= node
->next
;
1429 node
->left
->next
= node
->next
;
1431 node
->right
->next
= node
->next
;
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1441 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1442 Idx idx
= node
->node_idx
;
1443 reg_errcode_t err
= REG_NOERROR
;
1445 switch (node
->token
.type
)
1451 assert (node
->next
== NULL
);
1454 case OP_DUP_ASTERISK
:
1458 dfa
->has_plural_match
= 1;
1459 if (node
->left
!= NULL
)
1460 left
= node
->left
->first
->node_idx
;
1462 left
= node
->next
->node_idx
;
1463 if (node
->right
!= NULL
)
1464 right
= node
->right
->first
->node_idx
;
1466 right
= node
->next
->node_idx
;
1468 assert (right
> -1);
1469 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1474 case OP_OPEN_SUBEXP
:
1475 case OP_CLOSE_SUBEXP
:
1476 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1480 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1481 if (node
->token
.type
== OP_BACK_REF
)
1482 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1486 assert (!IS_EPSILON_NODE (node
->token
.type
));
1487 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496 to their own constraint. */
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1500 Idx root_node
, unsigned int init_constraint
)
1502 Idx org_node
, clone_node
;
1504 unsigned int constraint
= init_constraint
;
1505 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1507 Idx org_dest
, clone_dest
;
1508 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1510 /* If the back reference epsilon-transit, its destination must
1511 also have the constraint. Then duplicate the epsilon closure
1512 of the destination of the back reference, and store it in
1513 edests of the back reference. */
1514 org_dest
= dfa
->nexts
[org_node
];
1515 re_node_set_empty (dfa
->edests
+ clone_node
);
1516 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1517 if (__glibc_unlikely (clone_dest
== -1))
1519 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1520 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1521 if (__glibc_unlikely (! ok
))
1524 else if (dfa
->edests
[org_node
].nelem
== 0)
1526 /* In case of the node can't epsilon-transit, don't duplicate the
1527 destination and store the original destination as the
1528 destination of the node. */
1529 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1532 else if (dfa
->edests
[org_node
].nelem
== 1)
1534 /* In case of the node can epsilon-transit, and it has only one
1536 org_dest
= dfa
->edests
[org_node
].elems
[0];
1537 re_node_set_empty (dfa
->edests
+ clone_node
);
1538 /* If the node is root_node itself, it means the epsilon closure
1539 has a loop. Then tie it to the destination of the root_node. */
1540 if (org_node
== root_node
&& clone_node
!= org_node
)
1542 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1543 if (__glibc_unlikely (! ok
))
1547 /* In case the node has another constraint, append it. */
1548 constraint
|= dfa
->nodes
[org_node
].constraint
;
1549 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1550 if (__glibc_unlikely (clone_dest
== -1))
1552 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1553 if (__glibc_unlikely (! ok
))
1556 else /* dfa->edests[org_node].nelem == 2 */
1558 /* In case of the node can epsilon-transit, and it has two
1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1560 org_dest
= dfa
->edests
[org_node
].elems
[0];
1561 re_node_set_empty (dfa
->edests
+ clone_node
);
1562 /* Search for a duplicated node which satisfies the constraint. */
1563 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1564 if (clone_dest
== -1)
1566 /* There is no such duplicated node, create a new one. */
1568 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1569 if (__glibc_unlikely (clone_dest
== -1))
1571 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1572 if (__glibc_unlikely (! ok
))
1574 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1575 root_node
, constraint
);
1576 if (__glibc_unlikely (err
!= REG_NOERROR
))
1581 /* There is a duplicated node which satisfies the constraint,
1582 use it to avoid infinite loop. */
1583 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1584 if (__glibc_unlikely (! ok
))
1588 org_dest
= dfa
->edests
[org_node
].elems
[1];
1589 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1590 if (__glibc_unlikely (clone_dest
== -1))
1592 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1593 if (__glibc_unlikely (! ok
))
1596 org_node
= org_dest
;
1597 clone_node
= clone_dest
;
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603 satisfies the constraint CONSTRAINT. */
1606 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1607 unsigned int constraint
)
1610 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1612 if (org_node
== dfa
->org_indices
[idx
]
1613 && constraint
== dfa
->nodes
[idx
].constraint
)
1614 return idx
; /* Found. */
1616 return -1; /* Not found. */
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620 Return the index of the new node, or -1 if insufficient storage is
1624 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1626 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1627 if (__glibc_likely (dup_idx
!= -1))
1629 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1630 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1631 dfa
->nodes
[dup_idx
].duplicated
= 1;
1633 /* Store the index of the original node. */
1634 dfa
->org_indices
[dup_idx
] = org_idx
;
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t
*dfa
)
1644 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1645 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1647 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1649 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1650 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1652 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1653 if (__glibc_unlikely (! ok
))
1661 /* Calculate "eclosure" for all the node in DFA. */
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t
*dfa
)
1669 assert (dfa
->nodes_len
> 0);
1672 /* For each nodes, calculate epsilon closure. */
1673 for (node_idx
= 0; ; ++node_idx
)
1676 re_node_set eclosure_elem
;
1677 if (node_idx
== dfa
->nodes_len
)
1686 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1689 /* If we have already calculated, skip it. */
1690 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1692 /* Calculate epsilon closure of 'node_idx'. */
1693 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1694 if (__glibc_unlikely (err
!= REG_NOERROR
))
1697 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1700 re_node_set_free (&eclosure_elem
);
1706 /* Calculate epsilon closure of NODE. */
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1713 re_node_set eclosure
;
1715 bool incomplete
= false;
1716 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1717 if (__glibc_unlikely (err
!= REG_NOERROR
))
1720 /* This indicates that we are calculating this node now.
1721 We reference this value to avoid infinite loop. */
1722 dfa
->eclosures
[node
].nelem
= -1;
1724 /* If the current node has constraints, duplicate all nodes
1725 since they must inherit the constraints. */
1726 if (dfa
->nodes
[node
].constraint
1727 && dfa
->edests
[node
].nelem
1728 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1730 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1731 dfa
->nodes
[node
].constraint
);
1732 if (__glibc_unlikely (err
!= REG_NOERROR
))
1736 /* Expand each epsilon destination nodes. */
1737 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1738 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1740 re_node_set eclosure_elem
;
1741 Idx edest
= dfa
->edests
[node
].elems
[i
];
1742 /* If calculating the epsilon closure of 'edest' is in progress,
1743 return intermediate result. */
1744 if (dfa
->eclosures
[edest
].nelem
== -1)
1749 /* If we haven't calculated the epsilon closure of 'edest' yet,
1750 calculate now. Otherwise use calculated epsilon closure. */
1751 if (dfa
->eclosures
[edest
].nelem
== 0)
1753 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1754 if (__glibc_unlikely (err
!= REG_NOERROR
))
1758 eclosure_elem
= dfa
->eclosures
[edest
];
1759 /* Merge the epsilon closure of 'edest'. */
1760 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1761 if (__glibc_unlikely (err
!= REG_NOERROR
))
1763 /* If the epsilon closure of 'edest' is incomplete,
1764 the epsilon closure of this node is also incomplete. */
1765 if (dfa
->eclosures
[edest
].nelem
== 0)
1768 re_node_set_free (&eclosure_elem
);
1772 /* An epsilon closure includes itself. */
1773 ok
= re_node_set_insert (&eclosure
, node
);
1774 if (__glibc_unlikely (! ok
))
1776 if (incomplete
&& !root
)
1777 dfa
->eclosures
[node
].nelem
= 0;
1779 dfa
->eclosures
[node
] = eclosure
;
1780 *new_set
= eclosure
;
1784 /* Functions for token which are used in the parser. */
1786 /* Fetch a token from INPUT.
1787 We must not use this function inside bracket expressions. */
1790 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1792 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1795 /* Peek a token from INPUT, and return the length of the token.
1796 We must not use this function inside bracket expressions. */
1799 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1803 if (re_string_eoi (input
))
1805 token
->type
= END_OF_RE
;
1809 c
= re_string_peek_byte (input
, 0);
1812 token
->word_char
= 0;
1813 #ifdef RE_ENABLE_I18N
1814 token
->mb_partial
= 0;
1815 if (input
->mb_cur_max
> 1 &&
1816 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1818 token
->type
= CHARACTER
;
1819 token
->mb_partial
= 1;
1826 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1828 token
->type
= BACK_SLASH
;
1832 c2
= re_string_peek_byte_case (input
, 1);
1834 token
->type
= CHARACTER
;
1835 #ifdef RE_ENABLE_I18N
1836 if (input
->mb_cur_max
> 1)
1838 wint_t wc
= re_string_wchar_at (input
,
1839 re_string_cur_idx (input
) + 1);
1840 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1844 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1849 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1850 token
->type
= OP_ALT
;
1852 case '1': case '2': case '3': case '4': case '5':
1853 case '6': case '7': case '8': case '9':
1854 if (!(syntax
& RE_NO_BK_REFS
))
1856 token
->type
= OP_BACK_REF
;
1857 token
->opr
.idx
= c2
- '1';
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= ANCHOR
;
1864 token
->opr
.ctx_type
= WORD_FIRST
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1870 token
->type
= ANCHOR
;
1871 token
->opr
.ctx_type
= WORD_LAST
;
1875 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= ANCHOR
;
1878 token
->opr
.ctx_type
= WORD_DELIM
;
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= ANCHOR
;
1885 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1889 if (!(syntax
& RE_NO_GNU_OPS
))
1890 token
->type
= OP_WORD
;
1893 if (!(syntax
& RE_NO_GNU_OPS
))
1894 token
->type
= OP_NOTWORD
;
1897 if (!(syntax
& RE_NO_GNU_OPS
))
1898 token
->type
= OP_SPACE
;
1901 if (!(syntax
& RE_NO_GNU_OPS
))
1902 token
->type
= OP_NOTSPACE
;
1905 if (!(syntax
& RE_NO_GNU_OPS
))
1907 token
->type
= ANCHOR
;
1908 token
->opr
.ctx_type
= BUF_FIRST
;
1912 if (!(syntax
& RE_NO_GNU_OPS
))
1914 token
->type
= ANCHOR
;
1915 token
->opr
.ctx_type
= BUF_LAST
;
1919 if (!(syntax
& RE_NO_BK_PARENS
))
1920 token
->type
= OP_OPEN_SUBEXP
;
1923 if (!(syntax
& RE_NO_BK_PARENS
))
1924 token
->type
= OP_CLOSE_SUBEXP
;
1927 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1928 token
->type
= OP_DUP_PLUS
;
1931 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1932 token
->type
= OP_DUP_QUESTION
;
1935 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1936 token
->type
= OP_OPEN_DUP_NUM
;
1939 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1940 token
->type
= OP_CLOSE_DUP_NUM
;
1948 token
->type
= CHARACTER
;
1949 #ifdef RE_ENABLE_I18N
1950 if (input
->mb_cur_max
> 1)
1952 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1953 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1957 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1962 if (syntax
& RE_NEWLINE_ALT
)
1963 token
->type
= OP_ALT
;
1966 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1967 token
->type
= OP_ALT
;
1970 token
->type
= OP_DUP_ASTERISK
;
1973 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1974 token
->type
= OP_DUP_PLUS
;
1977 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1978 token
->type
= OP_DUP_QUESTION
;
1981 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1982 token
->type
= OP_OPEN_DUP_NUM
;
1985 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1986 token
->type
= OP_CLOSE_DUP_NUM
;
1989 if (syntax
& RE_NO_BK_PARENS
)
1990 token
->type
= OP_OPEN_SUBEXP
;
1993 if (syntax
& RE_NO_BK_PARENS
)
1994 token
->type
= OP_CLOSE_SUBEXP
;
1997 token
->type
= OP_OPEN_BRACKET
;
2000 token
->type
= OP_PERIOD
;
2003 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
2004 re_string_cur_idx (input
) != 0)
2006 char prev
= re_string_peek_byte (input
, -1);
2007 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
2010 token
->type
= ANCHOR
;
2011 token
->opr
.ctx_type
= LINE_FIRST
;
2014 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2015 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2018 re_string_skip_bytes (input
, 1);
2019 peek_token (&next
, input
, syntax
);
2020 re_string_skip_bytes (input
, -1);
2021 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2024 token
->type
= ANCHOR
;
2025 token
->opr
.ctx_type
= LINE_LAST
;
2033 /* Peek a token from INPUT, and return the length of the token.
2034 We must not use this function out of bracket expressions. */
2037 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2040 if (re_string_eoi (input
))
2042 token
->type
= END_OF_RE
;
2045 c
= re_string_peek_byte (input
, 0);
2048 #ifdef RE_ENABLE_I18N
2049 if (input
->mb_cur_max
> 1 &&
2050 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2052 token
->type
= CHARACTER
;
2055 #endif /* RE_ENABLE_I18N */
2057 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2058 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2060 /* In this case, '\' escape a character. */
2062 re_string_skip_bytes (input
, 1);
2063 c2
= re_string_peek_byte (input
, 0);
2065 token
->type
= CHARACTER
;
2068 if (c
== '[') /* '[' is a special char in a bracket exps. */
2072 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2073 c2
= re_string_peek_byte (input
, 1);
2081 token
->type
= OP_OPEN_COLL_ELEM
;
2085 token
->type
= OP_OPEN_EQUIV_CLASS
;
2089 if (syntax
& RE_CHAR_CLASSES
)
2091 token
->type
= OP_OPEN_CHAR_CLASS
;
2096 token
->type
= CHARACTER
;
2106 token
->type
= OP_CHARSET_RANGE
;
2109 token
->type
= OP_CLOSE_BRACKET
;
2112 token
->type
= OP_NON_MATCH_LIST
;
2115 token
->type
= CHARACTER
;
2120 /* Functions for parser. */
2122 /* Entry point of the parser.
2123 Parse the regular expression REGEXP and return the structure tree.
2124 If an error occurs, ERR is set by error code, and return NULL.
2125 This function build the following tree, from regular expression <reg_exp>:
2131 CAT means concatenation.
2132 EOR means end of regular expression. */
2135 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2138 re_dfa_t
*dfa
= preg
->buffer
;
2139 bin_tree_t
*tree
, *eor
, *root
;
2140 re_token_t current_token
;
2141 dfa
->syntax
= syntax
;
2142 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2143 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2144 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2146 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2148 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2151 if (__glibc_unlikely (eor
== NULL
|| root
== NULL
))
2159 /* This function build the following tree, from regular expression
2160 <branch1>|<branch2>:
2166 ALT means alternative, which represents the operator '|'. */
2169 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2170 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2172 re_dfa_t
*dfa
= preg
->buffer
;
2173 bin_tree_t
*tree
, *branch
= NULL
;
2174 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2175 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2176 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2179 while (token
->type
== OP_ALT
)
2181 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2182 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2183 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2185 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2186 dfa
->completed_bkref_map
= initial_bkref_map
;
2187 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2188 if (__glibc_unlikely (*err
!= REG_NOERROR
&& branch
== NULL
))
2191 postorder (tree
, free_tree
, NULL
);
2194 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2198 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2199 if (__glibc_unlikely (tree
== NULL
))
2208 /* This function build the following tree, from regular expression
2215 CAT means concatenation. */
2218 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2219 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2221 bin_tree_t
*tree
, *expr
;
2222 re_dfa_t
*dfa
= preg
->buffer
;
2223 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2224 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2227 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2228 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2230 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2231 if (__glibc_unlikely (*err
!= REG_NOERROR
&& expr
== NULL
))
2234 postorder (tree
, free_tree
, NULL
);
2237 if (tree
!= NULL
&& expr
!= NULL
)
2239 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2240 if (newtree
== NULL
)
2242 postorder (expr
, free_tree
, NULL
);
2243 postorder (tree
, free_tree
, NULL
);
2249 else if (tree
== NULL
)
2251 /* Otherwise expr == NULL, we don't need to create new tree. */
2256 /* This function build the following tree, from regular expression a*:
2263 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2264 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2266 re_dfa_t
*dfa
= preg
->buffer
;
2268 switch (token
->type
)
2271 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2272 if (__glibc_unlikely (tree
== NULL
))
2277 #ifdef RE_ENABLE_I18N
2278 if (dfa
->mb_cur_max
> 1)
2280 while (!re_string_eoi (regexp
)
2281 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2283 bin_tree_t
*mbc_remain
;
2284 fetch_token (token
, regexp
, syntax
);
2285 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2286 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2287 if (__glibc_unlikely (mbc_remain
== NULL
|| tree
== NULL
))
2297 case OP_OPEN_SUBEXP
:
2298 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2299 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2303 case OP_OPEN_BRACKET
:
2304 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2305 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2310 if (!__glibc_likely (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
)))
2315 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2316 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2317 if (__glibc_unlikely (tree
== NULL
))
2323 dfa
->has_mb_node
= 1;
2326 case OP_OPEN_DUP_NUM
:
2327 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2333 case OP_DUP_ASTERISK
:
2335 case OP_DUP_QUESTION
:
2336 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2341 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2343 fetch_token (token
, regexp
, syntax
);
2344 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2347 case OP_CLOSE_SUBEXP
:
2348 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2349 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2355 case OP_CLOSE_DUP_NUM
:
2356 /* We treat it as a normal character. */
2358 /* Then we can these characters as normal characters. */
2359 token
->type
= CHARACTER
;
2360 /* mb_partial and word_char bits should be initialized already
2362 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2363 if (__glibc_unlikely (tree
== NULL
))
2371 if ((token
->opr
.ctx_type
2372 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2373 && dfa
->word_ops_used
== 0)
2374 init_word_char (dfa
);
2375 if (token
->opr
.ctx_type
== WORD_DELIM
2376 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2378 bin_tree_t
*tree_first
, *tree_last
;
2379 if (token
->opr
.ctx_type
== WORD_DELIM
)
2381 token
->opr
.ctx_type
= WORD_FIRST
;
2382 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2383 token
->opr
.ctx_type
= WORD_LAST
;
2387 token
->opr
.ctx_type
= INSIDE_WORD
;
2388 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2389 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2391 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2392 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2393 if (__glibc_unlikely (tree_first
== NULL
|| tree_last
== NULL
2402 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2403 if (__glibc_unlikely (tree
== NULL
))
2409 /* We must return here, since ANCHORs can't be followed
2410 by repetition operators.
2411 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2412 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2413 fetch_token (token
, regexp
, syntax
);
2417 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2418 if (__glibc_unlikely (tree
== NULL
))
2423 if (dfa
->mb_cur_max
> 1)
2424 dfa
->has_mb_node
= 1;
2429 tree
= build_charclass_op (dfa
, regexp
->trans
,
2432 token
->type
== OP_NOTWORD
, err
);
2433 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2439 tree
= build_charclass_op (dfa
, regexp
->trans
,
2442 token
->type
== OP_NOTSPACE
, err
);
2443 if (__glibc_unlikely (*err
!= REG_NOERROR
&& tree
== NULL
))
2456 /* Must not happen? */
2462 fetch_token (token
, regexp
, syntax
);
2464 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2465 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2467 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2469 if (__glibc_unlikely (*err
!= REG_NOERROR
&& dup_tree
== NULL
))
2472 postorder (tree
, free_tree
, NULL
);
2476 /* In BRE consecutive duplications are not allowed. */
2477 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2478 && (token
->type
== OP_DUP_ASTERISK
2479 || token
->type
== OP_OPEN_DUP_NUM
))
2482 postorder (tree
, free_tree
, NULL
);
2491 /* This function build the following tree, from regular expression
2499 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2500 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2502 re_dfa_t
*dfa
= preg
->buffer
;
2505 cur_nsub
= preg
->re_nsub
++;
2507 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2509 /* The subexpression may be a null string. */
2510 if (token
->type
== OP_CLOSE_SUBEXP
)
2514 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2515 if (__glibc_unlikely (*err
== REG_NOERROR
2516 && token
->type
!= OP_CLOSE_SUBEXP
))
2519 postorder (tree
, free_tree
, NULL
);
2522 if (__glibc_unlikely (*err
!= REG_NOERROR
))
2526 if (cur_nsub
<= '9' - '1')
2527 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2529 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2530 if (__glibc_unlikely (tree
== NULL
))
2535 tree
->token
.opr
.idx
= cur_nsub
;
2539 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2542 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2543 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2545 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2546 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2547 re_token_t start_token
= *token
;
2549 if (token
->type
== OP_OPEN_DUP_NUM
)
2552 start
= fetch_number (regexp
, token
, syntax
);
2555 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2556 start
= 0; /* We treat "{,m}" as "{0,m}". */
2559 *err
= REG_BADBR
; /* <re>{} is invalid. */
2563 if (__glibc_likely (start
!= -2))
2565 /* We treat "{n}" as "{n,n}". */
2566 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2567 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2568 ? fetch_number (regexp
, token
, syntax
) : -2));
2570 if (__glibc_unlikely (start
== -2 || end
== -2))
2572 /* Invalid sequence. */
2573 if (__glibc_unlikely (!(syntax
& RE_INVALID_INTERVAL_ORD
)))
2575 if (token
->type
== END_OF_RE
)
2583 /* If the syntax bit is set, rollback. */
2584 re_string_set_index (regexp
, start_idx
);
2585 *token
= start_token
;
2586 token
->type
= CHARACTER
;
2587 /* mb_partial and word_char bits should be already initialized by
2592 if (__glibc_unlikely ((end
!= -1 && start
> end
)
2593 || token
->type
!= OP_CLOSE_DUP_NUM
))
2595 /* First number greater than second. */
2600 if (__glibc_unlikely (RE_DUP_MAX
< (end
== -1 ? start
: end
)))
2608 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2609 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2612 fetch_token (token
, regexp
, syntax
);
2614 if (__glibc_unlikely (elem
== NULL
))
2616 if (__glibc_unlikely (start
== 0 && end
== 0))
2618 postorder (elem
, free_tree
, NULL
);
2622 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2623 if (__glibc_unlikely (start
> 0))
2626 for (i
= 2; i
<= start
; ++i
)
2628 elem
= duplicate_tree (elem
, dfa
);
2629 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2630 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2631 goto parse_dup_op_espace
;
2637 /* Duplicate ELEM before it is marked optional. */
2638 elem
= duplicate_tree (elem
, dfa
);
2639 if (__glibc_unlikely (elem
== NULL
))
2640 goto parse_dup_op_espace
;
2646 if (elem
->token
.type
== SUBEXP
)
2648 uintptr_t subidx
= elem
->token
.opr
.idx
;
2649 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2652 tree
= create_tree (dfa
, elem
, NULL
,
2653 (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2654 if (__glibc_unlikely (tree
== NULL
))
2655 goto parse_dup_op_espace
;
2657 /* This loop is actually executed only when end != -1,
2658 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2659 already created the start+1-th copy. */
2660 if (TYPE_SIGNED (Idx
) || end
!= -1)
2661 for (i
= start
+ 2; i
<= end
; ++i
)
2663 elem
= duplicate_tree (elem
, dfa
);
2664 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2665 if (__glibc_unlikely (elem
== NULL
|| tree
== NULL
))
2666 goto parse_dup_op_espace
;
2668 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2669 if (__glibc_unlikely (tree
== NULL
))
2670 goto parse_dup_op_espace
;
2674 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2678 parse_dup_op_espace
:
2683 /* Size of the names for collating symbol/equivalence_class/character_class.
2684 I'm not sure, but maybe enough. */
2685 #define BRACKET_NAME_BUF_SIZE 32
2689 # ifdef RE_ENABLE_I18N
2690 /* Convert the byte B to the corresponding wide character. In a
2691 unibyte locale, treat B as itself. In a multibyte locale, return
2692 WEOF if B is an encoding error. */
2694 parse_byte (unsigned char b
, re_charset_t
*mbcset
)
2696 return mbcset
== NULL
? b
: __btowc (b
);
2700 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2701 Build the range expression which starts from START_ELEM, and ends
2702 at END_ELEM. The result are written to MBCSET and SBCSET.
2703 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2704 mbcset->range_ends, is a pointer argument since we may
2707 static reg_errcode_t
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax
,
2711 re_charset_t
*mbcset
,
2713 const bracket_elem_t
*start_elem
,
2714 const bracket_elem_t
*end_elem
)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax
,
2718 const bracket_elem_t
*start_elem
,
2719 const bracket_elem_t
*end_elem
)
2720 # endif /* not RE_ENABLE_I18N */
2722 unsigned int start_ch
, end_ch
;
2723 /* Equivalence Classes and Character Classes can't be a range start/end. */
2724 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2725 || start_elem
->type
== CHAR_CLASS
2726 || end_elem
->type
== EQUIV_CLASS
2727 || end_elem
->type
== CHAR_CLASS
))
2730 /* We can handle no multi character collating elements without libc
2732 if (__glibc_unlikely ((start_elem
->type
== COLL_SYM
2733 && strlen ((char *) start_elem
->opr
.name
) > 1)
2734 || (end_elem
->type
== COLL_SYM
2735 && strlen ((char *) end_elem
->opr
.name
) > 1)))
2736 return REG_ECOLLATE
;
2738 # ifdef RE_ENABLE_I18N
2744 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2745 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2747 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2748 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2750 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2751 ? parse_byte (start_ch
, mbcset
) : start_elem
->opr
.wch
);
2752 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2753 ? parse_byte (end_ch
, mbcset
) : end_elem
->opr
.wch
);
2754 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2755 return REG_ECOLLATE
;
2756 else if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2757 && start_wc
> end_wc
))
2760 /* Got valid collation sequence values, add them as a new entry.
2761 However, for !_LIBC we have no collation elements: if the
2762 character set is single byte, the single byte character set
2763 that we build below suffices. parse_bracket_exp passes
2764 no MBCSET if dfa->mb_cur_max == 1. */
2767 /* Check the space of the arrays. */
2768 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
2770 /* There is not enough space, need realloc. */
2771 wchar_t *new_array_start
, *new_array_end
;
2774 /* +1 in case of mbcset->nranges is 0. */
2775 new_nranges
= 2 * mbcset
->nranges
+ 1;
2776 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2777 are NULL if *range_alloc == 0. */
2778 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2780 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2783 if (__glibc_unlikely (new_array_start
== NULL
2784 || new_array_end
== NULL
))
2786 re_free (new_array_start
);
2787 re_free (new_array_end
);
2791 mbcset
->range_starts
= new_array_start
;
2792 mbcset
->range_ends
= new_array_end
;
2793 *range_alloc
= new_nranges
;
2796 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2797 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2800 /* Build the table for single byte characters. */
2801 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2803 if (start_wc
<= wc
&& wc
<= end_wc
)
2804 bitset_set (sbcset
, wc
);
2807 # else /* not RE_ENABLE_I18N */
2810 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2811 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2813 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2814 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2816 if (start_ch
> end_ch
)
2818 /* Build the table for single byte characters. */
2819 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2820 if (start_ch
<= ch
&& ch
<= end_ch
)
2821 bitset_set (sbcset
, ch
);
2823 # endif /* not RE_ENABLE_I18N */
2826 #endif /* not _LIBC */
2829 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2830 Build the collating element which is represented by NAME.
2831 The result are written to MBCSET and SBCSET.
2832 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2833 pointer argument since we may update it. */
2835 static reg_errcode_t
2836 # ifdef RE_ENABLE_I18N
2837 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2838 Idx
*coll_sym_alloc
, const unsigned char *name
)
2839 # else /* not RE_ENABLE_I18N */
2840 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2841 # endif /* not RE_ENABLE_I18N */
2843 size_t name_len
= strlen ((const char *) name
);
2844 if (__glibc_unlikely (name_len
!= 1))
2845 return REG_ECOLLATE
;
2848 bitset_set (sbcset
, name
[0]);
2852 #endif /* not _LIBC */
2854 /* This function parse bracket expression like "[abc]", "[a-c]",
2858 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2859 reg_syntax_t syntax
, reg_errcode_t
*err
)
2862 const unsigned char *collseqmb
;
2863 const char *collseqwc
;
2866 const int32_t *symb_table
;
2867 const unsigned char *extra
;
2869 /* Local function for parse_bracket_exp used in _LIBC environment.
2870 Seek the collating symbol entry corresponding to NAME.
2871 Return the index of the symbol in the SYMB_TABLE,
2872 or -1 if not found. */
2875 __attribute__ ((always_inline
))
2876 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2880 for (elem
= 0; elem
< table_size
; elem
++)
2881 if (symb_table
[2 * elem
] != 0)
2883 int32_t idx
= symb_table
[2 * elem
+ 1];
2884 /* Skip the name of collating element name. */
2885 idx
+= 1 + extra
[idx
];
2886 if (/* Compare the length of the name. */
2887 name_len
== extra
[idx
]
2888 /* Compare the name. */
2889 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2890 /* Yep, this is the entry. */
2896 /* Local function for parse_bracket_exp used in _LIBC environment.
2897 Look up the collation sequence value of BR_ELEM.
2898 Return the value if succeeded, UINT_MAX otherwise. */
2900 auto inline unsigned int
2901 __attribute__ ((always_inline
))
2902 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2904 if (br_elem
->type
== SB_CHAR
)
2907 if (MB_CUR_MAX == 1)
2910 return collseqmb
[br_elem
->opr
.ch
];
2913 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2914 return __collseq_table_lookup (collseqwc
, wc
);
2917 else if (br_elem
->type
== MB_CHAR
)
2920 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2922 else if (br_elem
->type
== COLL_SYM
)
2924 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2928 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2932 /* We found the entry. */
2933 idx
= symb_table
[2 * elem
+ 1];
2934 /* Skip the name of collating element name. */
2935 idx
+= 1 + extra
[idx
];
2936 /* Skip the byte sequence of the collating element. */
2937 idx
+= 1 + extra
[idx
];
2938 /* Adjust for the alignment. */
2939 idx
= (idx
+ 3) & ~3;
2940 /* Skip the multibyte collation sequence value. */
2941 idx
+= sizeof (unsigned int);
2942 /* Skip the wide char sequence of the collating element. */
2943 idx
+= sizeof (unsigned int) *
2944 (1 + *(unsigned int *) (extra
+ idx
));
2945 /* Return the collation sequence value. */
2946 return *(unsigned int *) (extra
+ idx
);
2948 else if (sym_name_len
== 1)
2950 /* No valid character. Match it as a single byte
2952 return collseqmb
[br_elem
->opr
.name
[0]];
2955 else if (sym_name_len
== 1)
2956 return collseqmb
[br_elem
->opr
.name
[0]];
2961 /* Local function for parse_bracket_exp used in _LIBC environment.
2962 Build the range expression which starts from START_ELEM, and ends
2963 at END_ELEM. The result are written to MBCSET and SBCSET.
2964 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2965 mbcset->range_ends, is a pointer argument since we may
2968 auto inline reg_errcode_t
2969 __attribute__ ((always_inline
))
2970 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2971 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2974 uint32_t start_collseq
;
2975 uint32_t end_collseq
;
2977 /* Equivalence Classes and Character Classes can't be a range
2979 if (__glibc_unlikely (start_elem
->type
== EQUIV_CLASS
2980 || start_elem
->type
== CHAR_CLASS
2981 || end_elem
->type
== EQUIV_CLASS
2982 || end_elem
->type
== CHAR_CLASS
))
2985 /* FIXME: Implement rational ranges here, too. */
2986 start_collseq
= lookup_collation_sequence_value (start_elem
);
2987 end_collseq
= lookup_collation_sequence_value (end_elem
);
2988 /* Check start/end collation sequence values. */
2989 if (__glibc_unlikely (start_collseq
== UINT_MAX
2990 || end_collseq
== UINT_MAX
))
2991 return REG_ECOLLATE
;
2992 if (__glibc_unlikely ((syntax
& RE_NO_EMPTY_RANGES
)
2993 && start_collseq
> end_collseq
))
2996 /* Got valid collation sequence values, add them as a new entry.
2997 However, if we have no collation elements, and the character set
2998 is single byte, the single byte character set that we
2999 build below suffices. */
3000 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
3002 /* Check the space of the arrays. */
3003 if (__glibc_unlikely (*range_alloc
== mbcset
->nranges
))
3005 /* There is not enough space, need realloc. */
3006 uint32_t *new_array_start
;
3007 uint32_t *new_array_end
;
3010 /* +1 in case of mbcset->nranges is 0. */
3011 new_nranges
= 2 * mbcset
->nranges
+ 1;
3012 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
3014 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
3017 if (__glibc_unlikely (new_array_start
== NULL
3018 || new_array_end
== NULL
))
3021 mbcset
->range_starts
= new_array_start
;
3022 mbcset
->range_ends
= new_array_end
;
3023 *range_alloc
= new_nranges
;
3026 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
3027 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
3030 /* Build the table for single byte characters. */
3031 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3033 uint32_t ch_collseq
;
3035 if (MB_CUR_MAX == 1)
3038 ch_collseq
= collseqmb
[ch
];
3040 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3041 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3042 bitset_set (sbcset
, ch
);
3047 /* Local function for parse_bracket_exp used in _LIBC environment.
3048 Build the collating element which is represented by NAME.
3049 The result are written to MBCSET and SBCSET.
3050 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3051 pointer argument since we may update it. */
3053 auto inline reg_errcode_t
3054 __attribute__ ((always_inline
))
3055 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3056 Idx
*coll_sym_alloc
, const unsigned char *name
)
3059 size_t name_len
= strlen ((const char *) name
);
3062 elem
= seek_collating_symbol_entry (name
, name_len
);
3065 /* We found the entry. */
3066 idx
= symb_table
[2 * elem
+ 1];
3067 /* Skip the name of collating element name. */
3068 idx
+= 1 + extra
[idx
];
3070 else if (name_len
== 1)
3072 /* No valid character, treat it as a normal
3074 bitset_set (sbcset
, name
[0]);
3078 return REG_ECOLLATE
;
3080 /* Got valid collation sequence, add it as a new entry. */
3081 /* Check the space of the arrays. */
3082 if (__glibc_unlikely (*coll_sym_alloc
== mbcset
->ncoll_syms
))
3084 /* Not enough, realloc it. */
3085 /* +1 in case of mbcset->ncoll_syms is 0. */
3086 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3087 /* Use realloc since mbcset->coll_syms is NULL
3089 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3090 new_coll_sym_alloc
);
3091 if (__glibc_unlikely (new_coll_syms
== NULL
))
3093 mbcset
->coll_syms
= new_coll_syms
;
3094 *coll_sym_alloc
= new_coll_sym_alloc
;
3096 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3101 if (__glibc_unlikely (name_len
!= 1))
3102 return REG_ECOLLATE
;
3105 bitset_set (sbcset
, name
[0]);
3112 re_token_t br_token
;
3113 re_bitset_ptr_t sbcset
;
3114 #ifdef RE_ENABLE_I18N
3115 re_charset_t
*mbcset
;
3116 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3117 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3118 #endif /* not RE_ENABLE_I18N */
3119 bool non_match
= false;
3120 bin_tree_t
*work_tree
;
3122 bool first_round
= true;
3124 collseqmb
= (const unsigned char *)
3125 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3126 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3132 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3133 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3134 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3135 _NL_COLLATE_SYMB_TABLEMB
);
3136 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3137 _NL_COLLATE_SYMB_EXTRAMB
);
3140 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3141 #ifdef RE_ENABLE_I18N
3142 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3143 #endif /* RE_ENABLE_I18N */
3144 #ifdef RE_ENABLE_I18N
3145 if (__glibc_unlikely (sbcset
== NULL
|| mbcset
== NULL
))
3147 if (__glibc_unlikely (sbcset
== NULL
))
3148 #endif /* RE_ENABLE_I18N */
3151 #ifdef RE_ENABLE_I18N
3158 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3159 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3162 goto parse_bracket_exp_free_return
;
3164 if (token
->type
== OP_NON_MATCH_LIST
)
3166 #ifdef RE_ENABLE_I18N
3167 mbcset
->non_match
= 1;
3168 #endif /* not RE_ENABLE_I18N */
3170 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3171 bitset_set (sbcset
, '\n');
3172 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3173 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3174 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3177 goto parse_bracket_exp_free_return
;
3181 /* We treat the first ']' as a normal character. */
3182 if (token
->type
== OP_CLOSE_BRACKET
)
3183 token
->type
= CHARACTER
;
3187 bracket_elem_t start_elem
, end_elem
;
3188 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3189 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3192 bool is_range_exp
= false;
3195 start_elem
.opr
.name
= start_name_buf
;
3196 start_elem
.type
= COLL_SYM
;
3197 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3198 syntax
, first_round
);
3199 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3202 goto parse_bracket_exp_free_return
;
3204 first_round
= false;
3206 /* Get information about the next token. We need it in any case. */
3207 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3209 /* Do not check for ranges if we know they are not allowed. */
3210 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3212 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3215 goto parse_bracket_exp_free_return
;
3217 if (token
->type
== OP_CHARSET_RANGE
)
3219 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3220 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3221 if (__glibc_unlikely (token2
.type
== END_OF_RE
))
3224 goto parse_bracket_exp_free_return
;
3226 if (token2
.type
== OP_CLOSE_BRACKET
)
3228 /* We treat the last '-' as a normal character. */
3229 re_string_skip_bytes (regexp
, -token_len
);
3230 token
->type
= CHARACTER
;
3233 is_range_exp
= true;
3237 if (is_range_exp
== true)
3239 end_elem
.opr
.name
= end_name_buf
;
3240 end_elem
.type
= COLL_SYM
;
3241 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3243 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3246 goto parse_bracket_exp_free_return
;
3249 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3252 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3253 &start_elem
, &end_elem
);
3255 # ifdef RE_ENABLE_I18N
3256 *err
= build_range_exp (syntax
, sbcset
,
3257 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3258 &range_alloc
, &start_elem
, &end_elem
);
3260 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3262 #endif /* RE_ENABLE_I18N */
3263 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3264 goto parse_bracket_exp_free_return
;
3268 switch (start_elem
.type
)
3271 bitset_set (sbcset
, start_elem
.opr
.ch
);
3273 #ifdef RE_ENABLE_I18N
3275 /* Check whether the array has enough space. */
3276 if (__glibc_unlikely (mbchar_alloc
== mbcset
->nmbchars
))
3278 wchar_t *new_mbchars
;
3279 /* Not enough, realloc it. */
3280 /* +1 in case of mbcset->nmbchars is 0. */
3281 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3282 /* Use realloc since array is NULL if *alloc == 0. */
3283 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3285 if (__glibc_unlikely (new_mbchars
== NULL
))
3286 goto parse_bracket_exp_espace
;
3287 mbcset
->mbchars
= new_mbchars
;
3289 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3291 #endif /* RE_ENABLE_I18N */
3293 *err
= build_equiv_class (sbcset
,
3294 #ifdef RE_ENABLE_I18N
3295 mbcset
, &equiv_class_alloc
,
3296 #endif /* RE_ENABLE_I18N */
3297 start_elem
.opr
.name
);
3298 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3299 goto parse_bracket_exp_free_return
;
3302 *err
= build_collating_symbol (sbcset
,
3303 #ifdef RE_ENABLE_I18N
3304 mbcset
, &coll_sym_alloc
,
3305 #endif /* RE_ENABLE_I18N */
3306 start_elem
.opr
.name
);
3307 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3308 goto parse_bracket_exp_free_return
;
3311 *err
= build_charclass (regexp
->trans
, sbcset
,
3312 #ifdef RE_ENABLE_I18N
3313 mbcset
, &char_class_alloc
,
3314 #endif /* RE_ENABLE_I18N */
3315 (const char *) start_elem
.opr
.name
,
3317 if (__glibc_unlikely (*err
!= REG_NOERROR
))
3318 goto parse_bracket_exp_free_return
;
3325 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3328 goto parse_bracket_exp_free_return
;
3330 if (token
->type
== OP_CLOSE_BRACKET
)
3334 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3336 /* If it is non-matching list. */
3338 bitset_not (sbcset
);
3340 #ifdef RE_ENABLE_I18N
3341 /* Ensure only single byte characters are set. */
3342 if (dfa
->mb_cur_max
> 1)
3343 bitset_mask (sbcset
, dfa
->sb_char
);
3345 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3346 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3347 || mbcset
->non_match
)))
3349 bin_tree_t
*mbc_tree
;
3351 /* Build a tree for complex bracket. */
3352 dfa
->has_mb_node
= 1;
3353 br_token
.type
= COMPLEX_BRACKET
;
3354 br_token
.opr
.mbcset
= mbcset
;
3355 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3356 if (__glibc_unlikely (mbc_tree
== NULL
))
3357 goto parse_bracket_exp_espace
;
3358 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3359 if (sbcset
[sbc_idx
])
3361 /* If there are no bits set in sbcset, there is no point
3362 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3363 if (sbc_idx
< BITSET_WORDS
)
3365 /* Build a tree for simple bracket. */
3366 br_token
.type
= SIMPLE_BRACKET
;
3367 br_token
.opr
.sbcset
= sbcset
;
3368 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3369 if (__glibc_unlikely (work_tree
== NULL
))
3370 goto parse_bracket_exp_espace
;
3372 /* Then join them by ALT node. */
3373 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3374 if (__glibc_unlikely (work_tree
== NULL
))
3375 goto parse_bracket_exp_espace
;
3380 work_tree
= mbc_tree
;
3384 #endif /* not RE_ENABLE_I18N */
3386 #ifdef RE_ENABLE_I18N
3387 free_charset (mbcset
);
3389 /* Build a tree for simple bracket. */
3390 br_token
.type
= SIMPLE_BRACKET
;
3391 br_token
.opr
.sbcset
= sbcset
;
3392 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3393 if (__glibc_unlikely (work_tree
== NULL
))
3394 goto parse_bracket_exp_espace
;
3398 parse_bracket_exp_espace
:
3400 parse_bracket_exp_free_return
:
3402 #ifdef RE_ENABLE_I18N
3403 free_charset (mbcset
);
3404 #endif /* RE_ENABLE_I18N */
3408 /* Parse an element in the bracket expression. */
3410 static reg_errcode_t
3411 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3412 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3413 reg_syntax_t syntax
, bool accept_hyphen
)
3415 #ifdef RE_ENABLE_I18N
3417 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3418 if (cur_char_size
> 1)
3420 elem
->type
= MB_CHAR
;
3421 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3422 re_string_skip_bytes (regexp
, cur_char_size
);
3425 #endif /* RE_ENABLE_I18N */
3426 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3427 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3428 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3429 return parse_bracket_symbol (elem
, regexp
, token
);
3430 if (__glibc_unlikely (token
->type
== OP_CHARSET_RANGE
) && !accept_hyphen
)
3432 /* A '-' must only appear as anything but a range indicator before
3433 the closing bracket. Everything else is an error. */
3435 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3436 if (token2
.type
!= OP_CLOSE_BRACKET
)
3437 /* The actual error value is not standardized since this whole
3438 case is undefined. But ERANGE makes good sense. */
3441 elem
->type
= SB_CHAR
;
3442 elem
->opr
.ch
= token
->opr
.c
;
3446 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3447 such as [:<character_class>:], [.<collating_element>.], and
3448 [=<equivalent_class>=]. */
3450 static reg_errcode_t
3451 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3454 unsigned char ch
, delim
= token
->opr
.c
;
3456 if (re_string_eoi(regexp
))
3460 if (i
>= BRACKET_NAME_BUF_SIZE
)
3462 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3463 ch
= re_string_fetch_byte_case (regexp
);
3465 ch
= re_string_fetch_byte (regexp
);
3466 if (re_string_eoi(regexp
))
3468 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3470 elem
->opr
.name
[i
] = ch
;
3472 re_string_skip_bytes (regexp
, 1);
3473 elem
->opr
.name
[i
] = '\0';
3474 switch (token
->type
)
3476 case OP_OPEN_COLL_ELEM
:
3477 elem
->type
= COLL_SYM
;
3479 case OP_OPEN_EQUIV_CLASS
:
3480 elem
->type
= EQUIV_CLASS
;
3482 case OP_OPEN_CHAR_CLASS
:
3483 elem
->type
= CHAR_CLASS
;
3491 /* Helper function for parse_bracket_exp.
3492 Build the equivalence class which is represented by NAME.
3493 The result are written to MBCSET and SBCSET.
3494 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3495 is a pointer argument since we may update it. */
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3500 Idx
*equiv_class_alloc
, const unsigned char *name
)
3501 #else /* not RE_ENABLE_I18N */
3502 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3503 #endif /* not RE_ENABLE_I18N */
3506 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3509 const int32_t *table
, *indirect
;
3510 const unsigned char *weights
, *extra
, *cp
;
3511 unsigned char char_buf
[2];
3515 /* Calculate the index for equivalence class. */
3517 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3518 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3519 _NL_COLLATE_WEIGHTMB
);
3520 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3521 _NL_COLLATE_EXTRAMB
);
3522 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3523 _NL_COLLATE_INDIRECTMB
);
3524 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3525 if (__glibc_unlikely (idx1
== 0 || *cp
!= '\0'))
3526 /* This isn't a valid character. */
3527 return REG_ECOLLATE
;
3529 /* Build single byte matching table for this equivalence class. */
3530 len
= weights
[idx1
& 0xffffff];
3531 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3535 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3540 /* This isn't a valid character. */
3542 /* Compare only if the length matches and the collation rule
3543 index is the same. */
3544 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24)
3545 && memcmp (weights
+ (idx1
& 0xffffff) + 1,
3546 weights
+ (idx2
& 0xffffff) + 1, len
) == 0)
3547 bitset_set (sbcset
, ch
);
3549 /* Check whether the array has enough space. */
3550 if (__glibc_unlikely (*equiv_class_alloc
== mbcset
->nequiv_classes
))
3552 /* Not enough, realloc it. */
3553 /* +1 in case of mbcset->nequiv_classes is 0. */
3554 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3555 /* Use realloc since the array is NULL if *alloc == 0. */
3556 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3558 new_equiv_class_alloc
);
3559 if (__glibc_unlikely (new_equiv_classes
== NULL
))
3561 mbcset
->equiv_classes
= new_equiv_classes
;
3562 *equiv_class_alloc
= new_equiv_class_alloc
;
3564 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3569 if (__glibc_unlikely (strlen ((const char *) name
) != 1))
3570 return REG_ECOLLATE
;
3571 bitset_set (sbcset
, *name
);
3576 /* Helper function for parse_bracket_exp.
3577 Build the character class which is represented by NAME.
3578 The result are written to MBCSET and SBCSET.
3579 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3580 is a pointer argument since we may update it. */
3582 static reg_errcode_t
3583 #ifdef RE_ENABLE_I18N
3584 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3585 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3586 const char *class_name
, reg_syntax_t syntax
)
3587 #else /* not RE_ENABLE_I18N */
3588 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3589 const char *class_name
, reg_syntax_t syntax
)
3590 #endif /* not RE_ENABLE_I18N */
3593 const char *name
= class_name
;
3595 /* In case of REG_ICASE "upper" and "lower" match the both of
3596 upper and lower cases. */
3597 if ((syntax
& RE_ICASE
)
3598 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3601 #ifdef RE_ENABLE_I18N
3602 /* Check the space of the arrays. */
3603 if (__glibc_unlikely (*char_class_alloc
== mbcset
->nchar_classes
))
3605 /* Not enough, realloc it. */
3606 /* +1 in case of mbcset->nchar_classes is 0. */
3607 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3608 /* Use realloc since array is NULL if *alloc == 0. */
3609 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3610 new_char_class_alloc
);
3611 if (__glibc_unlikely (new_char_classes
== NULL
))
3613 mbcset
->char_classes
= new_char_classes
;
3614 *char_class_alloc
= new_char_class_alloc
;
3616 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3617 #endif /* RE_ENABLE_I18N */
3619 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3621 if (__glibc_unlikely (trans != NULL)) \
3623 for (i = 0; i < SBC_MAX; ++i) \
3624 if (ctype_func (i)) \
3625 bitset_set (sbcset, trans[i]); \
3629 for (i = 0; i < SBC_MAX; ++i) \
3630 if (ctype_func (i)) \
3631 bitset_set (sbcset, i); \
3635 if (strcmp (name
, "alnum") == 0)
3636 BUILD_CHARCLASS_LOOP (isalnum
);
3637 else if (strcmp (name
, "cntrl") == 0)
3638 BUILD_CHARCLASS_LOOP (iscntrl
);
3639 else if (strcmp (name
, "lower") == 0)
3640 BUILD_CHARCLASS_LOOP (islower
);
3641 else if (strcmp (name
, "space") == 0)
3642 BUILD_CHARCLASS_LOOP (isspace
);
3643 else if (strcmp (name
, "alpha") == 0)
3644 BUILD_CHARCLASS_LOOP (isalpha
);
3645 else if (strcmp (name
, "digit") == 0)
3646 BUILD_CHARCLASS_LOOP (isdigit
);
3647 else if (strcmp (name
, "print") == 0)
3648 BUILD_CHARCLASS_LOOP (isprint
);
3649 else if (strcmp (name
, "upper") == 0)
3650 BUILD_CHARCLASS_LOOP (isupper
);
3651 else if (strcmp (name
, "blank") == 0)
3652 BUILD_CHARCLASS_LOOP (isblank
);
3653 else if (strcmp (name
, "graph") == 0)
3654 BUILD_CHARCLASS_LOOP (isgraph
);
3655 else if (strcmp (name
, "punct") == 0)
3656 BUILD_CHARCLASS_LOOP (ispunct
);
3657 else if (strcmp (name
, "xdigit") == 0)
3658 BUILD_CHARCLASS_LOOP (isxdigit
);
3666 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3667 const char *class_name
,
3668 const char *extra
, bool non_match
,
3671 re_bitset_ptr_t sbcset
;
3672 #ifdef RE_ENABLE_I18N
3673 re_charset_t
*mbcset
;
3675 #endif /* not RE_ENABLE_I18N */
3677 re_token_t br_token
;
3680 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3681 if (__glibc_unlikely (sbcset
== NULL
))
3686 #ifdef RE_ENABLE_I18N
3687 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3688 if (__glibc_unlikely (mbcset
== NULL
))
3694 mbcset
->non_match
= non_match
;
3695 #endif /* RE_ENABLE_I18N */
3697 /* We don't care the syntax in this case. */
3698 ret
= build_charclass (trans
, sbcset
,
3699 #ifdef RE_ENABLE_I18N
3701 #endif /* RE_ENABLE_I18N */
3704 if (__glibc_unlikely (ret
!= REG_NOERROR
))
3707 #ifdef RE_ENABLE_I18N
3708 free_charset (mbcset
);
3709 #endif /* RE_ENABLE_I18N */
3713 /* \w match '_' also. */
3714 for (; *extra
; extra
++)
3715 bitset_set (sbcset
, *extra
);
3717 /* If it is non-matching list. */
3719 bitset_not (sbcset
);
3721 #ifdef RE_ENABLE_I18N
3722 /* Ensure only single byte characters are set. */
3723 if (dfa
->mb_cur_max
> 1)
3724 bitset_mask (sbcset
, dfa
->sb_char
);
3727 /* Build a tree for simple bracket. */
3728 #if defined GCC_LINT || defined lint
3729 memset (&br_token
, 0, sizeof br_token
);
3731 br_token
.type
= SIMPLE_BRACKET
;
3732 br_token
.opr
.sbcset
= sbcset
;
3733 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3734 if (__glibc_unlikely (tree
== NULL
))
3735 goto build_word_op_espace
;
3737 #ifdef RE_ENABLE_I18N
3738 if (dfa
->mb_cur_max
> 1)
3740 bin_tree_t
*mbc_tree
;
3741 /* Build a tree for complex bracket. */
3742 br_token
.type
= COMPLEX_BRACKET
;
3743 br_token
.opr
.mbcset
= mbcset
;
3744 dfa
->has_mb_node
= 1;
3745 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3746 if (__glibc_unlikely (mbc_tree
== NULL
))
3747 goto build_word_op_espace
;
3748 /* Then join them by ALT node. */
3749 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3750 if (__glibc_likely (mbc_tree
!= NULL
))
3755 free_charset (mbcset
);
3758 #else /* not RE_ENABLE_I18N */
3760 #endif /* not RE_ENABLE_I18N */
3762 build_word_op_espace
:
3764 #ifdef RE_ENABLE_I18N
3765 free_charset (mbcset
);
3766 #endif /* RE_ENABLE_I18N */
3771 /* This is intended for the expressions like "a{1,3}".
3772 Fetch a number from 'input', and return the number.
3773 Return -1 if the number field is empty like "{,1}".
3774 Return RE_DUP_MAX + 1 if the number field is too large.
3775 Return -2 if an error occurred. */
3778 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3784 fetch_token (token
, input
, syntax
);
3786 if (__glibc_unlikely (token
->type
== END_OF_RE
))
3788 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3790 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3794 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3799 #ifdef RE_ENABLE_I18N
3801 free_charset (re_charset_t
*cset
)
3803 re_free (cset
->mbchars
);
3805 re_free (cset
->coll_syms
);
3806 re_free (cset
->equiv_classes
);
3808 re_free (cset
->range_starts
);
3809 re_free (cset
->range_ends
);
3810 re_free (cset
->char_classes
);
3813 #endif /* RE_ENABLE_I18N */
3815 /* Functions for binary tree operation. */
3817 /* Create a tree node. */
3820 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3821 re_token_type_t type
)
3824 #if defined GCC_LINT || defined lint
3825 memset (&t
, 0, sizeof t
);
3828 return create_token_tree (dfa
, left
, right
, &t
);
3832 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3833 const re_token_t
*token
)
3836 if (__glibc_unlikely (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
))
3838 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3840 if (storage
== NULL
)
3842 storage
->next
= dfa
->str_tree_storage
;
3843 dfa
->str_tree_storage
= storage
;
3844 dfa
->str_tree_storage_idx
= 0;
3846 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3848 tree
->parent
= NULL
;
3850 tree
->right
= right
;
3851 tree
->token
= *token
;
3852 tree
->token
.duplicated
= 0;
3853 tree
->token
.opt_subexp
= 0;
3856 tree
->node_idx
= -1;
3859 left
->parent
= tree
;
3861 right
->parent
= tree
;
3865 /* Mark the tree SRC as an optional subexpression.
3866 To be called from preorder or postorder. */
3868 static reg_errcode_t
3869 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3871 Idx idx
= (uintptr_t) extra
;
3872 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3873 node
->token
.opt_subexp
= 1;
3878 /* Free the allocated memory inside NODE. */
3881 free_token (re_token_t
*node
)
3883 #ifdef RE_ENABLE_I18N
3884 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3885 free_charset (node
->opr
.mbcset
);
3887 #endif /* RE_ENABLE_I18N */
3888 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3889 re_free (node
->opr
.sbcset
);
3892 /* Worker function for tree walking. Free the allocated memory inside NODE
3893 and its children. */
3895 static reg_errcode_t
3896 free_tree (void *extra
, bin_tree_t
*node
)
3898 free_token (&node
->token
);
3903 /* Duplicate the node SRC, and return new node. This is a preorder
3904 visit similar to the one implemented by the generic visitor, but
3905 we need more infrastructure to maintain two parallel trees --- so,
3906 it's easier to duplicate. */
3909 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3911 const bin_tree_t
*node
;
3912 bin_tree_t
*dup_root
;
3913 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3915 for (node
= root
; ; )
3917 /* Create a new tree and link it back to the current parent. */
3918 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3921 (*p_new
)->parent
= dup_node
;
3922 (*p_new
)->token
.duplicated
= 1;
3925 /* Go to the left node, or up and to the right. */
3929 p_new
= &dup_node
->left
;
3933 const bin_tree_t
*prev
= NULL
;
3934 while (node
->right
== prev
|| node
->right
== NULL
)
3937 node
= node
->parent
;
3938 dup_node
= dup_node
->parent
;
3943 p_new
= &dup_node
->right
;