1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
31 static void free_charset (re_charset_t
*cset
);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t
*preg
);
34 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
36 static void optimize_utf8 (re_dfa_t
*dfa
);
38 static reg_errcode_t
analyze (regex_t
*preg
);
39 static reg_errcode_t
preorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
postorder (bin_tree_t
*root
,
43 reg_errcode_t (fn (void *, bin_tree_t
*)),
45 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
47 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
49 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
52 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
53 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
54 unsigned int constraint
);
55 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
56 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
58 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
59 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
61 static int peek_token (re_token_t
*token
, re_string_t
*input
,
62 reg_syntax_t syntax
) internal_function
;
63 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
64 reg_syntax_t syntax
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
75 re_token_t
*token
, reg_syntax_t syntax
,
76 Idx nest
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
78 re_dfa_t
*dfa
, re_token_t
*token
,
79 reg_syntax_t syntax
, reg_errcode_t
*err
);
80 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
81 re_token_t
*token
, reg_syntax_t syntax
,
83 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
85 re_token_t
*token
, int token_len
,
89 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
93 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
95 Idx
*equiv_class_alloc
,
96 const unsigned char *name
);
97 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
100 Idx
*char_class_alloc
,
101 const char *class_name
,
102 reg_syntax_t syntax
);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
105 const unsigned char *name
);
106 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 const char *class_name
,
109 reg_syntax_t syntax
);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
112 RE_TRANSLATE_TYPE trans
,
113 const char *class_name
,
115 bool non_match
, reg_errcode_t
*err
);
116 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 re_token_type_t type
);
119 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
120 bin_tree_t
*left
, bin_tree_t
*right
,
121 const re_token_t
*token
);
122 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
123 static void free_token (re_token_t
*node
);
124 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
125 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid
[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx
[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
217 re_compile_pattern (const char *pattern
, size_t length
,
218 struct re_pattern_buffer
*bufp
)
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
237 weak_alias (__re_compile_pattern
, re_compile_pattern
)
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options
;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (reg_syntax_t syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
270 re_dfa_t
*dfa
= bufp
->buffer
;
271 char *fastmap
= bufp
->fastmap
;
273 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
274 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_word
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_nl
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
281 bufp
->fastmap_accurate
= 1;
285 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
289 __attribute__ ((always_inline
))
290 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
294 fastmap
[tolower (ch
)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
304 re_dfa_t
*dfa
= bufp
->buffer
;
306 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
307 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
309 Idx node
= init_state
->nodes
.elems
[node_cnt
];
310 re_token_type_t type
= dfa
->nodes
[node
].type
;
312 if (type
== CHARACTER
)
314 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
318 unsigned char buf
[MB_LEN_MAX
];
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, false, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
353 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
357 /* See if we have to try all bytes which start multiple collation
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
364 && (cset
->ncoll_syms
|| cset
->nranges
))
366 const int32_t *table
= (const int32_t *)
367 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
368 for (i
= 0; i
< SBC_MAX
; ++i
)
370 re_set_fastmap (fastmap
, icase
, i
);
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa
->mb_cur_max
> 1
379 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
381 || cset
->nequiv_classes
389 memset (&mbs
, 0, sizeof (mbs
));
390 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
391 re_set_fastmap (fastmap
, false, (int) c
);
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 memset (&state
, '\0', sizeof (state
));
404 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
405 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
406 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
408 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
410 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
415 #endif /* RE_ENABLE_I18N */
416 else if (type
== OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type
== OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type
== END_OF_RE
)
422 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
423 if (type
== END_OF_RE
)
424 bufp
->can_be_null
= 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t
*_Restrict_ preg
, const char *_Restrict_ pattern
, int cflags
)
470 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC
);
477 /* Try to allocate space for the fastmap. */
478 preg
->fastmap
= re_malloc (char, SBC_MAX
);
479 if (BE (preg
->fastmap
== NULL
, 0))
482 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags
& REG_NEWLINE
)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax
&= ~RE_DOT_NEWLINE
;
488 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
489 /* It also changes the matching behavior. */
490 preg
->newline_anchor
= 1;
493 preg
->newline_anchor
= 0;
494 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
495 preg
->translate
= NULL
;
497 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret
== REG_ERPAREN
)
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret
== REG_NOERROR
, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg
);
511 /* Some error occurred while compiling the expression. */
512 re_free (preg
->fastmap
);
513 preg
->fastmap
= NULL
;
519 weak_alias (__regcomp
, regcomp
)
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523 from either regcomp or regexec. We don't use PREG here. */
526 regerror (int errcode
, const regex_t
*_Restrict_ preg
, char *_Restrict_ errbuf
,
533 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
534 / sizeof (__re_error_msgid_idx
[0])), 0))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
541 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
543 msg_size
= strlen (msg
) + 1; /* Includes the null. */
545 if (BE (errbuf_size
!= 0, 1))
547 size_t cpy_size
= msg_size
;
548 if (BE (msg_size
> errbuf_size
, 0))
550 cpy_size
= errbuf_size
- 1;
551 errbuf
[cpy_size
] = '\0';
553 memcpy (errbuf
, msg
, cpy_size
);
559 weak_alias (__regerror
, regerror
)
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565 UTF-8 is used. Otherwise we would allocate memory just to initialize
566 it the same all the time. UTF-8 is the preferred encoding so this is
567 a worthwhile optimization. */
568 static const bitset_t utf8_sb_map
=
570 /* Set the first 128 bits. */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
574 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 # error "bitset_word_t is narrower than 32 bits"
576 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
578 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
580 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
584 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
586 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
593 free_dfa_content (re_dfa_t
*dfa
)
598 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
599 free_token (dfa
->nodes
+ i
);
600 re_free (dfa
->nexts
);
601 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 if (dfa
->eclosures
!= NULL
)
604 re_node_set_free (dfa
->eclosures
+ i
);
605 if (dfa
->inveclosures
!= NULL
)
606 re_node_set_free (dfa
->inveclosures
+ i
);
607 if (dfa
->edests
!= NULL
)
608 re_node_set_free (dfa
->edests
+ i
);
610 re_free (dfa
->edests
);
611 re_free (dfa
->eclosures
);
612 re_free (dfa
->inveclosures
);
613 re_free (dfa
->nodes
);
615 if (dfa
->state_table
)
616 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
618 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
619 for (j
= 0; j
< entry
->num
; ++j
)
621 re_dfastate_t
*state
= entry
->array
[j
];
624 re_free (entry
->array
);
626 re_free (dfa
->state_table
);
627 #ifdef RE_ENABLE_I18N
628 if (dfa
->sb_char
!= utf8_sb_map
)
629 re_free (dfa
->sb_char
);
631 re_free (dfa
->subexp_map
);
633 re_free (dfa
->re_str
);
640 /* Free dynamically allocated space used by PREG. */
643 regfree (regex_t
*preg
)
645 re_dfa_t
*dfa
= preg
->buffer
;
646 if (BE (dfa
!= NULL
, 1))
648 lock_fini (dfa
->lock
);
649 free_dfa_content (dfa
);
654 re_free (preg
->fastmap
);
655 preg
->fastmap
= NULL
;
657 re_free (preg
->translate
);
658 preg
->translate
= NULL
;
661 weak_alias (__regfree
, regfree
)
664 /* Entry points compatible with 4.2 BSD regex library. We don't define
665 them unless specifically requested. */
667 #if defined _REGEX_RE_COMP || defined _LIBC
669 /* BSD has one and only one pattern buffer. */
670 static struct re_pattern_buffer re_comp_buf
;
674 /* Make these definitions weak in libc, so POSIX programs can redefine
675 these names if they don't use our functions, and still use
676 regcomp/regexec above without link errors. */
679 re_comp (const char *s
)
686 if (!re_comp_buf
.buffer
)
687 return gettext ("No previous regular expression");
691 if (re_comp_buf
.buffer
)
693 fastmap
= re_comp_buf
.fastmap
;
694 re_comp_buf
.fastmap
= NULL
;
695 __regfree (&re_comp_buf
);
696 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
697 re_comp_buf
.fastmap
= fastmap
;
700 if (re_comp_buf
.fastmap
== NULL
)
702 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
703 if (re_comp_buf
.fastmap
== NULL
)
704 return (char *) gettext (__re_error_msgid
705 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
708 /* Since 're_exec' always passes NULL for the 'regs' argument, we
709 don't need to initialize the pattern buffer fields which affect it. */
711 /* Match anchors at newlines. */
712 re_comp_buf
.newline_anchor
= 1;
714 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
719 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
720 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
724 libc_freeres_fn (free_mem
)
726 __regfree (&re_comp_buf
);
730 #endif /* _REGEX_RE_COMP */
732 /* Internal entry point.
733 Compile the regular expression PATTERN, whose length is LENGTH.
734 SYNTAX indicate regular expression's syntax. */
737 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
740 reg_errcode_t err
= REG_NOERROR
;
744 /* Initialize the pattern buffer. */
745 preg
->fastmap_accurate
= 0;
746 preg
->syntax
= syntax
;
747 preg
->not_bol
= preg
->not_eol
= 0;
750 preg
->can_be_null
= 0;
751 preg
->regs_allocated
= REGS_UNALLOCATED
;
753 /* Initialize the dfa. */
755 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
757 /* If zero allocated, but buffer is non-null, try to realloc
758 enough space. This loses if buffer's address is bogus, but
759 that is the user's responsibility. If ->buffer is NULL this
760 is a simple allocation. */
761 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
764 preg
->allocated
= sizeof (re_dfa_t
);
767 preg
->used
= sizeof (re_dfa_t
);
769 err
= init_dfa (dfa
, length
);
770 if (BE (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0, 0))
772 if (BE (err
!= REG_NOERROR
, 0))
774 free_dfa_content (dfa
);
780 /* Note: length+1 will not overflow since it is checked in init_dfa. */
781 dfa
->re_str
= re_malloc (char, length
+ 1);
782 strncpy (dfa
->re_str
, pattern
, length
+ 1);
785 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
786 (syntax
& RE_ICASE
) != 0, dfa
);
787 if (BE (err
!= REG_NOERROR
, 0))
789 re_compile_internal_free_return
:
790 free_workarea_compile (preg
);
791 re_string_destruct (®exp
);
792 lock_fini (dfa
->lock
);
793 free_dfa_content (dfa
);
799 /* Parse the regular expression, and build a structure tree. */
801 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
802 if (BE (dfa
->str_tree
== NULL
, 0))
803 goto re_compile_internal_free_return
;
805 /* Analyze the tree and create the nfa. */
806 err
= analyze (preg
);
807 if (BE (err
!= REG_NOERROR
, 0))
808 goto re_compile_internal_free_return
;
810 #ifdef RE_ENABLE_I18N
811 /* If possible, do searching in single byte encoding to speed things up. */
812 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
816 /* Then create the initial state of the dfa. */
817 err
= create_initial_state (dfa
);
819 /* Release work areas. */
820 free_workarea_compile (preg
);
821 re_string_destruct (®exp
);
823 if (BE (err
!= REG_NOERROR
, 0))
825 lock_fini (dfa
->lock
);
826 free_dfa_content (dfa
);
834 /* Initialize DFA. We use the length of the regular expression PAT_LEN
835 as the initial length of some arrays. */
838 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
840 __re_size_t table_size
;
842 const char *codeset_name
;
844 #ifdef RE_ENABLE_I18N
845 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
847 size_t max_i18n_object_size
= 0;
849 size_t max_object_size
=
850 MAX (sizeof (struct re_state_table_entry
),
851 MAX (sizeof (re_token_t
),
852 MAX (sizeof (re_node_set
),
853 MAX (sizeof (regmatch_t
),
854 max_i18n_object_size
))));
856 memset (dfa
, '\0', sizeof (re_dfa_t
));
858 /* Force allocation of str_tree_storage the first time. */
859 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
861 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
862 calculation below, and for similar doubling calculations
863 elsewhere. And it's <= rather than <, because some of the
864 doubling calculations add 1 afterwards. */
865 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2 <= pat_len
, 0))
868 dfa
->nodes_alloc
= pat_len
+ 1;
869 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
871 /* table_size = 2 ^ ceil(log pat_len) */
872 for (table_size
= 1; ; table_size
<<= 1)
873 if (table_size
> pat_len
)
876 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
877 dfa
->state_hash_mask
= table_size
- 1;
879 dfa
->mb_cur_max
= MB_CUR_MAX
;
881 if (dfa
->mb_cur_max
== 6
882 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
884 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
887 codeset_name
= nl_langinfo (CODESET
);
888 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
889 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
890 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
891 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
894 /* We check exhaustively in the loop below if this charset is a
895 superset of ASCII. */
896 dfa
->map_notascii
= 0;
899 #ifdef RE_ENABLE_I18N
900 if (dfa
->mb_cur_max
> 1)
903 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
908 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
909 if (BE (dfa
->sb_char
== NULL
, 0))
912 /* Set the bits corresponding to single byte chars. */
913 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
914 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
916 wint_t wch
= __btowc (ch
);
918 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
920 if (isascii (ch
) && wch
!= ch
)
921 dfa
->map_notascii
= 1;
928 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
933 /* Initialize WORD_CHAR table, which indicate which character is
934 "word". In this case "word" means that it is the word construction
935 character used by some operators like "\<", "\>", etc. */
939 init_word_char (re_dfa_t
*dfa
)
944 dfa
->word_ops_used
= 1;
945 if (BE (dfa
->map_notascii
== 0, 1))
947 bitset_word_t bits0
= 0x00000000;
948 bitset_word_t bits1
= 0x03ff0000;
949 bitset_word_t bits2
= 0x87fffffe;
950 bitset_word_t bits3
= 0x07fffffe;
951 if (BITSET_WORD_BITS
== 64)
953 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
954 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
957 else if (BITSET_WORD_BITS
== 32)
959 dfa
->word_char
[0] = bits0
;
960 dfa
->word_char
[1] = bits1
;
961 dfa
->word_char
[2] = bits2
;
962 dfa
->word_char
[3] = bits3
;
969 if (BE (dfa
->is_utf8
, 1))
971 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
977 for (; i
< BITSET_WORDS
; ++i
)
978 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
979 if (isalnum (ch
) || ch
== '_')
980 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
983 /* Free the work area which are only used while compiling. */
986 free_workarea_compile (regex_t
*preg
)
988 re_dfa_t
*dfa
= preg
->buffer
;
989 bin_tree_storage_t
*storage
, *next
;
990 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
992 next
= storage
->next
;
995 dfa
->str_tree_storage
= NULL
;
996 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
997 dfa
->str_tree
= NULL
;
998 re_free (dfa
->org_indices
);
999 dfa
->org_indices
= NULL
;
1002 /* Create initial states for all contexts. */
1004 static reg_errcode_t
1005 create_initial_state (re_dfa_t
*dfa
)
1009 re_node_set init_nodes
;
1011 /* Initial states have the epsilon closure of the node which is
1012 the first node of the regular expression. */
1013 first
= dfa
->str_tree
->first
->node_idx
;
1014 dfa
->init_node
= first
;
1015 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1016 if (BE (err
!= REG_NOERROR
, 0))
1019 /* The back-references which are in initial states can epsilon transit,
1020 since in this case all of the subexpressions can be null.
1021 Then we add epsilon closures of the nodes which are the next nodes of
1022 the back-references. */
1023 if (dfa
->nbackref
> 0)
1024 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1026 Idx node_idx
= init_nodes
.elems
[i
];
1027 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1030 if (type
!= OP_BACK_REF
)
1032 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1034 re_token_t
*clexp_node
;
1035 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1036 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1037 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1040 if (clexp_idx
== init_nodes
.nelem
)
1043 if (type
== OP_BACK_REF
)
1045 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1046 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1048 reg_errcode_t merge_err
1049 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1050 if (merge_err
!= REG_NOERROR
)
1057 /* It must be the first time to invoke acquire_state. */
1058 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1059 /* We don't check ERR here, since the initial state must not be NULL. */
1060 if (BE (dfa
->init_state
== NULL
, 0))
1062 if (dfa
->init_state
->has_constraint
)
1064 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1066 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1068 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1072 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1073 || dfa
->init_state_begbuf
== NULL
, 0))
1077 dfa
->init_state_word
= dfa
->init_state_nl
1078 = dfa
->init_state_begbuf
= dfa
->init_state
;
1080 re_node_set_free (&init_nodes
);
1084 #ifdef RE_ENABLE_I18N
1085 /* If it is possible to do searching in single byte encoding instead of UTF-8
1086 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087 DFA nodes where needed. */
1090 optimize_utf8 (re_dfa_t
*dfa
)
1094 bool mb_chars
= false;
1095 bool has_period
= false;
1097 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1098 switch (dfa
->nodes
[node
].type
)
1101 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1105 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1113 /* Word anchors etc. cannot be handled. It's okay to test
1114 opr.ctx_type since constraints (for all DFA nodes) are
1115 created by ORing one or more opr.ctx_type values. */
1125 case OP_DUP_ASTERISK
:
1126 case OP_OPEN_SUBEXP
:
1127 case OP_CLOSE_SUBEXP
:
1129 case COMPLEX_BRACKET
:
1131 case SIMPLE_BRACKET
:
1132 /* Just double check. */
1134 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1136 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1137 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1139 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1149 if (mb_chars
|| has_period
)
1150 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1152 if (dfa
->nodes
[node
].type
== CHARACTER
1153 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1154 dfa
->nodes
[node
].mb_partial
= 0;
1155 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1156 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1159 /* The search can be in single byte locale. */
1160 dfa
->mb_cur_max
= 1;
1162 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1166 /* Analyze the structure tree, and calculate "first", "next", "edest",
1167 "eclosure", and "inveclosure". */
1169 static reg_errcode_t
1170 analyze (regex_t
*preg
)
1172 re_dfa_t
*dfa
= preg
->buffer
;
1175 /* Allocate arrays. */
1176 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1177 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1178 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1179 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1180 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1181 || dfa
->eclosures
== NULL
, 0))
1184 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1185 if (dfa
->subexp_map
!= NULL
)
1188 for (i
= 0; i
< preg
->re_nsub
; i
++)
1189 dfa
->subexp_map
[i
] = i
;
1190 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1191 for (i
= 0; i
< preg
->re_nsub
; i
++)
1192 if (dfa
->subexp_map
[i
] != i
)
1194 if (i
== preg
->re_nsub
)
1196 free (dfa
->subexp_map
);
1197 dfa
->subexp_map
= NULL
;
1201 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1202 if (BE (ret
!= REG_NOERROR
, 0))
1204 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1205 if (BE (ret
!= REG_NOERROR
, 0))
1207 preorder (dfa
->str_tree
, calc_next
, dfa
);
1208 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1209 if (BE (ret
!= REG_NOERROR
, 0))
1211 ret
= calc_eclosure (dfa
);
1212 if (BE (ret
!= REG_NOERROR
, 0))
1215 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1217 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1220 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1221 if (BE (dfa
->inveclosures
== NULL
, 0))
1223 ret
= calc_inveclosure (dfa
);
1229 /* Our parse trees are very unbalanced, so we cannot use a stack to
1230 implement parse tree visits. Instead, we use parent pointers and
1231 some hairy code in these two functions. */
1232 static reg_errcode_t
1233 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1236 bin_tree_t
*node
, *prev
;
1238 for (node
= root
; ; )
1240 /* Descend down the tree, preferably to the left (or to the right
1241 if that's the only child). */
1242 while (node
->left
|| node
->right
)
1250 reg_errcode_t err
= fn (extra
, node
);
1251 if (BE (err
!= REG_NOERROR
, 0))
1253 if (node
->parent
== NULL
)
1256 node
= node
->parent
;
1258 /* Go up while we have a node that is reached from the right. */
1259 while (node
->right
== prev
|| node
->right
== NULL
);
1264 static reg_errcode_t
1265 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1270 for (node
= root
; ; )
1272 reg_errcode_t err
= fn (extra
, node
);
1273 if (BE (err
!= REG_NOERROR
, 0))
1276 /* Go to the left node, or up and to the right. */
1281 bin_tree_t
*prev
= NULL
;
1282 while (node
->right
== prev
|| node
->right
== NULL
)
1285 node
= node
->parent
;
1294 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1296 backreferences as well. Requires a preorder visit. */
1297 static reg_errcode_t
1298 optimize_subexps (void *extra
, bin_tree_t
*node
)
1300 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1302 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1304 int idx
= node
->token
.opr
.idx
;
1305 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1306 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1309 else if (node
->token
.type
== SUBEXP
1310 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1312 Idx other_idx
= node
->left
->token
.opr
.idx
;
1314 node
->left
= node
->left
->left
;
1316 node
->left
->parent
= node
;
1318 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1319 if (other_idx
< BITSET_WORD_BITS
)
1320 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1326 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1328 static reg_errcode_t
1329 lower_subexps (void *extra
, bin_tree_t
*node
)
1331 regex_t
*preg
= (regex_t
*) extra
;
1332 reg_errcode_t err
= REG_NOERROR
;
1334 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1336 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1338 node
->left
->parent
= node
;
1340 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1342 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1344 node
->right
->parent
= node
;
1351 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1353 re_dfa_t
*dfa
= preg
->buffer
;
1354 bin_tree_t
*body
= node
->left
;
1355 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1358 /* We do not optimize empty subexpressions, because otherwise we may
1359 have bad CONCAT nodes with NULL children. This is obviously not
1360 very common, so we do not lose much. An example that triggers
1361 this case is the sed "script" /\(\)/x. */
1362 && node
->left
!= NULL
1363 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1364 || !(dfa
->used_bkref_map
1365 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1368 /* Convert the SUBEXP node to the concatenation of an
1369 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1370 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1371 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1372 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1373 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1374 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1380 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1381 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1385 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1386 nodes. Requires a postorder visit. */
1387 static reg_errcode_t
1388 calc_first (void *extra
, bin_tree_t
*node
)
1390 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1391 if (node
->token
.type
== CONCAT
)
1393 node
->first
= node
->left
->first
;
1394 node
->node_idx
= node
->left
->node_idx
;
1399 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1400 if (BE (node
->node_idx
== -1, 0))
1402 if (node
->token
.type
== ANCHOR
)
1403 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1408 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1409 static reg_errcode_t
1410 calc_next (void *extra
, bin_tree_t
*node
)
1412 switch (node
->token
.type
)
1414 case OP_DUP_ASTERISK
:
1415 node
->left
->next
= node
;
1418 node
->left
->next
= node
->right
->first
;
1419 node
->right
->next
= node
->next
;
1423 node
->left
->next
= node
->next
;
1425 node
->right
->next
= node
->next
;
1431 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1432 static reg_errcode_t
1433 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1435 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1436 Idx idx
= node
->node_idx
;
1437 reg_errcode_t err
= REG_NOERROR
;
1439 switch (node
->token
.type
)
1445 assert (node
->next
== NULL
);
1448 case OP_DUP_ASTERISK
:
1452 dfa
->has_plural_match
= 1;
1453 if (node
->left
!= NULL
)
1454 left
= node
->left
->first
->node_idx
;
1456 left
= node
->next
->node_idx
;
1457 if (node
->right
!= NULL
)
1458 right
= node
->right
->first
->node_idx
;
1460 right
= node
->next
->node_idx
;
1462 assert (right
> -1);
1463 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1468 case OP_OPEN_SUBEXP
:
1469 case OP_CLOSE_SUBEXP
:
1470 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1474 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1475 if (node
->token
.type
== OP_BACK_REF
)
1476 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1480 assert (!IS_EPSILON_NODE (node
->token
.type
));
1481 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1488 /* Duplicate the epsilon closure of the node ROOT_NODE.
1489 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1490 to their own constraint. */
1492 static reg_errcode_t
1494 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1495 Idx root_node
, unsigned int init_constraint
)
1497 Idx org_node
, clone_node
;
1499 unsigned int constraint
= init_constraint
;
1500 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1502 Idx org_dest
, clone_dest
;
1503 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1505 /* If the back reference epsilon-transit, its destination must
1506 also have the constraint. Then duplicate the epsilon closure
1507 of the destination of the back reference, and store it in
1508 edests of the back reference. */
1509 org_dest
= dfa
->nexts
[org_node
];
1510 re_node_set_empty (dfa
->edests
+ clone_node
);
1511 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1512 if (BE (clone_dest
== -1, 0))
1514 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1515 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1519 else if (dfa
->edests
[org_node
].nelem
== 0)
1521 /* In case of the node can't epsilon-transit, don't duplicate the
1522 destination and store the original destination as the
1523 destination of the node. */
1524 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1527 else if (dfa
->edests
[org_node
].nelem
== 1)
1529 /* In case of the node can epsilon-transit, and it has only one
1531 org_dest
= dfa
->edests
[org_node
].elems
[0];
1532 re_node_set_empty (dfa
->edests
+ clone_node
);
1533 /* If the node is root_node itself, it means the epsilon closure
1534 has a loop. Then tie it to the destination of the root_node. */
1535 if (org_node
== root_node
&& clone_node
!= org_node
)
1537 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1542 /* In case the node has another constraint, append it. */
1543 constraint
|= dfa
->nodes
[org_node
].constraint
;
1544 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1545 if (BE (clone_dest
== -1, 0))
1547 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1551 else /* dfa->edests[org_node].nelem == 2 */
1553 /* In case of the node can epsilon-transit, and it has two
1554 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1555 org_dest
= dfa
->edests
[org_node
].elems
[0];
1556 re_node_set_empty (dfa
->edests
+ clone_node
);
1557 /* Search for a duplicated node which satisfies the constraint. */
1558 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1559 if (clone_dest
== -1)
1561 /* There is no such duplicated node, create a new one. */
1563 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1564 if (BE (clone_dest
== -1, 0))
1566 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1569 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1570 root_node
, constraint
);
1571 if (BE (err
!= REG_NOERROR
, 0))
1576 /* There is a duplicated node which satisfies the constraint,
1577 use it to avoid infinite loop. */
1578 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1583 org_dest
= dfa
->edests
[org_node
].elems
[1];
1584 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1585 if (BE (clone_dest
== -1, 0))
1587 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1591 org_node
= org_dest
;
1592 clone_node
= clone_dest
;
1597 /* Search for a node which is duplicated from the node ORG_NODE, and
1598 satisfies the constraint CONSTRAINT. */
1601 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1602 unsigned int constraint
)
1605 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1607 if (org_node
== dfa
->org_indices
[idx
]
1608 && constraint
== dfa
->nodes
[idx
].constraint
)
1609 return idx
; /* Found. */
1611 return -1; /* Not found. */
1614 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615 Return the index of the new node, or -1 if insufficient storage is
1619 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1621 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1622 if (BE (dup_idx
!= -1, 1))
1624 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1625 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1626 dfa
->nodes
[dup_idx
].duplicated
= 1;
1628 /* Store the index of the original node. */
1629 dfa
->org_indices
[dup_idx
] = org_idx
;
1634 static reg_errcode_t
1635 calc_inveclosure (re_dfa_t
*dfa
)
1639 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1640 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1642 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1644 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1645 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1647 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1656 /* Calculate "eclosure" for all the node in DFA. */
1658 static reg_errcode_t
1659 calc_eclosure (re_dfa_t
*dfa
)
1664 assert (dfa
->nodes_len
> 0);
1667 /* For each nodes, calculate epsilon closure. */
1668 for (node_idx
= 0; ; ++node_idx
)
1671 re_node_set eclosure_elem
;
1672 if (node_idx
== dfa
->nodes_len
)
1681 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1684 /* If we have already calculated, skip it. */
1685 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1687 /* Calculate epsilon closure of 'node_idx'. */
1688 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1689 if (BE (err
!= REG_NOERROR
, 0))
1692 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1695 re_node_set_free (&eclosure_elem
);
1701 /* Calculate epsilon closure of NODE. */
1703 static reg_errcode_t
1704 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1708 re_node_set eclosure
;
1710 bool incomplete
= false;
1711 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1712 if (BE (err
!= REG_NOERROR
, 0))
1715 /* This indicates that we are calculating this node now.
1716 We reference this value to avoid infinite loop. */
1717 dfa
->eclosures
[node
].nelem
= -1;
1719 /* If the current node has constraints, duplicate all nodes
1720 since they must inherit the constraints. */
1721 if (dfa
->nodes
[node
].constraint
1722 && dfa
->edests
[node
].nelem
1723 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1725 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1726 dfa
->nodes
[node
].constraint
);
1727 if (BE (err
!= REG_NOERROR
, 0))
1731 /* Expand each epsilon destination nodes. */
1732 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1733 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1735 re_node_set eclosure_elem
;
1736 Idx edest
= dfa
->edests
[node
].elems
[i
];
1737 /* If calculating the epsilon closure of 'edest' is in progress,
1738 return intermediate result. */
1739 if (dfa
->eclosures
[edest
].nelem
== -1)
1744 /* If we haven't calculated the epsilon closure of 'edest' yet,
1745 calculate now. Otherwise use calculated epsilon closure. */
1746 if (dfa
->eclosures
[edest
].nelem
== 0)
1748 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1749 if (BE (err
!= REG_NOERROR
, 0))
1753 eclosure_elem
= dfa
->eclosures
[edest
];
1754 /* Merge the epsilon closure of 'edest'. */
1755 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1756 if (BE (err
!= REG_NOERROR
, 0))
1758 /* If the epsilon closure of 'edest' is incomplete,
1759 the epsilon closure of this node is also incomplete. */
1760 if (dfa
->eclosures
[edest
].nelem
== 0)
1763 re_node_set_free (&eclosure_elem
);
1767 /* An epsilon closure includes itself. */
1768 ok
= re_node_set_insert (&eclosure
, node
);
1771 if (incomplete
&& !root
)
1772 dfa
->eclosures
[node
].nelem
= 0;
1774 dfa
->eclosures
[node
] = eclosure
;
1775 *new_set
= eclosure
;
1779 /* Functions for token which are used in the parser. */
1781 /* Fetch a token from INPUT.
1782 We must not use this function inside bracket expressions. */
1786 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1788 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1791 /* Peek a token from INPUT, and return the length of the token.
1792 We must not use this function inside bracket expressions. */
1796 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1800 if (re_string_eoi (input
))
1802 token
->type
= END_OF_RE
;
1806 c
= re_string_peek_byte (input
, 0);
1809 token
->word_char
= 0;
1810 #ifdef RE_ENABLE_I18N
1811 token
->mb_partial
= 0;
1812 if (input
->mb_cur_max
> 1 &&
1813 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1815 token
->type
= CHARACTER
;
1816 token
->mb_partial
= 1;
1823 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1825 token
->type
= BACK_SLASH
;
1829 c2
= re_string_peek_byte_case (input
, 1);
1831 token
->type
= CHARACTER
;
1832 #ifdef RE_ENABLE_I18N
1833 if (input
->mb_cur_max
> 1)
1835 wint_t wc
= re_string_wchar_at (input
,
1836 re_string_cur_idx (input
) + 1);
1837 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1841 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1846 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1847 token
->type
= OP_ALT
;
1849 case '1': case '2': case '3': case '4': case '5':
1850 case '6': case '7': case '8': case '9':
1851 if (!(syntax
& RE_NO_BK_REFS
))
1853 token
->type
= OP_BACK_REF
;
1854 token
->opr
.idx
= c2
- '1';
1858 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= ANCHOR
;
1861 token
->opr
.ctx_type
= WORD_FIRST
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1867 token
->type
= ANCHOR
;
1868 token
->opr
.ctx_type
= WORD_LAST
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1874 token
->type
= ANCHOR
;
1875 token
->opr
.ctx_type
= WORD_DELIM
;
1879 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= ANCHOR
;
1882 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1886 if (!(syntax
& RE_NO_GNU_OPS
))
1887 token
->type
= OP_WORD
;
1890 if (!(syntax
& RE_NO_GNU_OPS
))
1891 token
->type
= OP_NOTWORD
;
1894 if (!(syntax
& RE_NO_GNU_OPS
))
1895 token
->type
= OP_SPACE
;
1898 if (!(syntax
& RE_NO_GNU_OPS
))
1899 token
->type
= OP_NOTSPACE
;
1902 if (!(syntax
& RE_NO_GNU_OPS
))
1904 token
->type
= ANCHOR
;
1905 token
->opr
.ctx_type
= BUF_FIRST
;
1909 if (!(syntax
& RE_NO_GNU_OPS
))
1911 token
->type
= ANCHOR
;
1912 token
->opr
.ctx_type
= BUF_LAST
;
1916 if (!(syntax
& RE_NO_BK_PARENS
))
1917 token
->type
= OP_OPEN_SUBEXP
;
1920 if (!(syntax
& RE_NO_BK_PARENS
))
1921 token
->type
= OP_CLOSE_SUBEXP
;
1924 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1925 token
->type
= OP_DUP_PLUS
;
1928 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1929 token
->type
= OP_DUP_QUESTION
;
1932 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1933 token
->type
= OP_OPEN_DUP_NUM
;
1936 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1937 token
->type
= OP_CLOSE_DUP_NUM
;
1945 token
->type
= CHARACTER
;
1946 #ifdef RE_ENABLE_I18N
1947 if (input
->mb_cur_max
> 1)
1949 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1950 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1954 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1959 if (syntax
& RE_NEWLINE_ALT
)
1960 token
->type
= OP_ALT
;
1963 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1964 token
->type
= OP_ALT
;
1967 token
->type
= OP_DUP_ASTERISK
;
1970 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1971 token
->type
= OP_DUP_PLUS
;
1974 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1975 token
->type
= OP_DUP_QUESTION
;
1978 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1979 token
->type
= OP_OPEN_DUP_NUM
;
1982 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1983 token
->type
= OP_CLOSE_DUP_NUM
;
1986 if (syntax
& RE_NO_BK_PARENS
)
1987 token
->type
= OP_OPEN_SUBEXP
;
1990 if (syntax
& RE_NO_BK_PARENS
)
1991 token
->type
= OP_CLOSE_SUBEXP
;
1994 token
->type
= OP_OPEN_BRACKET
;
1997 token
->type
= OP_PERIOD
;
2000 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
2001 re_string_cur_idx (input
) != 0)
2003 char prev
= re_string_peek_byte (input
, -1);
2004 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
2007 token
->type
= ANCHOR
;
2008 token
->opr
.ctx_type
= LINE_FIRST
;
2011 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2012 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2015 re_string_skip_bytes (input
, 1);
2016 peek_token (&next
, input
, syntax
);
2017 re_string_skip_bytes (input
, -1);
2018 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2021 token
->type
= ANCHOR
;
2022 token
->opr
.ctx_type
= LINE_LAST
;
2030 /* Peek a token from INPUT, and return the length of the token.
2031 We must not use this function out of bracket expressions. */
2035 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2038 if (re_string_eoi (input
))
2040 token
->type
= END_OF_RE
;
2043 c
= re_string_peek_byte (input
, 0);
2046 #ifdef RE_ENABLE_I18N
2047 if (input
->mb_cur_max
> 1 &&
2048 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2050 token
->type
= CHARACTER
;
2053 #endif /* RE_ENABLE_I18N */
2055 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2056 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2058 /* In this case, '\' escape a character. */
2060 re_string_skip_bytes (input
, 1);
2061 c2
= re_string_peek_byte (input
, 0);
2063 token
->type
= CHARACTER
;
2066 if (c
== '[') /* '[' is a special char in a bracket exps. */
2070 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2071 c2
= re_string_peek_byte (input
, 1);
2079 token
->type
= OP_OPEN_COLL_ELEM
;
2083 token
->type
= OP_OPEN_EQUIV_CLASS
;
2087 if (syntax
& RE_CHAR_CLASSES
)
2089 token
->type
= OP_OPEN_CHAR_CLASS
;
2094 token
->type
= CHARACTER
;
2104 token
->type
= OP_CHARSET_RANGE
;
2107 token
->type
= OP_CLOSE_BRACKET
;
2110 token
->type
= OP_NON_MATCH_LIST
;
2113 token
->type
= CHARACTER
;
2118 /* Functions for parser. */
2120 /* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
2122 If an error occurs, ERR is set by error code, and return NULL.
2123 This function build the following tree, from regular expression <reg_exp>:
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2133 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2136 re_dfa_t
*dfa
= preg
->buffer
;
2137 bin_tree_t
*tree
, *eor
, *root
;
2138 re_token_t current_token
;
2139 dfa
->syntax
= syntax
;
2140 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2141 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2142 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2144 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2146 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2149 if (BE (eor
== NULL
|| root
== NULL
, 0))
2157 /* This function build the following tree, from regular expression
2158 <branch1>|<branch2>:
2164 ALT means alternative, which represents the operator '|'. */
2167 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2168 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2170 re_dfa_t
*dfa
= preg
->buffer
;
2171 bin_tree_t
*tree
, *branch
= NULL
;
2172 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2173 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2174 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2177 while (token
->type
== OP_ALT
)
2179 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2180 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2181 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2183 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2184 dfa
->completed_bkref_map
= initial_bkref_map
;
2185 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2186 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2189 postorder (tree
, free_tree
, NULL
);
2192 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2196 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2197 if (BE (tree
== NULL
, 0))
2206 /* This function build the following tree, from regular expression
2213 CAT means concatenation. */
2216 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2217 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2219 bin_tree_t
*tree
, *expr
;
2220 re_dfa_t
*dfa
= preg
->buffer
;
2221 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2222 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2225 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2226 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2228 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2229 if (BE (*err
!= REG_NOERROR
&& expr
== NULL
, 0))
2232 postorder (tree
, free_tree
, NULL
);
2235 if (tree
!= NULL
&& expr
!= NULL
)
2237 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2238 if (newtree
== NULL
)
2240 postorder (expr
, free_tree
, NULL
);
2241 postorder (tree
, free_tree
, NULL
);
2247 else if (tree
== NULL
)
2249 /* Otherwise expr == NULL, we don't need to create new tree. */
2254 /* This function build the following tree, from regular expression a*:
2261 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2262 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2264 re_dfa_t
*dfa
= preg
->buffer
;
2266 switch (token
->type
)
2269 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2270 if (BE (tree
== NULL
, 0))
2275 #ifdef RE_ENABLE_I18N
2276 if (dfa
->mb_cur_max
> 1)
2278 while (!re_string_eoi (regexp
)
2279 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2281 bin_tree_t
*mbc_remain
;
2282 fetch_token (token
, regexp
, syntax
);
2283 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2284 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2285 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2295 case OP_OPEN_SUBEXP
:
2296 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2297 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2301 case OP_OPEN_BRACKET
:
2302 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2303 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2308 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2313 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2314 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2315 if (BE (tree
== NULL
, 0))
2321 dfa
->has_mb_node
= 1;
2324 case OP_OPEN_DUP_NUM
:
2325 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2331 case OP_DUP_ASTERISK
:
2333 case OP_DUP_QUESTION
:
2334 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2339 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2341 fetch_token (token
, regexp
, syntax
);
2342 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2345 case OP_CLOSE_SUBEXP
:
2346 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2347 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2353 case OP_CLOSE_DUP_NUM
:
2354 /* We treat it as a normal character. */
2356 /* Then we can these characters as normal characters. */
2357 token
->type
= CHARACTER
;
2358 /* mb_partial and word_char bits should be initialized already
2360 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2361 if (BE (tree
== NULL
, 0))
2369 if ((token
->opr
.ctx_type
2370 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2371 && dfa
->word_ops_used
== 0)
2372 init_word_char (dfa
);
2373 if (token
->opr
.ctx_type
== WORD_DELIM
2374 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2376 bin_tree_t
*tree_first
, *tree_last
;
2377 if (token
->opr
.ctx_type
== WORD_DELIM
)
2379 token
->opr
.ctx_type
= WORD_FIRST
;
2380 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2381 token
->opr
.ctx_type
= WORD_LAST
;
2385 token
->opr
.ctx_type
= INSIDE_WORD
;
2386 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2387 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2389 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2390 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2391 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2399 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2400 if (BE (tree
== NULL
, 0))
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token
, regexp
, syntax
);
2414 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2415 if (BE (tree
== NULL
, 0))
2420 if (dfa
->mb_cur_max
> 1)
2421 dfa
->has_mb_node
= 1;
2426 tree
= build_charclass_op (dfa
, regexp
->trans
,
2429 token
->type
== OP_NOTWORD
, err
);
2430 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2436 tree
= build_charclass_op (dfa
, regexp
->trans
,
2439 token
->type
== OP_NOTSPACE
, err
);
2440 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2453 /* Must not happen? */
2459 fetch_token (token
, regexp
, syntax
);
2461 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2462 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2464 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2466 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2469 postorder (tree
, free_tree
, NULL
);
2473 /* In BRE consecutive duplications are not allowed. */
2474 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2475 && (token
->type
== OP_DUP_ASTERISK
2476 || token
->type
== OP_OPEN_DUP_NUM
))
2479 postorder (tree
, free_tree
, NULL
);
2488 /* This function build the following tree, from regular expression
2496 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2497 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2499 re_dfa_t
*dfa
= preg
->buffer
;
2502 cur_nsub
= preg
->re_nsub
++;
2504 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2506 /* The subexpression may be a null string. */
2507 if (token
->type
== OP_CLOSE_SUBEXP
)
2511 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2512 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2515 postorder (tree
, free_tree
, NULL
);
2518 if (BE (*err
!= REG_NOERROR
, 0))
2522 if (cur_nsub
<= '9' - '1')
2523 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2525 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2526 if (BE (tree
== NULL
, 0))
2531 tree
->token
.opr
.idx
= cur_nsub
;
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2538 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2539 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2541 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2542 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2543 re_token_t start_token
= *token
;
2545 if (token
->type
== OP_OPEN_DUP_NUM
)
2548 start
= fetch_number (regexp
, token
, syntax
);
2551 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2552 start
= 0; /* We treat "{,m}" as "{0,m}". */
2555 *err
= REG_BADBR
; /* <re>{} is invalid. */
2559 if (BE (start
!= -2, 1))
2561 /* We treat "{n}" as "{n,n}". */
2562 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2563 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2564 ? fetch_number (regexp
, token
, syntax
) : -2));
2566 if (BE (start
== -2 || end
== -2, 0))
2568 /* Invalid sequence. */
2569 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2571 if (token
->type
== END_OF_RE
)
2579 /* If the syntax bit is set, rollback. */
2580 re_string_set_index (regexp
, start_idx
);
2581 *token
= start_token
;
2582 token
->type
= CHARACTER
;
2583 /* mb_partial and word_char bits should be already initialized by
2588 if (BE ((end
!= -1 && start
> end
)
2589 || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2591 /* First number greater than second. */
2596 if (BE (RE_DUP_MAX
< (end
== -1 ? start
: end
), 0))
2604 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2605 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2608 fetch_token (token
, regexp
, syntax
);
2610 if (BE (elem
== NULL
, 0))
2612 if (BE (start
== 0 && end
== 0, 0))
2614 postorder (elem
, free_tree
, NULL
);
2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2619 if (BE (start
> 0, 0))
2622 for (i
= 2; i
<= start
; ++i
)
2624 elem
= duplicate_tree (elem
, dfa
);
2625 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2626 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2627 goto parse_dup_op_espace
;
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem
= duplicate_tree (elem
, dfa
);
2635 if (BE (elem
== NULL
, 0))
2636 goto parse_dup_op_espace
;
2642 if (elem
->token
.type
== SUBEXP
)
2644 uintptr_t subidx
= elem
->token
.opr
.idx
;
2645 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2648 tree
= create_tree (dfa
, elem
, NULL
,
2649 (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2650 if (BE (tree
== NULL
, 0))
2651 goto parse_dup_op_espace
;
2653 /* From gnulib's "intprops.h":
2654 True if the arithmetic type T is signed. */
2655 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2657 /* This loop is actually executed only when end != -1,
2658 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2659 already created the start+1-th copy. */
2660 if (TYPE_SIGNED (Idx
) || end
!= -1)
2661 for (i
= start
+ 2; i
<= end
; ++i
)
2663 elem
= duplicate_tree (elem
, dfa
);
2664 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2665 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2666 goto parse_dup_op_espace
;
2668 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2669 if (BE (tree
== NULL
, 0))
2670 goto parse_dup_op_espace
;
2674 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2678 parse_dup_op_espace
:
2683 /* Size of the names for collating symbol/equivalence_class/character_class.
2684 I'm not sure, but maybe enough. */
2685 #define BRACKET_NAME_BUF_SIZE 32
2689 # ifdef RE_ENABLE_I18N
2690 /* Convert the byte B to the corresponding wide character. In a
2691 unibyte locale, treat B as itself if it is an encoding error.
2692 In a multibyte locale, return WEOF if B is an encoding error. */
2694 parse_byte (unsigned char b
, re_charset_t
*mbcset
)
2696 wint_t wc
= __btowc (b
);
2697 return wc
== WEOF
&& !mbcset
? b
: wc
;
2701 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2702 Build the range expression which starts from START_ELEM, and ends
2703 at END_ELEM. The result are written to MBCSET and SBCSET.
2704 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2705 mbcset->range_ends, is a pointer argument since we may
2708 static reg_errcode_t
2710 # ifdef RE_ENABLE_I18N
2711 build_range_exp (const reg_syntax_t syntax
,
2713 re_charset_t
*mbcset
,
2715 const bracket_elem_t
*start_elem
,
2716 const bracket_elem_t
*end_elem
)
2717 # else /* not RE_ENABLE_I18N */
2718 build_range_exp (const reg_syntax_t syntax
,
2720 const bracket_elem_t
*start_elem
,
2721 const bracket_elem_t
*end_elem
)
2722 # endif /* not RE_ENABLE_I18N */
2724 unsigned int start_ch
, end_ch
;
2725 /* Equivalence Classes and Character Classes can't be a range start/end. */
2726 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2727 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2731 /* We can handle no multi character collating elements without libc
2733 if (BE ((start_elem
->type
== COLL_SYM
2734 && strlen ((char *) start_elem
->opr
.name
) > 1)
2735 || (end_elem
->type
== COLL_SYM
2736 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2737 return REG_ECOLLATE
;
2739 # ifdef RE_ENABLE_I18N
2745 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2746 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2748 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2749 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2751 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2752 ? parse_byte (start_ch
, mbcset
) : start_elem
->opr
.wch
);
2753 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2754 ? parse_byte (end_ch
, mbcset
) : end_elem
->opr
.wch
);
2755 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2756 return REG_ECOLLATE
;
2757 else if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_wc
> end_wc
, 0))
2760 /* Got valid collation sequence values, add them as a new entry.
2761 However, for !_LIBC we have no collation elements: if the
2762 character set is single byte, the single byte character set
2763 that we build below suffices. parse_bracket_exp passes
2764 no MBCSET if dfa->mb_cur_max == 1. */
2767 /* Check the space of the arrays. */
2768 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2770 /* There is not enough space, need realloc. */
2771 wchar_t *new_array_start
, *new_array_end
;
2774 /* +1 in case of mbcset->nranges is 0. */
2775 new_nranges
= 2 * mbcset
->nranges
+ 1;
2776 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2777 are NULL if *range_alloc == 0. */
2778 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2780 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2783 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2785 re_free (new_array_start
);
2786 re_free (new_array_end
);
2790 mbcset
->range_starts
= new_array_start
;
2791 mbcset
->range_ends
= new_array_end
;
2792 *range_alloc
= new_nranges
;
2795 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2796 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2799 /* Build the table for single byte characters. */
2800 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2802 if (start_wc
<= wc
&& wc
<= end_wc
)
2803 bitset_set (sbcset
, wc
);
2806 # else /* not RE_ENABLE_I18N */
2809 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2810 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2812 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2813 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2815 if (start_ch
> end_ch
)
2817 /* Build the table for single byte characters. */
2818 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2819 if (start_ch
<= ch
&& ch
<= end_ch
)
2820 bitset_set (sbcset
, ch
);
2822 # endif /* not RE_ENABLE_I18N */
2825 #endif /* not _LIBC */
2828 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2829 Build the collating element which is represented by NAME.
2830 The result are written to MBCSET and SBCSET.
2831 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2832 pointer argument since we may update it. */
2834 static reg_errcode_t
2836 # ifdef RE_ENABLE_I18N
2837 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2838 Idx
*coll_sym_alloc
, const unsigned char *name
)
2839 # else /* not RE_ENABLE_I18N */
2840 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2841 # endif /* not RE_ENABLE_I18N */
2843 size_t name_len
= strlen ((const char *) name
);
2844 if (BE (name_len
!= 1, 0))
2845 return REG_ECOLLATE
;
2848 bitset_set (sbcset
, name
[0]);
2852 #endif /* not _LIBC */
2854 /* This function parse bracket expression like "[abc]", "[a-c]",
2858 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2859 reg_syntax_t syntax
, reg_errcode_t
*err
)
2862 const unsigned char *collseqmb
;
2863 const char *collseqwc
;
2866 const int32_t *symb_table
;
2867 const unsigned char *extra
;
2869 /* Local function for parse_bracket_exp used in _LIBC environment.
2870 Seek the collating symbol entry corresponding to NAME.
2871 Return the index of the symbol in the SYMB_TABLE,
2872 or -1 if not found. */
2875 __attribute__ ((always_inline
))
2876 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2880 for (elem
= 0; elem
< table_size
; elem
++)
2881 if (symb_table
[2 * elem
] != 0)
2883 int32_t idx
= symb_table
[2 * elem
+ 1];
2884 /* Skip the name of collating element name. */
2885 idx
+= 1 + extra
[idx
];
2886 if (/* Compare the length of the name. */
2887 name_len
== extra
[idx
]
2888 /* Compare the name. */
2889 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2890 /* Yep, this is the entry. */
2896 /* Local function for parse_bracket_exp used in _LIBC environment.
2897 Look up the collation sequence value of BR_ELEM.
2898 Return the value if succeeded, UINT_MAX otherwise. */
2900 auto inline unsigned int
2901 __attribute__ ((always_inline
))
2902 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2904 if (br_elem
->type
== SB_CHAR
)
2907 if (MB_CUR_MAX == 1)
2910 return collseqmb
[br_elem
->opr
.ch
];
2913 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2914 return __collseq_table_lookup (collseqwc
, wc
);
2917 else if (br_elem
->type
== MB_CHAR
)
2920 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2922 else if (br_elem
->type
== COLL_SYM
)
2924 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2928 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2932 /* We found the entry. */
2933 idx
= symb_table
[2 * elem
+ 1];
2934 /* Skip the name of collating element name. */
2935 idx
+= 1 + extra
[idx
];
2936 /* Skip the byte sequence of the collating element. */
2937 idx
+= 1 + extra
[idx
];
2938 /* Adjust for the alignment. */
2939 idx
= (idx
+ 3) & ~3;
2940 /* Skip the multibyte collation sequence value. */
2941 idx
+= sizeof (unsigned int);
2942 /* Skip the wide char sequence of the collating element. */
2943 idx
+= sizeof (unsigned int) *
2944 (1 + *(unsigned int *) (extra
+ idx
));
2945 /* Return the collation sequence value. */
2946 return *(unsigned int *) (extra
+ idx
);
2948 else if (sym_name_len
== 1)
2950 /* No valid character. Match it as a single byte
2952 return collseqmb
[br_elem
->opr
.name
[0]];
2955 else if (sym_name_len
== 1)
2956 return collseqmb
[br_elem
->opr
.name
[0]];
2961 /* Local function for parse_bracket_exp used in _LIBC environment.
2962 Build the range expression which starts from START_ELEM, and ends
2963 at END_ELEM. The result are written to MBCSET and SBCSET.
2964 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2965 mbcset->range_ends, is a pointer argument since we may
2968 auto inline reg_errcode_t
2969 __attribute__ ((always_inline
))
2970 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2971 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2974 uint32_t start_collseq
;
2975 uint32_t end_collseq
;
2977 /* Equivalence Classes and Character Classes can't be a range
2979 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2980 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2984 /* FIXME: Implement rational ranges here, too. */
2985 start_collseq
= lookup_collation_sequence_value (start_elem
);
2986 end_collseq
= lookup_collation_sequence_value (end_elem
);
2987 /* Check start/end collation sequence values. */
2988 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2989 return REG_ECOLLATE
;
2990 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2993 /* Got valid collation sequence values, add them as a new entry.
2994 However, if we have no collation elements, and the character set
2995 is single byte, the single byte character set that we
2996 build below suffices. */
2997 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2999 /* Check the space of the arrays. */
3000 if (BE (*range_alloc
== mbcset
->nranges
, 0))
3002 /* There is not enough space, need realloc. */
3003 uint32_t *new_array_start
;
3004 uint32_t *new_array_end
;
3007 /* +1 in case of mbcset->nranges is 0. */
3008 new_nranges
= 2 * mbcset
->nranges
+ 1;
3009 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
3011 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
3014 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
3017 mbcset
->range_starts
= new_array_start
;
3018 mbcset
->range_ends
= new_array_end
;
3019 *range_alloc
= new_nranges
;
3022 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
3023 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
3026 /* Build the table for single byte characters. */
3027 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3029 uint32_t ch_collseq
;
3031 if (MB_CUR_MAX == 1)
3034 ch_collseq
= collseqmb
[ch
];
3036 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3037 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3038 bitset_set (sbcset
, ch
);
3043 /* Local function for parse_bracket_exp used in _LIBC environment.
3044 Build the collating element which is represented by NAME.
3045 The result are written to MBCSET and SBCSET.
3046 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3047 pointer argument since we may update it. */
3049 auto inline reg_errcode_t
3050 __attribute__ ((always_inline
))
3051 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3052 Idx
*coll_sym_alloc
, const unsigned char *name
)
3055 size_t name_len
= strlen ((const char *) name
);
3058 elem
= seek_collating_symbol_entry (name
, name_len
);
3061 /* We found the entry. */
3062 idx
= symb_table
[2 * elem
+ 1];
3063 /* Skip the name of collating element name. */
3064 idx
+= 1 + extra
[idx
];
3066 else if (name_len
== 1)
3068 /* No valid character, treat it as a normal
3070 bitset_set (sbcset
, name
[0]);
3074 return REG_ECOLLATE
;
3076 /* Got valid collation sequence, add it as a new entry. */
3077 /* Check the space of the arrays. */
3078 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3080 /* Not enough, realloc it. */
3081 /* +1 in case of mbcset->ncoll_syms is 0. */
3082 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3083 /* Use realloc since mbcset->coll_syms is NULL
3085 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3086 new_coll_sym_alloc
);
3087 if (BE (new_coll_syms
== NULL
, 0))
3089 mbcset
->coll_syms
= new_coll_syms
;
3090 *coll_sym_alloc
= new_coll_sym_alloc
;
3092 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3097 if (BE (name_len
!= 1, 0))
3098 return REG_ECOLLATE
;
3101 bitset_set (sbcset
, name
[0]);
3108 re_token_t br_token
;
3109 re_bitset_ptr_t sbcset
;
3110 #ifdef RE_ENABLE_I18N
3111 re_charset_t
*mbcset
;
3112 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3113 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3114 #endif /* not RE_ENABLE_I18N */
3115 bool non_match
= false;
3116 bin_tree_t
*work_tree
;
3118 bool first_round
= true;
3120 collseqmb
= (const unsigned char *)
3121 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3122 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3128 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3129 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3130 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3131 _NL_COLLATE_SYMB_TABLEMB
);
3132 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3133 _NL_COLLATE_SYMB_EXTRAMB
);
3136 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3137 #ifdef RE_ENABLE_I18N
3138 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3139 #endif /* RE_ENABLE_I18N */
3140 #ifdef RE_ENABLE_I18N
3141 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3143 if (BE (sbcset
== NULL
, 0))
3144 #endif /* RE_ENABLE_I18N */
3147 #ifdef RE_ENABLE_I18N
3154 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3155 if (BE (token
->type
== END_OF_RE
, 0))
3158 goto parse_bracket_exp_free_return
;
3160 if (token
->type
== OP_NON_MATCH_LIST
)
3162 #ifdef RE_ENABLE_I18N
3163 mbcset
->non_match
= 1;
3164 #endif /* not RE_ENABLE_I18N */
3166 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3167 bitset_set (sbcset
, '\n');
3168 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3169 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3170 if (BE (token
->type
== END_OF_RE
, 0))
3173 goto parse_bracket_exp_free_return
;
3177 /* We treat the first ']' as a normal character. */
3178 if (token
->type
== OP_CLOSE_BRACKET
)
3179 token
->type
= CHARACTER
;
3183 bracket_elem_t start_elem
, end_elem
;
3184 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3185 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3188 bool is_range_exp
= false;
3191 start_elem
.opr
.name
= start_name_buf
;
3192 start_elem
.type
= COLL_SYM
;
3193 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3194 syntax
, first_round
);
3195 if (BE (ret
!= REG_NOERROR
, 0))
3198 goto parse_bracket_exp_free_return
;
3200 first_round
= false;
3202 /* Get information about the next token. We need it in any case. */
3203 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3205 /* Do not check for ranges if we know they are not allowed. */
3206 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3208 if (BE (token
->type
== END_OF_RE
, 0))
3211 goto parse_bracket_exp_free_return
;
3213 if (token
->type
== OP_CHARSET_RANGE
)
3215 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3216 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3217 if (BE (token2
.type
== END_OF_RE
, 0))
3220 goto parse_bracket_exp_free_return
;
3222 if (token2
.type
== OP_CLOSE_BRACKET
)
3224 /* We treat the last '-' as a normal character. */
3225 re_string_skip_bytes (regexp
, -token_len
);
3226 token
->type
= CHARACTER
;
3229 is_range_exp
= true;
3233 if (is_range_exp
== true)
3235 end_elem
.opr
.name
= end_name_buf
;
3236 end_elem
.type
= COLL_SYM
;
3237 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3239 if (BE (ret
!= REG_NOERROR
, 0))
3242 goto parse_bracket_exp_free_return
;
3245 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3248 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3249 &start_elem
, &end_elem
);
3251 # ifdef RE_ENABLE_I18N
3252 *err
= build_range_exp (syntax
, sbcset
,
3253 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3254 &range_alloc
, &start_elem
, &end_elem
);
3256 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3258 #endif /* RE_ENABLE_I18N */
3259 if (BE (*err
!= REG_NOERROR
, 0))
3260 goto parse_bracket_exp_free_return
;
3264 switch (start_elem
.type
)
3267 bitset_set (sbcset
, start_elem
.opr
.ch
);
3269 #ifdef RE_ENABLE_I18N
3271 /* Check whether the array has enough space. */
3272 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3274 wchar_t *new_mbchars
;
3275 /* Not enough, realloc it. */
3276 /* +1 in case of mbcset->nmbchars is 0. */
3277 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3278 /* Use realloc since array is NULL if *alloc == 0. */
3279 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3281 if (BE (new_mbchars
== NULL
, 0))
3282 goto parse_bracket_exp_espace
;
3283 mbcset
->mbchars
= new_mbchars
;
3285 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3287 #endif /* RE_ENABLE_I18N */
3289 *err
= build_equiv_class (sbcset
,
3290 #ifdef RE_ENABLE_I18N
3291 mbcset
, &equiv_class_alloc
,
3292 #endif /* RE_ENABLE_I18N */
3293 start_elem
.opr
.name
);
3294 if (BE (*err
!= REG_NOERROR
, 0))
3295 goto parse_bracket_exp_free_return
;
3298 *err
= build_collating_symbol (sbcset
,
3299 #ifdef RE_ENABLE_I18N
3300 mbcset
, &coll_sym_alloc
,
3301 #endif /* RE_ENABLE_I18N */
3302 start_elem
.opr
.name
);
3303 if (BE (*err
!= REG_NOERROR
, 0))
3304 goto parse_bracket_exp_free_return
;
3307 *err
= build_charclass (regexp
->trans
, sbcset
,
3308 #ifdef RE_ENABLE_I18N
3309 mbcset
, &char_class_alloc
,
3310 #endif /* RE_ENABLE_I18N */
3311 (const char *) start_elem
.opr
.name
,
3313 if (BE (*err
!= REG_NOERROR
, 0))
3314 goto parse_bracket_exp_free_return
;
3321 if (BE (token
->type
== END_OF_RE
, 0))
3324 goto parse_bracket_exp_free_return
;
3326 if (token
->type
== OP_CLOSE_BRACKET
)
3330 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3332 /* If it is non-matching list. */
3334 bitset_not (sbcset
);
3336 #ifdef RE_ENABLE_I18N
3337 /* Ensure only single byte characters are set. */
3338 if (dfa
->mb_cur_max
> 1)
3339 bitset_mask (sbcset
, dfa
->sb_char
);
3341 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3342 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3343 || mbcset
->non_match
)))
3345 bin_tree_t
*mbc_tree
;
3347 /* Build a tree for complex bracket. */
3348 dfa
->has_mb_node
= 1;
3349 br_token
.type
= COMPLEX_BRACKET
;
3350 br_token
.opr
.mbcset
= mbcset
;
3351 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3352 if (BE (mbc_tree
== NULL
, 0))
3353 goto parse_bracket_exp_espace
;
3354 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3355 if (sbcset
[sbc_idx
])
3357 /* If there are no bits set in sbcset, there is no point
3358 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3359 if (sbc_idx
< BITSET_WORDS
)
3361 /* Build a tree for simple bracket. */
3362 br_token
.type
= SIMPLE_BRACKET
;
3363 br_token
.opr
.sbcset
= sbcset
;
3364 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3365 if (BE (work_tree
== NULL
, 0))
3366 goto parse_bracket_exp_espace
;
3368 /* Then join them by ALT node. */
3369 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3370 if (BE (work_tree
== NULL
, 0))
3371 goto parse_bracket_exp_espace
;
3376 work_tree
= mbc_tree
;
3380 #endif /* not RE_ENABLE_I18N */
3382 #ifdef RE_ENABLE_I18N
3383 free_charset (mbcset
);
3385 /* Build a tree for simple bracket. */
3386 br_token
.type
= SIMPLE_BRACKET
;
3387 br_token
.opr
.sbcset
= sbcset
;
3388 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3389 if (BE (work_tree
== NULL
, 0))
3390 goto parse_bracket_exp_espace
;
3394 parse_bracket_exp_espace
:
3396 parse_bracket_exp_free_return
:
3398 #ifdef RE_ENABLE_I18N
3399 free_charset (mbcset
);
3400 #endif /* RE_ENABLE_I18N */
3404 /* Parse an element in the bracket expression. */
3406 static reg_errcode_t
3407 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3408 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3409 reg_syntax_t syntax
, bool accept_hyphen
)
3411 #ifdef RE_ENABLE_I18N
3413 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3414 if (cur_char_size
> 1)
3416 elem
->type
= MB_CHAR
;
3417 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3418 re_string_skip_bytes (regexp
, cur_char_size
);
3421 #endif /* RE_ENABLE_I18N */
3422 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3423 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3424 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3425 return parse_bracket_symbol (elem
, regexp
, token
);
3426 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3428 /* A '-' must only appear as anything but a range indicator before
3429 the closing bracket. Everything else is an error. */
3431 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3432 if (token2
.type
!= OP_CLOSE_BRACKET
)
3433 /* The actual error value is not standardized since this whole
3434 case is undefined. But ERANGE makes good sense. */
3437 elem
->type
= SB_CHAR
;
3438 elem
->opr
.ch
= token
->opr
.c
;
3442 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3443 such as [:<character_class>:], [.<collating_element>.], and
3444 [=<equivalent_class>=]. */
3446 static reg_errcode_t
3447 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3450 unsigned char ch
, delim
= token
->opr
.c
;
3452 if (re_string_eoi(regexp
))
3456 if (i
>= BRACKET_NAME_BUF_SIZE
)
3458 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3459 ch
= re_string_fetch_byte_case (regexp
);
3461 ch
= re_string_fetch_byte (regexp
);
3462 if (re_string_eoi(regexp
))
3464 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3466 elem
->opr
.name
[i
] = ch
;
3468 re_string_skip_bytes (regexp
, 1);
3469 elem
->opr
.name
[i
] = '\0';
3470 switch (token
->type
)
3472 case OP_OPEN_COLL_ELEM
:
3473 elem
->type
= COLL_SYM
;
3475 case OP_OPEN_EQUIV_CLASS
:
3476 elem
->type
= EQUIV_CLASS
;
3478 case OP_OPEN_CHAR_CLASS
:
3479 elem
->type
= CHAR_CLASS
;
3487 /* Helper function for parse_bracket_exp.
3488 Build the equivalence class which is represented by NAME.
3489 The result are written to MBCSET and SBCSET.
3490 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3491 is a pointer argument since we may update it. */
3493 static reg_errcode_t
3494 #ifdef RE_ENABLE_I18N
3495 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3496 Idx
*equiv_class_alloc
, const unsigned char *name
)
3497 #else /* not RE_ENABLE_I18N */
3498 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3499 #endif /* not RE_ENABLE_I18N */
3502 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3505 const int32_t *table
, *indirect
;
3506 const unsigned char *weights
, *extra
, *cp
;
3507 unsigned char char_buf
[2];
3511 /* Calculate the index for equivalence class. */
3513 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3514 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3515 _NL_COLLATE_WEIGHTMB
);
3516 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3517 _NL_COLLATE_EXTRAMB
);
3518 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3519 _NL_COLLATE_INDIRECTMB
);
3520 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3521 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3522 /* This isn't a valid character. */
3523 return REG_ECOLLATE
;
3525 /* Build single byte matching table for this equivalence class. */
3526 len
= weights
[idx1
& 0xffffff];
3527 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3531 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3536 /* This isn't a valid character. */
3538 /* Compare only if the length matches and the collation rule
3539 index is the same. */
3540 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3544 while (cnt
<= len
&&
3545 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3546 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3550 bitset_set (sbcset
, ch
);
3553 /* Check whether the array has enough space. */
3554 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3556 /* Not enough, realloc it. */
3557 /* +1 in case of mbcset->nequiv_classes is 0. */
3558 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3559 /* Use realloc since the array is NULL if *alloc == 0. */
3560 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3562 new_equiv_class_alloc
);
3563 if (BE (new_equiv_classes
== NULL
, 0))
3565 mbcset
->equiv_classes
= new_equiv_classes
;
3566 *equiv_class_alloc
= new_equiv_class_alloc
;
3568 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3573 if (BE (strlen ((const char *) name
) != 1, 0))
3574 return REG_ECOLLATE
;
3575 bitset_set (sbcset
, *name
);
3580 /* Helper function for parse_bracket_exp.
3581 Build the character class which is represented by NAME.
3582 The result are written to MBCSET and SBCSET.
3583 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3584 is a pointer argument since we may update it. */
3586 static reg_errcode_t
3587 #ifdef RE_ENABLE_I18N
3588 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3589 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3590 const char *class_name
, reg_syntax_t syntax
)
3591 #else /* not RE_ENABLE_I18N */
3592 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3593 const char *class_name
, reg_syntax_t syntax
)
3594 #endif /* not RE_ENABLE_I18N */
3597 const char *name
= class_name
;
3599 /* In case of REG_ICASE "upper" and "lower" match the both of
3600 upper and lower cases. */
3601 if ((syntax
& RE_ICASE
)
3602 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3605 #ifdef RE_ENABLE_I18N
3606 /* Check the space of the arrays. */
3607 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3609 /* Not enough, realloc it. */
3610 /* +1 in case of mbcset->nchar_classes is 0. */
3611 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3612 /* Use realloc since array is NULL if *alloc == 0. */
3613 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3614 new_char_class_alloc
);
3615 if (BE (new_char_classes
== NULL
, 0))
3617 mbcset
->char_classes
= new_char_classes
;
3618 *char_class_alloc
= new_char_class_alloc
;
3620 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3621 #endif /* RE_ENABLE_I18N */
3623 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3625 if (BE (trans != NULL, 0)) \
3627 for (i = 0; i < SBC_MAX; ++i) \
3628 if (ctype_func (i)) \
3629 bitset_set (sbcset, trans[i]); \
3633 for (i = 0; i < SBC_MAX; ++i) \
3634 if (ctype_func (i)) \
3635 bitset_set (sbcset, i); \
3639 if (strcmp (name
, "alnum") == 0)
3640 BUILD_CHARCLASS_LOOP (isalnum
);
3641 else if (strcmp (name
, "cntrl") == 0)
3642 BUILD_CHARCLASS_LOOP (iscntrl
);
3643 else if (strcmp (name
, "lower") == 0)
3644 BUILD_CHARCLASS_LOOP (islower
);
3645 else if (strcmp (name
, "space") == 0)
3646 BUILD_CHARCLASS_LOOP (isspace
);
3647 else if (strcmp (name
, "alpha") == 0)
3648 BUILD_CHARCLASS_LOOP (isalpha
);
3649 else if (strcmp (name
, "digit") == 0)
3650 BUILD_CHARCLASS_LOOP (isdigit
);
3651 else if (strcmp (name
, "print") == 0)
3652 BUILD_CHARCLASS_LOOP (isprint
);
3653 else if (strcmp (name
, "upper") == 0)
3654 BUILD_CHARCLASS_LOOP (isupper
);
3655 else if (strcmp (name
, "blank") == 0)
3656 BUILD_CHARCLASS_LOOP (isblank
);
3657 else if (strcmp (name
, "graph") == 0)
3658 BUILD_CHARCLASS_LOOP (isgraph
);
3659 else if (strcmp (name
, "punct") == 0)
3660 BUILD_CHARCLASS_LOOP (ispunct
);
3661 else if (strcmp (name
, "xdigit") == 0)
3662 BUILD_CHARCLASS_LOOP (isxdigit
);
3670 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3671 const char *class_name
,
3672 const char *extra
, bool non_match
,
3675 re_bitset_ptr_t sbcset
;
3676 #ifdef RE_ENABLE_I18N
3677 re_charset_t
*mbcset
;
3679 #endif /* not RE_ENABLE_I18N */
3681 re_token_t br_token
;
3684 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3685 if (BE (sbcset
== NULL
, 0))
3690 #ifdef RE_ENABLE_I18N
3691 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3692 if (BE (mbcset
== NULL
, 0))
3698 mbcset
->non_match
= non_match
;
3699 #endif /* RE_ENABLE_I18N */
3701 /* We don't care the syntax in this case. */
3702 ret
= build_charclass (trans
, sbcset
,
3703 #ifdef RE_ENABLE_I18N
3705 #endif /* RE_ENABLE_I18N */
3708 if (BE (ret
!= REG_NOERROR
, 0))
3711 #ifdef RE_ENABLE_I18N
3712 free_charset (mbcset
);
3713 #endif /* RE_ENABLE_I18N */
3717 /* \w match '_' also. */
3718 for (; *extra
; extra
++)
3719 bitset_set (sbcset
, *extra
);
3721 /* If it is non-matching list. */
3723 bitset_not (sbcset
);
3725 #ifdef RE_ENABLE_I18N
3726 /* Ensure only single byte characters are set. */
3727 if (dfa
->mb_cur_max
> 1)
3728 bitset_mask (sbcset
, dfa
->sb_char
);
3731 /* Build a tree for simple bracket. */
3732 #if defined GCC_LINT || defined lint
3733 memset (&br_token
, 0, sizeof br_token
);
3735 br_token
.type
= SIMPLE_BRACKET
;
3736 br_token
.opr
.sbcset
= sbcset
;
3737 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3738 if (BE (tree
== NULL
, 0))
3739 goto build_word_op_espace
;
3741 #ifdef RE_ENABLE_I18N
3742 if (dfa
->mb_cur_max
> 1)
3744 bin_tree_t
*mbc_tree
;
3745 /* Build a tree for complex bracket. */
3746 br_token
.type
= COMPLEX_BRACKET
;
3747 br_token
.opr
.mbcset
= mbcset
;
3748 dfa
->has_mb_node
= 1;
3749 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3750 if (BE (mbc_tree
== NULL
, 0))
3751 goto build_word_op_espace
;
3752 /* Then join them by ALT node. */
3753 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3754 if (BE (mbc_tree
!= NULL
, 1))
3759 free_charset (mbcset
);
3762 #else /* not RE_ENABLE_I18N */
3764 #endif /* not RE_ENABLE_I18N */
3766 build_word_op_espace
:
3768 #ifdef RE_ENABLE_I18N
3769 free_charset (mbcset
);
3770 #endif /* RE_ENABLE_I18N */
3775 /* This is intended for the expressions like "a{1,3}".
3776 Fetch a number from 'input', and return the number.
3777 Return -1 if the number field is empty like "{,1}".
3778 Return RE_DUP_MAX + 1 if the number field is too large.
3779 Return -2 if an error occurred. */
3782 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3788 fetch_token (token
, input
, syntax
);
3790 if (BE (token
->type
== END_OF_RE
, 0))
3792 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3794 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3798 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3803 #ifdef RE_ENABLE_I18N
3805 free_charset (re_charset_t
*cset
)
3807 re_free (cset
->mbchars
);
3809 re_free (cset
->coll_syms
);
3810 re_free (cset
->equiv_classes
);
3811 re_free (cset
->range_starts
);
3812 re_free (cset
->range_ends
);
3814 re_free (cset
->char_classes
);
3817 #endif /* RE_ENABLE_I18N */
3819 /* Functions for binary tree operation. */
3821 /* Create a tree node. */
3824 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3825 re_token_type_t type
)
3828 #if defined GCC_LINT || defined lint
3829 memset (&t
, 0, sizeof t
);
3832 return create_token_tree (dfa
, left
, right
, &t
);
3836 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3837 const re_token_t
*token
)
3840 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3842 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3844 if (storage
== NULL
)
3846 storage
->next
= dfa
->str_tree_storage
;
3847 dfa
->str_tree_storage
= storage
;
3848 dfa
->str_tree_storage_idx
= 0;
3850 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3852 tree
->parent
= NULL
;
3854 tree
->right
= right
;
3855 tree
->token
= *token
;
3856 tree
->token
.duplicated
= 0;
3857 tree
->token
.opt_subexp
= 0;
3860 tree
->node_idx
= -1;
3863 left
->parent
= tree
;
3865 right
->parent
= tree
;
3869 /* Mark the tree SRC as an optional subexpression.
3870 To be called from preorder or postorder. */
3872 static reg_errcode_t
3873 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3875 Idx idx
= (uintptr_t) extra
;
3876 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3877 node
->token
.opt_subexp
= 1;
3882 /* Free the allocated memory inside NODE. */
3885 free_token (re_token_t
*node
)
3887 #ifdef RE_ENABLE_I18N
3888 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3889 free_charset (node
->opr
.mbcset
);
3891 #endif /* RE_ENABLE_I18N */
3892 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3893 re_free (node
->opr
.sbcset
);
3896 /* Worker function for tree walking. Free the allocated memory inside NODE
3897 and its children. */
3899 static reg_errcode_t
3900 free_tree (void *extra
, bin_tree_t
*node
)
3902 free_token (&node
->token
);
3907 /* Duplicate the node SRC, and return new node. This is a preorder
3908 visit similar to the one implemented by the generic visitor, but
3909 we need more infrastructure to maintain two parallel trees --- so,
3910 it's easier to duplicate. */
3913 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3915 const bin_tree_t
*node
;
3916 bin_tree_t
*dup_root
;
3917 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3919 for (node
= root
; ; )
3921 /* Create a new tree and link it back to the current parent. */
3922 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3925 (*p_new
)->parent
= dup_node
;
3926 (*p_new
)->token
.duplicated
= 1;
3929 /* Go to the left node, or up and to the right. */
3933 p_new
= &dup_node
->left
;
3937 const bin_tree_t
*prev
= NULL
;
3938 while (node
->right
== prev
|| node
->right
== NULL
)
3941 node
= node
->parent
;
3942 dup_node
= dup_node
->parent
;
3947 p_new
= &dup_node
->right
;