1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
21 size_t length
, reg_syntax_t syntax
);
22 static void re_compile_fastmap_iter (regex_t
*bufp
,
23 const re_dfastate_t
*init_state
,
25 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void free_charset (re_charset_t
*cset
);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t
*preg
);
30 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
32 static void optimize_utf8 (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze (regex_t
*preg
);
35 static reg_errcode_t
preorder (bin_tree_t
*root
,
36 reg_errcode_t (fn (void *, bin_tree_t
*)),
38 static reg_errcode_t
postorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
42 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
43 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
45 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
48 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
49 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
50 unsigned int constraint
);
51 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
52 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
54 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
55 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
57 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 reg_syntax_t syntax
) internal_function
;
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 Idx nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 Idx nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 Idx nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 Idx nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
89 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
91 Idx
*equiv_class_alloc
,
92 const unsigned char *name
);
93 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
96 Idx
*char_class_alloc
,
97 const char *class_name
,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
101 const unsigned char *name
);
102 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
104 const char *class_name
,
105 reg_syntax_t syntax
);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
108 RE_TRANSLATE_TYPE trans
,
109 const char *class_name
,
111 bool non_match
, reg_errcode_t
*err
);
112 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
113 bin_tree_t
*left
, bin_tree_t
*right
,
114 re_token_type_t type
);
115 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
116 bin_tree_t
*left
, bin_tree_t
*right
,
117 const re_token_t
*token
);
118 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
119 static void free_token (re_token_t
*node
);
120 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
121 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid
[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx
[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern
, length
, bufp
)
217 struct re_pattern_buffer
*bufp
;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern
, size_t length
,
221 struct re_pattern_buffer
*bufp
)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
231 /* Match anchors at newline. */
232 bufp
->newline_anchor
= 1;
234 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
238 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
241 weak_alias (__re_compile_pattern
, re_compile_pattern
)
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options
;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax
)
263 reg_syntax_t ret
= re_syntax_options
;
265 re_syntax_options
= syntax
;
269 weak_alias (__re_set_syntax
, re_set_syntax
)
273 re_compile_fastmap (bufp
)
274 struct re_pattern_buffer
*bufp
;
276 re_dfa_t
*dfa
= bufp
->buffer
;
277 char *fastmap
= bufp
->fastmap
;
279 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
280 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_word
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
283 if (dfa
->init_state
!= dfa
->init_state_nl
)
284 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
285 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
286 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
287 bufp
->fastmap_accurate
= 1;
291 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
295 __attribute__ ((always_inline
))
296 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
300 fastmap
[tolower (ch
)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
310 re_dfa_t
*dfa
= bufp
->buffer
;
312 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
313 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
315 Idx node
= init_state
->nodes
.elems
[node_cnt
];
316 re_token_type_t type
= dfa
->nodes
[node
].type
;
318 if (type
== CHARACTER
)
320 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
324 unsigned char buf
[MB_LEN_MAX
];
330 *p
++ = dfa
->nodes
[node
].opr
.c
;
331 while (++node
< dfa
->nodes_len
332 && dfa
->nodes
[node
].type
== CHARACTER
333 && dfa
->nodes
[node
].mb_partial
)
334 *p
++ = dfa
->nodes
[node
].opr
.c
;
335 memset (&state
, '\0', sizeof (state
));
336 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
338 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
340 re_set_fastmap (fastmap
, false, buf
[0]);
344 else if (type
== SIMPLE_BRACKET
)
347 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
350 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
351 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
352 if (w
& ((bitset_word_t
) 1 << j
))
353 re_set_fastmap (fastmap
, icase
, ch
);
356 #ifdef RE_ENABLE_I18N
357 else if (type
== COMPLEX_BRACKET
)
359 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
363 /* See if we have to try all bytes which start multiple collation
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
370 && (cset
->ncoll_syms
|| cset
->nranges
))
372 const int32_t *table
= (const int32_t *)
373 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
374 for (i
= 0; i
< SBC_MAX
; ++i
)
376 re_set_fastmap (fastmap
, icase
, i
);
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa
->mb_cur_max
> 1
385 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
387 || cset
->nequiv_classes
395 memset (&mbs
, 0, sizeof (mbs
));
396 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
397 re_set_fastmap (fastmap
, false, (int) c
);
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i
= 0; i
< cset
->nmbchars
; ++i
)
409 memset (&state
, '\0', sizeof (state
));
410 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
411 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
412 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
414 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
416 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
421 #endif /* RE_ENABLE_I18N */
422 else if (type
== OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type
== OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type
== END_OF_RE
)
428 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
429 if (type
== END_OF_RE
)
430 bufp
->can_be_null
= 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg
, pattern
, cflags
)
474 regex_t
*_Restrict_ preg
;
475 const char *_Restrict_ pattern
;
479 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC
);
486 /* Try to allocate space for the fastmap. */
487 preg
->fastmap
= re_malloc (char, SBC_MAX
);
488 if (BE (preg
->fastmap
== NULL
, 0))
491 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags
& REG_NEWLINE
)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax
&= ~RE_DOT_NEWLINE
;
497 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
498 /* It also changes the matching behavior. */
499 preg
->newline_anchor
= 1;
502 preg
->newline_anchor
= 0;
503 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
504 preg
->translate
= NULL
;
506 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret
== REG_ERPAREN
)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret
== REG_NOERROR
, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg
);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg
->fastmap
);
522 preg
->fastmap
= NULL
;
528 weak_alias (__regcomp
, regcomp
)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
536 regerror (errcode
, preg
, errbuf
, errbuf_size
)
538 const regex_t
*_Restrict_ preg
;
539 char *_Restrict_ errbuf
;
541 #else /* size_t might promote */
543 regerror (int errcode
, const regex_t
*_Restrict_ preg
,
544 char *_Restrict_ errbuf
, size_t errbuf_size
)
551 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
552 / sizeof (__re_error_msgid_idx
[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
559 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
561 msg_size
= strlen (msg
) + 1; /* Includes the null. */
563 if (BE (errbuf_size
!= 0, 1))
565 size_t cpy_size
= msg_size
;
566 if (BE (msg_size
> errbuf_size
, 0))
568 cpy_size
= errbuf_size
- 1;
569 errbuf
[cpy_size
] = '\0';
571 memcpy (errbuf
, msg
, cpy_size
);
577 weak_alias (__regerror
, regerror
)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map
=
588 /* Set the first 128 bits. */
589 # if defined __GNUC__ && !defined __STRICT_ANSI__
590 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
602 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
604 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
611 free_dfa_content (re_dfa_t
*dfa
)
616 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
617 free_token (dfa
->nodes
+ i
);
618 re_free (dfa
->nexts
);
619 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
621 if (dfa
->eclosures
!= NULL
)
622 re_node_set_free (dfa
->eclosures
+ i
);
623 if (dfa
->inveclosures
!= NULL
)
624 re_node_set_free (dfa
->inveclosures
+ i
);
625 if (dfa
->edests
!= NULL
)
626 re_node_set_free (dfa
->edests
+ i
);
628 re_free (dfa
->edests
);
629 re_free (dfa
->eclosures
);
630 re_free (dfa
->inveclosures
);
631 re_free (dfa
->nodes
);
633 if (dfa
->state_table
)
634 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
636 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
637 for (j
= 0; j
< entry
->num
; ++j
)
639 re_dfastate_t
*state
= entry
->array
[j
];
642 re_free (entry
->array
);
644 re_free (dfa
->state_table
);
645 #ifdef RE_ENABLE_I18N
646 if (dfa
->sb_char
!= utf8_sb_map
)
647 re_free (dfa
->sb_char
);
649 re_free (dfa
->subexp_map
);
651 re_free (dfa
->re_str
);
658 /* Free dynamically allocated space used by PREG. */
664 re_dfa_t
*dfa
= preg
->buffer
;
665 if (BE (dfa
!= NULL
, 1))
667 lock_fini (dfa
->lock
);
668 free_dfa_content (dfa
);
673 re_free (preg
->fastmap
);
674 preg
->fastmap
= NULL
;
676 re_free (preg
->translate
);
677 preg
->translate
= NULL
;
680 weak_alias (__regfree
, regfree
)
683 /* Entry points compatible with 4.2 BSD regex library. We don't define
684 them unless specifically requested. */
686 #if defined _REGEX_RE_COMP || defined _LIBC
688 /* BSD has one and only one pattern buffer. */
689 static struct re_pattern_buffer re_comp_buf
;
693 /* Make these definitions weak in libc, so POSIX programs can redefine
694 these names if they don't use our functions, and still use
695 regcomp/regexec above without link errors. */
706 if (!re_comp_buf
.buffer
)
707 return gettext ("No previous regular expression");
711 if (re_comp_buf
.buffer
)
713 fastmap
= re_comp_buf
.fastmap
;
714 re_comp_buf
.fastmap
= NULL
;
715 __regfree (&re_comp_buf
);
716 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
717 re_comp_buf
.fastmap
= fastmap
;
720 if (re_comp_buf
.fastmap
== NULL
)
722 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
723 if (re_comp_buf
.fastmap
== NULL
)
724 return (char *) gettext (__re_error_msgid
725 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
728 /* Since 're_exec' always passes NULL for the 'regs' argument, we
729 don't need to initialize the pattern buffer fields which affect it. */
731 /* Match anchors at newlines. */
732 re_comp_buf
.newline_anchor
= 1;
734 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
739 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
740 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
744 libc_freeres_fn (free_mem
)
746 __regfree (&re_comp_buf
);
750 #endif /* _REGEX_RE_COMP */
752 /* Internal entry point.
753 Compile the regular expression PATTERN, whose length is LENGTH.
754 SYNTAX indicate regular expression's syntax. */
757 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
760 reg_errcode_t err
= REG_NOERROR
;
764 /* Initialize the pattern buffer. */
765 preg
->fastmap_accurate
= 0;
766 preg
->syntax
= syntax
;
767 preg
->not_bol
= preg
->not_eol
= 0;
770 preg
->can_be_null
= 0;
771 preg
->regs_allocated
= REGS_UNALLOCATED
;
773 /* Initialize the dfa. */
775 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
777 /* If zero allocated, but buffer is non-null, try to realloc
778 enough space. This loses if buffer's address is bogus, but
779 that is the user's responsibility. If ->buffer is NULL this
780 is a simple allocation. */
781 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
784 preg
->allocated
= sizeof (re_dfa_t
);
787 preg
->used
= sizeof (re_dfa_t
);
789 err
= init_dfa (dfa
, length
);
790 if (BE (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0, 0))
792 if (BE (err
!= REG_NOERROR
, 0))
794 free_dfa_content (dfa
);
800 /* Note: length+1 will not overflow since it is checked in init_dfa. */
801 dfa
->re_str
= re_malloc (char, length
+ 1);
802 strncpy (dfa
->re_str
, pattern
, length
+ 1);
805 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
806 (syntax
& RE_ICASE
) != 0, dfa
);
807 if (BE (err
!= REG_NOERROR
, 0))
809 re_compile_internal_free_return
:
810 free_workarea_compile (preg
);
811 re_string_destruct (®exp
);
812 lock_fini (dfa
->lock
);
813 free_dfa_content (dfa
);
819 /* Parse the regular expression, and build a structure tree. */
821 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
822 if (BE (dfa
->str_tree
== NULL
, 0))
823 goto re_compile_internal_free_return
;
825 /* Analyze the tree and create the nfa. */
826 err
= analyze (preg
);
827 if (BE (err
!= REG_NOERROR
, 0))
828 goto re_compile_internal_free_return
;
830 #ifdef RE_ENABLE_I18N
831 /* If possible, do searching in single byte encoding to speed things up. */
832 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
836 /* Then create the initial state of the dfa. */
837 err
= create_initial_state (dfa
);
839 /* Release work areas. */
840 free_workarea_compile (preg
);
841 re_string_destruct (®exp
);
843 if (BE (err
!= REG_NOERROR
, 0))
845 lock_fini (dfa
->lock
);
846 free_dfa_content (dfa
);
854 /* Initialize DFA. We use the length of the regular expression PAT_LEN
855 as the initial length of some arrays. */
858 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
860 __re_size_t table_size
;
862 const char *codeset_name
;
864 #ifdef RE_ENABLE_I18N
865 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
867 size_t max_i18n_object_size
= 0;
869 size_t max_object_size
=
870 MAX (sizeof (struct re_state_table_entry
),
871 MAX (sizeof (re_token_t
),
872 MAX (sizeof (re_node_set
),
873 MAX (sizeof (regmatch_t
),
874 max_i18n_object_size
))));
876 memset (dfa
, '\0', sizeof (re_dfa_t
));
878 /* Force allocation of str_tree_storage the first time. */
879 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
881 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
882 calculation below, and for similar doubling calculations
883 elsewhere. And it's <= rather than <, because some of the
884 doubling calculations add 1 afterwards. */
885 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2 <= pat_len
, 0))
888 dfa
->nodes_alloc
= pat_len
+ 1;
889 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
891 /* table_size = 2 ^ ceil(log pat_len) */
892 for (table_size
= 1; ; table_size
<<= 1)
893 if (table_size
> pat_len
)
896 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
897 dfa
->state_hash_mask
= table_size
- 1;
899 dfa
->mb_cur_max
= MB_CUR_MAX
;
901 if (dfa
->mb_cur_max
== 6
902 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
904 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
907 codeset_name
= nl_langinfo (CODESET
);
908 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
909 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
910 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
911 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
914 /* We check exhaustively in the loop below if this charset is a
915 superset of ASCII. */
916 dfa
->map_notascii
= 0;
919 #ifdef RE_ENABLE_I18N
920 if (dfa
->mb_cur_max
> 1)
923 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
928 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
929 if (BE (dfa
->sb_char
== NULL
, 0))
932 /* Set the bits corresponding to single byte chars. */
933 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
934 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
936 wint_t wch
= __btowc (ch
);
938 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
940 if (isascii (ch
) && wch
!= ch
)
941 dfa
->map_notascii
= 1;
948 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
953 /* Initialize WORD_CHAR table, which indicate which character is
954 "word". In this case "word" means that it is the word construction
955 character used by some operators like "\<", "\>", etc. */
959 init_word_char (re_dfa_t
*dfa
)
964 dfa
->word_ops_used
= 1;
965 if (BE (dfa
->map_notascii
== 0, 1))
967 bitset_word_t bits0
= 0x00000000;
968 bitset_word_t bits1
= 0x03ff0000;
969 bitset_word_t bits2
= 0x87fffffe;
970 bitset_word_t bits3
= 0x07fffffe;
971 if (BITSET_WORD_BITS
== 64)
973 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
974 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
977 else if (BITSET_WORD_BITS
== 32)
979 dfa
->word_char
[0] = bits0
;
980 dfa
->word_char
[1] = bits1
;
981 dfa
->word_char
[2] = bits2
;
982 dfa
->word_char
[3] = bits3
;
989 if (BE (dfa
->is_utf8
, 1))
991 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
997 for (; i
< BITSET_WORDS
; ++i
)
998 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
999 if (isalnum (ch
) || ch
== '_')
1000 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
1003 /* Free the work area which are only used while compiling. */
1006 free_workarea_compile (regex_t
*preg
)
1008 re_dfa_t
*dfa
= preg
->buffer
;
1009 bin_tree_storage_t
*storage
, *next
;
1010 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
1012 next
= storage
->next
;
1015 dfa
->str_tree_storage
= NULL
;
1016 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
1017 dfa
->str_tree
= NULL
;
1018 re_free (dfa
->org_indices
);
1019 dfa
->org_indices
= NULL
;
1022 /* Create initial states for all contexts. */
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t
*dfa
)
1029 re_node_set init_nodes
;
1031 /* Initial states have the epsilon closure of the node which is
1032 the first node of the regular expression. */
1033 first
= dfa
->str_tree
->first
->node_idx
;
1034 dfa
->init_node
= first
;
1035 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1036 if (BE (err
!= REG_NOERROR
, 0))
1039 /* The back-references which are in initial states can epsilon transit,
1040 since in this case all of the subexpressions can be null.
1041 Then we add epsilon closures of the nodes which are the next nodes of
1042 the back-references. */
1043 if (dfa
->nbackref
> 0)
1044 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1046 Idx node_idx
= init_nodes
.elems
[i
];
1047 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1050 if (type
!= OP_BACK_REF
)
1052 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1054 re_token_t
*clexp_node
;
1055 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1056 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1057 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1060 if (clexp_idx
== init_nodes
.nelem
)
1063 if (type
== OP_BACK_REF
)
1065 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1066 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1068 reg_errcode_t merge_err
1069 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1070 if (merge_err
!= REG_NOERROR
)
1077 /* It must be the first time to invoke acquire_state. */
1078 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1079 /* We don't check ERR here, since the initial state must not be NULL. */
1080 if (BE (dfa
->init_state
== NULL
, 0))
1082 if (dfa
->init_state
->has_constraint
)
1084 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1086 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1088 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1092 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1093 || dfa
->init_state_begbuf
== NULL
, 0))
1097 dfa
->init_state_word
= dfa
->init_state_nl
1098 = dfa
->init_state_begbuf
= dfa
->init_state
;
1100 re_node_set_free (&init_nodes
);
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107 DFA nodes where needed. */
1110 optimize_utf8 (re_dfa_t
*dfa
)
1114 bool mb_chars
= false;
1115 bool has_period
= false;
1117 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1118 switch (dfa
->nodes
[node
].type
)
1121 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1125 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1133 /* Word anchors etc. cannot be handled. It's okay to test
1134 opr.ctx_type since constraints (for all DFA nodes) are
1135 created by ORing one or more opr.ctx_type values. */
1145 case OP_DUP_ASTERISK
:
1146 case OP_OPEN_SUBEXP
:
1147 case OP_CLOSE_SUBEXP
:
1149 case COMPLEX_BRACKET
:
1151 case SIMPLE_BRACKET
:
1152 /* Just double check. */
1154 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1156 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1157 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1159 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1169 if (mb_chars
|| has_period
)
1170 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1172 if (dfa
->nodes
[node
].type
== CHARACTER
1173 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1174 dfa
->nodes
[node
].mb_partial
= 0;
1175 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1176 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1179 /* The search can be in single byte locale. */
1180 dfa
->mb_cur_max
= 1;
1182 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187 "eclosure", and "inveclosure". */
1189 static reg_errcode_t
1190 analyze (regex_t
*preg
)
1192 re_dfa_t
*dfa
= preg
->buffer
;
1195 /* Allocate arrays. */
1196 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1197 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1198 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1199 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1200 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1201 || dfa
->eclosures
== NULL
, 0))
1204 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1205 if (dfa
->subexp_map
!= NULL
)
1208 for (i
= 0; i
< preg
->re_nsub
; i
++)
1209 dfa
->subexp_map
[i
] = i
;
1210 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1211 for (i
= 0; i
< preg
->re_nsub
; i
++)
1212 if (dfa
->subexp_map
[i
] != i
)
1214 if (i
== preg
->re_nsub
)
1216 free (dfa
->subexp_map
);
1217 dfa
->subexp_map
= NULL
;
1221 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1222 if (BE (ret
!= REG_NOERROR
, 0))
1224 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1225 if (BE (ret
!= REG_NOERROR
, 0))
1227 preorder (dfa
->str_tree
, calc_next
, dfa
);
1228 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1229 if (BE (ret
!= REG_NOERROR
, 0))
1231 ret
= calc_eclosure (dfa
);
1232 if (BE (ret
!= REG_NOERROR
, 0))
1235 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1237 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1240 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1241 if (BE (dfa
->inveclosures
== NULL
, 0))
1243 ret
= calc_inveclosure (dfa
);
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250 implement parse tree visits. Instead, we use parent pointers and
1251 some hairy code in these two functions. */
1252 static reg_errcode_t
1253 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1256 bin_tree_t
*node
, *prev
;
1258 for (node
= root
; ; )
1260 /* Descend down the tree, preferably to the left (or to the right
1261 if that's the only child). */
1262 while (node
->left
|| node
->right
)
1270 reg_errcode_t err
= fn (extra
, node
);
1271 if (BE (err
!= REG_NOERROR
, 0))
1273 if (node
->parent
== NULL
)
1276 node
= node
->parent
;
1278 /* Go up while we have a node that is reached from the right. */
1279 while (node
->right
== prev
|| node
->right
== NULL
);
1284 static reg_errcode_t
1285 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1290 for (node
= root
; ; )
1292 reg_errcode_t err
= fn (extra
, node
);
1293 if (BE (err
!= REG_NOERROR
, 0))
1296 /* Go to the left node, or up and to the right. */
1301 bin_tree_t
*prev
= NULL
;
1302 while (node
->right
== prev
|| node
->right
== NULL
)
1305 node
= node
->parent
;
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1316 backreferences as well. Requires a preorder visit. */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra
, bin_tree_t
*node
)
1320 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1322 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1324 int idx
= node
->token
.opr
.idx
;
1325 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1326 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1329 else if (node
->token
.type
== SUBEXP
1330 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1332 Idx other_idx
= node
->left
->token
.opr
.idx
;
1334 node
->left
= node
->left
->left
;
1336 node
->left
->parent
= node
;
1338 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1339 if (other_idx
< BITSET_WORD_BITS
)
1340 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1348 static reg_errcode_t
1349 lower_subexps (void *extra
, bin_tree_t
*node
)
1351 regex_t
*preg
= (regex_t
*) extra
;
1352 reg_errcode_t err
= REG_NOERROR
;
1354 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1356 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1358 node
->left
->parent
= node
;
1360 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1362 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1364 node
->right
->parent
= node
;
1371 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1373 re_dfa_t
*dfa
= preg
->buffer
;
1374 bin_tree_t
*body
= node
->left
;
1375 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1378 /* We do not optimize empty subexpressions, because otherwise we may
1379 have bad CONCAT nodes with NULL children. This is obviously not
1380 very common, so we do not lose much. An example that triggers
1381 this case is the sed "script" /\(\)/x. */
1382 && node
->left
!= NULL
1383 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1384 || !(dfa
->used_bkref_map
1385 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1388 /* Convert the SUBEXP node to the concatenation of an
1389 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1390 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1391 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1392 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1393 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1394 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1400 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1401 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406 nodes. Requires a postorder visit. */
1407 static reg_errcode_t
1408 calc_first (void *extra
, bin_tree_t
*node
)
1410 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1411 if (node
->token
.type
== CONCAT
)
1413 node
->first
= node
->left
->first
;
1414 node
->node_idx
= node
->left
->node_idx
;
1419 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1420 if (BE (node
->node_idx
== REG_MISSING
, 0))
1422 if (node
->token
.type
== ANCHOR
)
1423 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1428 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1429 static reg_errcode_t
1430 calc_next (void *extra
, bin_tree_t
*node
)
1432 switch (node
->token
.type
)
1434 case OP_DUP_ASTERISK
:
1435 node
->left
->next
= node
;
1438 node
->left
->next
= node
->right
->first
;
1439 node
->right
->next
= node
->next
;
1443 node
->left
->next
= node
->next
;
1445 node
->right
->next
= node
->next
;
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1455 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1456 Idx idx
= node
->node_idx
;
1457 reg_errcode_t err
= REG_NOERROR
;
1459 switch (node
->token
.type
)
1465 assert (node
->next
== NULL
);
1468 case OP_DUP_ASTERISK
:
1472 dfa
->has_plural_match
= 1;
1473 if (node
->left
!= NULL
)
1474 left
= node
->left
->first
->node_idx
;
1476 left
= node
->next
->node_idx
;
1477 if (node
->right
!= NULL
)
1478 right
= node
->right
->first
->node_idx
;
1480 right
= node
->next
->node_idx
;
1481 assert (REG_VALID_INDEX (left
));
1482 assert (REG_VALID_INDEX (right
));
1483 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1488 case OP_OPEN_SUBEXP
:
1489 case OP_CLOSE_SUBEXP
:
1490 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1494 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1495 if (node
->token
.type
== OP_BACK_REF
)
1496 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1500 assert (!IS_EPSILON_NODE (node
->token
.type
));
1501 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510 to their own constraint. */
1512 static reg_errcode_t
1514 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1515 Idx root_node
, unsigned int init_constraint
)
1517 Idx org_node
, clone_node
;
1519 unsigned int constraint
= init_constraint
;
1520 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1522 Idx org_dest
, clone_dest
;
1523 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1525 /* If the back reference epsilon-transit, its destination must
1526 also have the constraint. Then duplicate the epsilon closure
1527 of the destination of the back reference, and store it in
1528 edests of the back reference. */
1529 org_dest
= dfa
->nexts
[org_node
];
1530 re_node_set_empty (dfa
->edests
+ clone_node
);
1531 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1532 if (BE (clone_dest
== REG_MISSING
, 0))
1534 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1535 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1539 else if (dfa
->edests
[org_node
].nelem
== 0)
1541 /* In case of the node can't epsilon-transit, don't duplicate the
1542 destination and store the original destination as the
1543 destination of the node. */
1544 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1547 else if (dfa
->edests
[org_node
].nelem
== 1)
1549 /* In case of the node can epsilon-transit, and it has only one
1551 org_dest
= dfa
->edests
[org_node
].elems
[0];
1552 re_node_set_empty (dfa
->edests
+ clone_node
);
1553 /* If the node is root_node itself, it means the epsilon closure
1554 has a loop. Then tie it to the destination of the root_node. */
1555 if (org_node
== root_node
&& clone_node
!= org_node
)
1557 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1562 /* In case the node has another constraint, append it. */
1563 constraint
|= dfa
->nodes
[org_node
].constraint
;
1564 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1565 if (BE (clone_dest
== REG_MISSING
, 0))
1567 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1571 else /* dfa->edests[org_node].nelem == 2 */
1573 /* In case of the node can epsilon-transit, and it has two
1574 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1575 org_dest
= dfa
->edests
[org_node
].elems
[0];
1576 re_node_set_empty (dfa
->edests
+ clone_node
);
1577 /* Search for a duplicated node which satisfies the constraint. */
1578 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1579 if (clone_dest
== REG_MISSING
)
1581 /* There is no such duplicated node, create a new one. */
1583 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1584 if (BE (clone_dest
== REG_MISSING
, 0))
1586 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1589 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1590 root_node
, constraint
);
1591 if (BE (err
!= REG_NOERROR
, 0))
1596 /* There is a duplicated node which satisfies the constraint,
1597 use it to avoid infinite loop. */
1598 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1603 org_dest
= dfa
->edests
[org_node
].elems
[1];
1604 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1605 if (BE (clone_dest
== REG_MISSING
, 0))
1607 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1611 org_node
= org_dest
;
1612 clone_node
= clone_dest
;
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618 satisfies the constraint CONSTRAINT. */
1621 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1622 unsigned int constraint
)
1625 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1627 if (org_node
== dfa
->org_indices
[idx
]
1628 && constraint
== dfa
->nodes
[idx
].constraint
)
1629 return idx
; /* Found. */
1631 return REG_MISSING
; /* Not found. */
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635 Return the index of the new node, or REG_MISSING if insufficient storage is
1639 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1641 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1642 if (BE (dup_idx
!= REG_MISSING
, 1))
1644 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1645 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1646 dfa
->nodes
[dup_idx
].duplicated
= 1;
1648 /* Store the index of the original node. */
1649 dfa
->org_indices
[dup_idx
] = org_idx
;
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t
*dfa
)
1659 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1660 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1662 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1664 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1665 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1667 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1676 /* Calculate "eclosure" for all the node in DFA. */
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t
*dfa
)
1684 assert (dfa
->nodes_len
> 0);
1687 /* For each nodes, calculate epsilon closure. */
1688 for (node_idx
= 0; ; ++node_idx
)
1691 re_node_set eclosure_elem
;
1692 if (node_idx
== dfa
->nodes_len
)
1701 assert (dfa
->eclosures
[node_idx
].nelem
!= REG_MISSING
);
1704 /* If we have already calculated, skip it. */
1705 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1707 /* Calculate epsilon closure of 'node_idx'. */
1708 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1709 if (BE (err
!= REG_NOERROR
, 0))
1712 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1715 re_node_set_free (&eclosure_elem
);
1721 /* Calculate epsilon closure of NODE. */
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1728 re_node_set eclosure
;
1730 bool incomplete
= false;
1731 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1732 if (BE (err
!= REG_NOERROR
, 0))
1735 /* This indicates that we are calculating this node now.
1736 We reference this value to avoid infinite loop. */
1737 dfa
->eclosures
[node
].nelem
= REG_MISSING
;
1739 /* If the current node has constraints, duplicate all nodes
1740 since they must inherit the constraints. */
1741 if (dfa
->nodes
[node
].constraint
1742 && dfa
->edests
[node
].nelem
1743 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1745 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1746 dfa
->nodes
[node
].constraint
);
1747 if (BE (err
!= REG_NOERROR
, 0))
1751 /* Expand each epsilon destination nodes. */
1752 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1753 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1755 re_node_set eclosure_elem
;
1756 Idx edest
= dfa
->edests
[node
].elems
[i
];
1757 /* If calculating the epsilon closure of 'edest' is in progress,
1758 return intermediate result. */
1759 if (dfa
->eclosures
[edest
].nelem
== REG_MISSING
)
1764 /* If we haven't calculated the epsilon closure of 'edest' yet,
1765 calculate now. Otherwise use calculated epsilon closure. */
1766 if (dfa
->eclosures
[edest
].nelem
== 0)
1768 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1769 if (BE (err
!= REG_NOERROR
, 0))
1773 eclosure_elem
= dfa
->eclosures
[edest
];
1774 /* Merge the epsilon closure of 'edest'. */
1775 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1776 if (BE (err
!= REG_NOERROR
, 0))
1778 /* If the epsilon closure of 'edest' is incomplete,
1779 the epsilon closure of this node is also incomplete. */
1780 if (dfa
->eclosures
[edest
].nelem
== 0)
1783 re_node_set_free (&eclosure_elem
);
1787 /* An epsilon closure includes itself. */
1788 ok
= re_node_set_insert (&eclosure
, node
);
1791 if (incomplete
&& !root
)
1792 dfa
->eclosures
[node
].nelem
= 0;
1794 dfa
->eclosures
[node
] = eclosure
;
1795 *new_set
= eclosure
;
1799 /* Functions for token which are used in the parser. */
1801 /* Fetch a token from INPUT.
1802 We must not use this function inside bracket expressions. */
1806 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1808 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1811 /* Peek a token from INPUT, and return the length of the token.
1812 We must not use this function inside bracket expressions. */
1816 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1820 if (re_string_eoi (input
))
1822 token
->type
= END_OF_RE
;
1826 c
= re_string_peek_byte (input
, 0);
1829 token
->word_char
= 0;
1830 #ifdef RE_ENABLE_I18N
1831 token
->mb_partial
= 0;
1832 if (input
->mb_cur_max
> 1 &&
1833 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1835 token
->type
= CHARACTER
;
1836 token
->mb_partial
= 1;
1843 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1845 token
->type
= BACK_SLASH
;
1849 c2
= re_string_peek_byte_case (input
, 1);
1851 token
->type
= CHARACTER
;
1852 #ifdef RE_ENABLE_I18N
1853 if (input
->mb_cur_max
> 1)
1855 wint_t wc
= re_string_wchar_at (input
,
1856 re_string_cur_idx (input
) + 1);
1857 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1861 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1866 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1867 token
->type
= OP_ALT
;
1869 case '1': case '2': case '3': case '4': case '5':
1870 case '6': case '7': case '8': case '9':
1871 if (!(syntax
& RE_NO_BK_REFS
))
1873 token
->type
= OP_BACK_REF
;
1874 token
->opr
.idx
= c2
- '1';
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1880 token
->type
= ANCHOR
;
1881 token
->opr
.ctx_type
= WORD_FIRST
;
1885 if (!(syntax
& RE_NO_GNU_OPS
))
1887 token
->type
= ANCHOR
;
1888 token
->opr
.ctx_type
= WORD_LAST
;
1892 if (!(syntax
& RE_NO_GNU_OPS
))
1894 token
->type
= ANCHOR
;
1895 token
->opr
.ctx_type
= WORD_DELIM
;
1899 if (!(syntax
& RE_NO_GNU_OPS
))
1901 token
->type
= ANCHOR
;
1902 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1906 if (!(syntax
& RE_NO_GNU_OPS
))
1907 token
->type
= OP_WORD
;
1910 if (!(syntax
& RE_NO_GNU_OPS
))
1911 token
->type
= OP_NOTWORD
;
1914 if (!(syntax
& RE_NO_GNU_OPS
))
1915 token
->type
= OP_SPACE
;
1918 if (!(syntax
& RE_NO_GNU_OPS
))
1919 token
->type
= OP_NOTSPACE
;
1922 if (!(syntax
& RE_NO_GNU_OPS
))
1924 token
->type
= ANCHOR
;
1925 token
->opr
.ctx_type
= BUF_FIRST
;
1929 if (!(syntax
& RE_NO_GNU_OPS
))
1931 token
->type
= ANCHOR
;
1932 token
->opr
.ctx_type
= BUF_LAST
;
1936 if (!(syntax
& RE_NO_BK_PARENS
))
1937 token
->type
= OP_OPEN_SUBEXP
;
1940 if (!(syntax
& RE_NO_BK_PARENS
))
1941 token
->type
= OP_CLOSE_SUBEXP
;
1944 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1945 token
->type
= OP_DUP_PLUS
;
1948 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1949 token
->type
= OP_DUP_QUESTION
;
1952 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1953 token
->type
= OP_OPEN_DUP_NUM
;
1956 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1957 token
->type
= OP_CLOSE_DUP_NUM
;
1965 token
->type
= CHARACTER
;
1966 #ifdef RE_ENABLE_I18N
1967 if (input
->mb_cur_max
> 1)
1969 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1970 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1974 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1979 if (syntax
& RE_NEWLINE_ALT
)
1980 token
->type
= OP_ALT
;
1983 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1984 token
->type
= OP_ALT
;
1987 token
->type
= OP_DUP_ASTERISK
;
1990 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1991 token
->type
= OP_DUP_PLUS
;
1994 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1995 token
->type
= OP_DUP_QUESTION
;
1998 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1999 token
->type
= OP_OPEN_DUP_NUM
;
2002 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
2003 token
->type
= OP_CLOSE_DUP_NUM
;
2006 if (syntax
& RE_NO_BK_PARENS
)
2007 token
->type
= OP_OPEN_SUBEXP
;
2010 if (syntax
& RE_NO_BK_PARENS
)
2011 token
->type
= OP_CLOSE_SUBEXP
;
2014 token
->type
= OP_OPEN_BRACKET
;
2017 token
->type
= OP_PERIOD
;
2020 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
2021 re_string_cur_idx (input
) != 0)
2023 char prev
= re_string_peek_byte (input
, -1);
2024 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
2027 token
->type
= ANCHOR
;
2028 token
->opr
.ctx_type
= LINE_FIRST
;
2031 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2032 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2035 re_string_skip_bytes (input
, 1);
2036 peek_token (&next
, input
, syntax
);
2037 re_string_skip_bytes (input
, -1);
2038 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2041 token
->type
= ANCHOR
;
2042 token
->opr
.ctx_type
= LINE_LAST
;
2050 /* Peek a token from INPUT, and return the length of the token.
2051 We must not use this function out of bracket expressions. */
2055 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2058 if (re_string_eoi (input
))
2060 token
->type
= END_OF_RE
;
2063 c
= re_string_peek_byte (input
, 0);
2066 #ifdef RE_ENABLE_I18N
2067 if (input
->mb_cur_max
> 1 &&
2068 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2070 token
->type
= CHARACTER
;
2073 #endif /* RE_ENABLE_I18N */
2075 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2076 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2078 /* In this case, '\' escape a character. */
2080 re_string_skip_bytes (input
, 1);
2081 c2
= re_string_peek_byte (input
, 0);
2083 token
->type
= CHARACTER
;
2086 if (c
== '[') /* '[' is a special char in a bracket exps. */
2090 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2091 c2
= re_string_peek_byte (input
, 1);
2099 token
->type
= OP_OPEN_COLL_ELEM
;
2102 token
->type
= OP_OPEN_EQUIV_CLASS
;
2105 if (syntax
& RE_CHAR_CLASSES
)
2107 token
->type
= OP_OPEN_CHAR_CLASS
;
2110 /* else fall through. */
2112 token
->type
= CHARACTER
;
2122 token
->type
= OP_CHARSET_RANGE
;
2125 token
->type
= OP_CLOSE_BRACKET
;
2128 token
->type
= OP_NON_MATCH_LIST
;
2131 token
->type
= CHARACTER
;
2136 /* Functions for parser. */
2138 /* Entry point of the parser.
2139 Parse the regular expression REGEXP and return the structure tree.
2140 If an error occurs, ERR is set by error code, and return NULL.
2141 This function build the following tree, from regular expression <reg_exp>:
2147 CAT means concatenation.
2148 EOR means end of regular expression. */
2151 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2154 re_dfa_t
*dfa
= preg
->buffer
;
2155 bin_tree_t
*tree
, *eor
, *root
;
2156 re_token_t current_token
;
2157 dfa
->syntax
= syntax
;
2158 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2159 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2160 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2162 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2164 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2167 if (BE (eor
== NULL
|| root
== NULL
, 0))
2175 /* This function build the following tree, from regular expression
2176 <branch1>|<branch2>:
2182 ALT means alternative, which represents the operator '|'. */
2185 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2186 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2188 re_dfa_t
*dfa
= preg
->buffer
;
2189 bin_tree_t
*tree
, *branch
= NULL
;
2190 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2191 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2194 while (token
->type
== OP_ALT
)
2196 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2197 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2198 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2200 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2201 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2206 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2207 if (BE (tree
== NULL
, 0))
2216 /* This function build the following tree, from regular expression
2223 CAT means concatenation. */
2226 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2227 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2229 bin_tree_t
*tree
, *expr
;
2230 re_dfa_t
*dfa
= preg
->buffer
;
2231 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2232 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2235 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2236 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2238 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2239 if (BE (*err
!= REG_NOERROR
&& expr
== NULL
, 0))
2242 postorder (tree
, free_tree
, NULL
);
2245 if (tree
!= NULL
&& expr
!= NULL
)
2247 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2248 if (newtree
== NULL
)
2250 postorder (expr
, free_tree
, NULL
);
2251 postorder (tree
, free_tree
, NULL
);
2257 else if (tree
== NULL
)
2259 /* Otherwise expr == NULL, we don't need to create new tree. */
2264 /* This function build the following tree, from regular expression a*:
2271 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2272 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2274 re_dfa_t
*dfa
= preg
->buffer
;
2276 switch (token
->type
)
2279 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2280 if (BE (tree
== NULL
, 0))
2285 #ifdef RE_ENABLE_I18N
2286 if (dfa
->mb_cur_max
> 1)
2288 while (!re_string_eoi (regexp
)
2289 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2291 bin_tree_t
*mbc_remain
;
2292 fetch_token (token
, regexp
, syntax
);
2293 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2294 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2295 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2304 case OP_OPEN_SUBEXP
:
2305 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2306 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2309 case OP_OPEN_BRACKET
:
2310 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2311 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2315 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2320 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2321 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2322 if (BE (tree
== NULL
, 0))
2328 dfa
->has_mb_node
= 1;
2330 case OP_OPEN_DUP_NUM
:
2331 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2337 case OP_DUP_ASTERISK
:
2339 case OP_DUP_QUESTION
:
2340 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2345 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2347 fetch_token (token
, regexp
, syntax
);
2348 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2350 /* else fall through */
2351 case OP_CLOSE_SUBEXP
:
2352 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2353 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2358 /* else fall through */
2359 case OP_CLOSE_DUP_NUM
:
2360 /* We treat it as a normal character. */
2362 /* Then we can these characters as normal characters. */
2363 token
->type
= CHARACTER
;
2364 /* mb_partial and word_char bits should be initialized already
2366 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2367 if (BE (tree
== NULL
, 0))
2374 if ((token
->opr
.ctx_type
2375 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2376 && dfa
->word_ops_used
== 0)
2377 init_word_char (dfa
);
2378 if (token
->opr
.ctx_type
== WORD_DELIM
2379 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2381 bin_tree_t
*tree_first
, *tree_last
;
2382 if (token
->opr
.ctx_type
== WORD_DELIM
)
2384 token
->opr
.ctx_type
= WORD_FIRST
;
2385 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2386 token
->opr
.ctx_type
= WORD_LAST
;
2390 token
->opr
.ctx_type
= INSIDE_WORD
;
2391 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2392 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2394 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2395 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2396 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2404 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2405 if (BE (tree
== NULL
, 0))
2411 /* We must return here, since ANCHORs can't be followed
2412 by repetition operators.
2413 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2415 fetch_token (token
, regexp
, syntax
);
2418 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2419 if (BE (tree
== NULL
, 0))
2424 if (dfa
->mb_cur_max
> 1)
2425 dfa
->has_mb_node
= 1;
2429 tree
= build_charclass_op (dfa
, regexp
->trans
,
2432 token
->type
== OP_NOTWORD
, err
);
2433 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2438 tree
= build_charclass_op (dfa
, regexp
->trans
,
2441 token
->type
== OP_NOTSPACE
, err
);
2442 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2452 /* Must not happen? */
2458 fetch_token (token
, regexp
, syntax
);
2460 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2461 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2463 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2464 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2466 /* In BRE consecutive duplications are not allowed. */
2467 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2468 && (token
->type
== OP_DUP_ASTERISK
2469 || token
->type
== OP_OPEN_DUP_NUM
))
2479 /* This function build the following tree, from regular expression
2487 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2488 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2490 re_dfa_t
*dfa
= preg
->buffer
;
2493 cur_nsub
= preg
->re_nsub
++;
2495 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2497 /* The subexpression may be a null string. */
2498 if (token
->type
== OP_CLOSE_SUBEXP
)
2502 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2503 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2506 postorder (tree
, free_tree
, NULL
);
2509 if (BE (*err
!= REG_NOERROR
, 0))
2513 if (cur_nsub
<= '9' - '1')
2514 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2516 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2517 if (BE (tree
== NULL
, 0))
2522 tree
->token
.opr
.idx
= cur_nsub
;
2526 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2529 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2530 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2532 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2533 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2534 re_token_t start_token
= *token
;
2536 if (token
->type
== OP_OPEN_DUP_NUM
)
2539 start
= fetch_number (regexp
, token
, syntax
);
2540 if (start
== REG_MISSING
)
2542 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2543 start
= 0; /* We treat "{,m}" as "{0,m}". */
2546 *err
= REG_BADBR
; /* <re>{} is invalid. */
2550 if (BE (start
!= REG_ERROR
, 1))
2552 /* We treat "{n}" as "{n,n}". */
2553 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2554 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2555 ? fetch_number (regexp
, token
, syntax
) : REG_ERROR
));
2557 if (BE (start
== REG_ERROR
|| end
== REG_ERROR
, 0))
2559 /* Invalid sequence. */
2560 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2562 if (token
->type
== END_OF_RE
)
2570 /* If the syntax bit is set, rollback. */
2571 re_string_set_index (regexp
, start_idx
);
2572 *token
= start_token
;
2573 token
->type
= CHARACTER
;
2574 /* mb_partial and word_char bits should be already initialized by
2579 if (BE ((end
!= REG_MISSING
&& start
> end
)
2580 || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2582 /* First number greater than second. */
2587 if (BE (RE_DUP_MAX
< (end
== REG_MISSING
? start
: end
), 0))
2595 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2596 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : REG_MISSING
;
2599 fetch_token (token
, regexp
, syntax
);
2601 if (BE (elem
== NULL
, 0))
2603 if (BE (start
== 0 && end
== 0, 0))
2605 postorder (elem
, free_tree
, NULL
);
2609 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2610 if (BE (start
> 0, 0))
2613 for (i
= 2; i
<= start
; ++i
)
2615 elem
= duplicate_tree (elem
, dfa
);
2616 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2617 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2618 goto parse_dup_op_espace
;
2624 /* Duplicate ELEM before it is marked optional. */
2625 elem
= duplicate_tree (elem
, dfa
);
2631 if (elem
->token
.type
== SUBEXP
)
2633 uintptr_t subidx
= elem
->token
.opr
.idx
;
2634 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2637 tree
= create_tree (dfa
, elem
, NULL
,
2638 (end
== REG_MISSING
? OP_DUP_ASTERISK
: OP_ALT
));
2639 if (BE (tree
== NULL
, 0))
2640 goto parse_dup_op_espace
;
2642 /* From gnulib's "intprops.h":
2643 True if the arithmetic type T is signed. */
2644 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2646 /* This loop is actually executed only when end != REG_MISSING,
2647 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2648 already created the start+1-th copy. */
2649 if (TYPE_SIGNED (Idx
) || end
!= REG_MISSING
)
2650 for (i
= start
+ 2; i
<= end
; ++i
)
2652 elem
= duplicate_tree (elem
, dfa
);
2653 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2654 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2655 goto parse_dup_op_espace
;
2657 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2658 if (BE (tree
== NULL
, 0))
2659 goto parse_dup_op_espace
;
2663 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2667 parse_dup_op_espace
:
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673 I'm not sure, but maybe enough. */
2674 #define BRACKET_NAME_BUF_SIZE 32
2677 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678 Build the range expression which starts from START_ELEM, and ends
2679 at END_ELEM. The result are written to MBCSET and SBCSET.
2680 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681 mbcset->range_ends, is a pointer argument since we may
2684 static reg_errcode_t
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (const reg_syntax_t syntax
,
2689 re_charset_t
*mbcset
,
2691 const bracket_elem_t
*start_elem
,
2692 const bracket_elem_t
*end_elem
)
2693 # else /* not RE_ENABLE_I18N */
2694 build_range_exp (const reg_syntax_t syntax
,
2696 const bracket_elem_t
*start_elem
,
2697 const bracket_elem_t
*end_elem
)
2698 # endif /* not RE_ENABLE_I18N */
2700 unsigned int start_ch
, end_ch
;
2701 /* Equivalence Classes and Character Classes can't be a range start/end. */
2702 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2703 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2707 /* We can handle no multi character collating elements without libc
2709 if (BE ((start_elem
->type
== COLL_SYM
2710 && strlen ((char *) start_elem
->opr
.name
) > 1)
2711 || (end_elem
->type
== COLL_SYM
2712 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2713 return REG_ECOLLATE
;
2715 # ifdef RE_ENABLE_I18N
2721 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2722 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2724 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2725 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2727 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2728 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2729 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2730 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2731 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2732 return REG_ECOLLATE
;
2733 else if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_wc
> end_wc
, 0))
2736 /* Got valid collation sequence values, add them as a new entry.
2737 However, for !_LIBC we have no collation elements: if the
2738 character set is single byte, the single byte character set
2739 that we build below suffices. parse_bracket_exp passes
2740 no MBCSET if dfa->mb_cur_max == 1. */
2743 /* Check the space of the arrays. */
2744 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2746 /* There is not enough space, need realloc. */
2747 wchar_t *new_array_start
, *new_array_end
;
2750 /* +1 in case of mbcset->nranges is 0. */
2751 new_nranges
= 2 * mbcset
->nranges
+ 1;
2752 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753 are NULL if *range_alloc == 0. */
2754 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2756 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2759 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2762 mbcset
->range_starts
= new_array_start
;
2763 mbcset
->range_ends
= new_array_end
;
2764 *range_alloc
= new_nranges
;
2767 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2768 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2771 /* Build the table for single byte characters. */
2772 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2774 if (start_wc
<= wc
&& wc
<= end_wc
)
2775 bitset_set (sbcset
, wc
);
2778 # else /* not RE_ENABLE_I18N */
2781 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2782 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2784 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2785 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2787 if (start_ch
> end_ch
)
2789 /* Build the table for single byte characters. */
2790 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2791 if (start_ch
<= ch
&& ch
<= end_ch
)
2792 bitset_set (sbcset
, ch
);
2794 # endif /* not RE_ENABLE_I18N */
2797 #endif /* not _LIBC */
2800 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801 Build the collating element which is represented by NAME.
2802 The result are written to MBCSET and SBCSET.
2803 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804 pointer argument since we may update it. */
2806 static reg_errcode_t
2808 # ifdef RE_ENABLE_I18N
2809 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2810 Idx
*coll_sym_alloc
, const unsigned char *name
)
2811 # else /* not RE_ENABLE_I18N */
2812 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2813 # endif /* not RE_ENABLE_I18N */
2815 size_t name_len
= strlen ((const char *) name
);
2816 if (BE (name_len
!= 1, 0))
2817 return REG_ECOLLATE
;
2820 bitset_set (sbcset
, name
[0]);
2824 #endif /* not _LIBC */
2826 /* This function parse bracket expression like "[abc]", "[a-c]",
2830 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2831 reg_syntax_t syntax
, reg_errcode_t
*err
)
2834 const unsigned char *collseqmb
;
2835 const char *collseqwc
;
2838 const int32_t *symb_table
;
2839 const unsigned char *extra
;
2841 /* Local function for parse_bracket_exp used in _LIBC environment.
2842 Seek the collating symbol entry corresponding to NAME.
2843 Return the index of the symbol in the SYMB_TABLE,
2844 or -1 if not found. */
2847 __attribute__ ((always_inline
))
2848 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2852 for (elem
= 0; elem
< table_size
; elem
++)
2853 if (symb_table
[2 * elem
] != 0)
2855 int32_t idx
= symb_table
[2 * elem
+ 1];
2856 /* Skip the name of collating element name. */
2857 idx
+= 1 + extra
[idx
];
2858 if (/* Compare the length of the name. */
2859 name_len
== extra
[idx
]
2860 /* Compare the name. */
2861 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2862 /* Yep, this is the entry. */
2868 /* Local function for parse_bracket_exp used in _LIBC environment.
2869 Look up the collation sequence value of BR_ELEM.
2870 Return the value if succeeded, UINT_MAX otherwise. */
2872 auto inline unsigned int
2873 __attribute__ ((always_inline
))
2874 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2876 if (br_elem
->type
== SB_CHAR
)
2879 if (MB_CUR_MAX == 1)
2882 return collseqmb
[br_elem
->opr
.ch
];
2885 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2886 return __collseq_table_lookup (collseqwc
, wc
);
2889 else if (br_elem
->type
== MB_CHAR
)
2892 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2894 else if (br_elem
->type
== COLL_SYM
)
2896 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2900 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2904 /* We found the entry. */
2905 idx
= symb_table
[2 * elem
+ 1];
2906 /* Skip the name of collating element name. */
2907 idx
+= 1 + extra
[idx
];
2908 /* Skip the byte sequence of the collating element. */
2909 idx
+= 1 + extra
[idx
];
2910 /* Adjust for the alignment. */
2911 idx
= (idx
+ 3) & ~3;
2912 /* Skip the multibyte collation sequence value. */
2913 idx
+= sizeof (unsigned int);
2914 /* Skip the wide char sequence of the collating element. */
2915 idx
+= sizeof (unsigned int) *
2916 (1 + *(unsigned int *) (extra
+ idx
));
2917 /* Return the collation sequence value. */
2918 return *(unsigned int *) (extra
+ idx
);
2920 else if (sym_name_len
== 1)
2922 /* No valid character. Match it as a single byte
2924 return collseqmb
[br_elem
->opr
.name
[0]];
2927 else if (sym_name_len
== 1)
2928 return collseqmb
[br_elem
->opr
.name
[0]];
2933 /* Local function for parse_bracket_exp used in _LIBC environment.
2934 Build the range expression which starts from START_ELEM, and ends
2935 at END_ELEM. The result are written to MBCSET and SBCSET.
2936 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937 mbcset->range_ends, is a pointer argument since we may
2940 auto inline reg_errcode_t
2941 __attribute__ ((always_inline
))
2942 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2943 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2946 uint32_t start_collseq
;
2947 uint32_t end_collseq
;
2949 /* Equivalence Classes and Character Classes can't be a range
2951 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2952 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2956 /* FIXME: Implement rational ranges here, too. */
2957 start_collseq
= lookup_collation_sequence_value (start_elem
);
2958 end_collseq
= lookup_collation_sequence_value (end_elem
);
2959 /* Check start/end collation sequence values. */
2960 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2961 return REG_ECOLLATE
;
2962 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2965 /* Got valid collation sequence values, add them as a new entry.
2966 However, if we have no collation elements, and the character set
2967 is single byte, the single byte character set that we
2968 build below suffices. */
2969 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2971 /* Check the space of the arrays. */
2972 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2974 /* There is not enough space, need realloc. */
2975 uint32_t *new_array_start
;
2976 uint32_t *new_array_end
;
2979 /* +1 in case of mbcset->nranges is 0. */
2980 new_nranges
= 2 * mbcset
->nranges
+ 1;
2981 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2983 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2986 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2989 mbcset
->range_starts
= new_array_start
;
2990 mbcset
->range_ends
= new_array_end
;
2991 *range_alloc
= new_nranges
;
2994 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2995 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2998 /* Build the table for single byte characters. */
2999 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3001 uint32_t ch_collseq
;
3003 if (MB_CUR_MAX == 1)
3006 ch_collseq
= collseqmb
[ch
];
3008 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3009 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3010 bitset_set (sbcset
, ch
);
3015 /* Local function for parse_bracket_exp used in _LIBC environment.
3016 Build the collating element which is represented by NAME.
3017 The result are written to MBCSET and SBCSET.
3018 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019 pointer argument since we may update it. */
3021 auto inline reg_errcode_t
3022 __attribute__ ((always_inline
))
3023 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3024 Idx
*coll_sym_alloc
, const unsigned char *name
)
3027 size_t name_len
= strlen ((const char *) name
);
3030 elem
= seek_collating_symbol_entry (name
, name_len
);
3033 /* We found the entry. */
3034 idx
= symb_table
[2 * elem
+ 1];
3035 /* Skip the name of collating element name. */
3036 idx
+= 1 + extra
[idx
];
3038 else if (name_len
== 1)
3040 /* No valid character, treat it as a normal
3042 bitset_set (sbcset
, name
[0]);
3046 return REG_ECOLLATE
;
3048 /* Got valid collation sequence, add it as a new entry. */
3049 /* Check the space of the arrays. */
3050 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3052 /* Not enough, realloc it. */
3053 /* +1 in case of mbcset->ncoll_syms is 0. */
3054 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3055 /* Use realloc since mbcset->coll_syms is NULL
3057 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3058 new_coll_sym_alloc
);
3059 if (BE (new_coll_syms
== NULL
, 0))
3061 mbcset
->coll_syms
= new_coll_syms
;
3062 *coll_sym_alloc
= new_coll_sym_alloc
;
3064 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3069 if (BE (name_len
!= 1, 0))
3070 return REG_ECOLLATE
;
3073 bitset_set (sbcset
, name
[0]);
3080 re_token_t br_token
;
3081 re_bitset_ptr_t sbcset
;
3082 #ifdef RE_ENABLE_I18N
3083 re_charset_t
*mbcset
;
3084 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3085 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3086 #endif /* not RE_ENABLE_I18N */
3087 bool non_match
= false;
3088 bin_tree_t
*work_tree
;
3090 bool first_round
= true;
3092 collseqmb
= (const unsigned char *)
3093 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3094 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3100 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3101 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3102 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3103 _NL_COLLATE_SYMB_TABLEMB
);
3104 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3105 _NL_COLLATE_SYMB_EXTRAMB
);
3108 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3109 #ifdef RE_ENABLE_I18N
3110 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3111 #endif /* RE_ENABLE_I18N */
3112 #ifdef RE_ENABLE_I18N
3113 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3115 if (BE (sbcset
== NULL
, 0))
3116 #endif /* RE_ENABLE_I18N */
3119 #ifdef RE_ENABLE_I18N
3126 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3127 if (BE (token
->type
== END_OF_RE
, 0))
3130 goto parse_bracket_exp_free_return
;
3132 if (token
->type
== OP_NON_MATCH_LIST
)
3134 #ifdef RE_ENABLE_I18N
3135 mbcset
->non_match
= 1;
3136 #endif /* not RE_ENABLE_I18N */
3138 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3139 bitset_set (sbcset
, '\n');
3140 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3141 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3142 if (BE (token
->type
== END_OF_RE
, 0))
3145 goto parse_bracket_exp_free_return
;
3149 /* We treat the first ']' as a normal character. */
3150 if (token
->type
== OP_CLOSE_BRACKET
)
3151 token
->type
= CHARACTER
;
3155 bracket_elem_t start_elem
, end_elem
;
3156 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3157 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3160 bool is_range_exp
= false;
3163 start_elem
.opr
.name
= start_name_buf
;
3164 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3165 syntax
, first_round
);
3166 if (BE (ret
!= REG_NOERROR
, 0))
3169 goto parse_bracket_exp_free_return
;
3171 first_round
= false;
3173 /* Get information about the next token. We need it in any case. */
3174 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3176 /* Do not check for ranges if we know they are not allowed. */
3177 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3179 if (BE (token
->type
== END_OF_RE
, 0))
3182 goto parse_bracket_exp_free_return
;
3184 if (token
->type
== OP_CHARSET_RANGE
)
3186 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3187 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3188 if (BE (token2
.type
== END_OF_RE
, 0))
3191 goto parse_bracket_exp_free_return
;
3193 if (token2
.type
== OP_CLOSE_BRACKET
)
3195 /* We treat the last '-' as a normal character. */
3196 re_string_skip_bytes (regexp
, -token_len
);
3197 token
->type
= CHARACTER
;
3200 is_range_exp
= true;
3204 if (is_range_exp
== true)
3206 end_elem
.opr
.name
= end_name_buf
;
3207 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3209 if (BE (ret
!= REG_NOERROR
, 0))
3212 goto parse_bracket_exp_free_return
;
3215 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3218 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3219 &start_elem
, &end_elem
);
3221 # ifdef RE_ENABLE_I18N
3222 *err
= build_range_exp (syntax
, sbcset
,
3223 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3224 &range_alloc
, &start_elem
, &end_elem
);
3226 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3228 #endif /* RE_ENABLE_I18N */
3229 if (BE (*err
!= REG_NOERROR
, 0))
3230 goto parse_bracket_exp_free_return
;
3234 switch (start_elem
.type
)
3237 bitset_set (sbcset
, start_elem
.opr
.ch
);
3239 #ifdef RE_ENABLE_I18N
3241 /* Check whether the array has enough space. */
3242 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3244 wchar_t *new_mbchars
;
3245 /* Not enough, realloc it. */
3246 /* +1 in case of mbcset->nmbchars is 0. */
3247 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3248 /* Use realloc since array is NULL if *alloc == 0. */
3249 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3251 if (BE (new_mbchars
== NULL
, 0))
3252 goto parse_bracket_exp_espace
;
3253 mbcset
->mbchars
= new_mbchars
;
3255 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3257 #endif /* RE_ENABLE_I18N */
3259 *err
= build_equiv_class (sbcset
,
3260 #ifdef RE_ENABLE_I18N
3261 mbcset
, &equiv_class_alloc
,
3262 #endif /* RE_ENABLE_I18N */
3263 start_elem
.opr
.name
);
3264 if (BE (*err
!= REG_NOERROR
, 0))
3265 goto parse_bracket_exp_free_return
;
3268 *err
= build_collating_symbol (sbcset
,
3269 #ifdef RE_ENABLE_I18N
3270 mbcset
, &coll_sym_alloc
,
3271 #endif /* RE_ENABLE_I18N */
3272 start_elem
.opr
.name
);
3273 if (BE (*err
!= REG_NOERROR
, 0))
3274 goto parse_bracket_exp_free_return
;
3277 *err
= build_charclass (regexp
->trans
, sbcset
,
3278 #ifdef RE_ENABLE_I18N
3279 mbcset
, &char_class_alloc
,
3280 #endif /* RE_ENABLE_I18N */
3281 (const char *) start_elem
.opr
.name
,
3283 if (BE (*err
!= REG_NOERROR
, 0))
3284 goto parse_bracket_exp_free_return
;
3291 if (BE (token
->type
== END_OF_RE
, 0))
3294 goto parse_bracket_exp_free_return
;
3296 if (token
->type
== OP_CLOSE_BRACKET
)
3300 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3302 /* If it is non-matching list. */
3304 bitset_not (sbcset
);
3306 #ifdef RE_ENABLE_I18N
3307 /* Ensure only single byte characters are set. */
3308 if (dfa
->mb_cur_max
> 1)
3309 bitset_mask (sbcset
, dfa
->sb_char
);
3311 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3312 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3313 || mbcset
->non_match
)))
3315 bin_tree_t
*mbc_tree
;
3317 /* Build a tree for complex bracket. */
3318 dfa
->has_mb_node
= 1;
3319 br_token
.type
= COMPLEX_BRACKET
;
3320 br_token
.opr
.mbcset
= mbcset
;
3321 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3322 if (BE (mbc_tree
== NULL
, 0))
3323 goto parse_bracket_exp_espace
;
3324 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3325 if (sbcset
[sbc_idx
])
3327 /* If there are no bits set in sbcset, there is no point
3328 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3329 if (sbc_idx
< BITSET_WORDS
)
3331 /* Build a tree for simple bracket. */
3332 br_token
.type
= SIMPLE_BRACKET
;
3333 br_token
.opr
.sbcset
= sbcset
;
3334 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3335 if (BE (work_tree
== NULL
, 0))
3336 goto parse_bracket_exp_espace
;
3338 /* Then join them by ALT node. */
3339 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3340 if (BE (work_tree
== NULL
, 0))
3341 goto parse_bracket_exp_espace
;
3346 work_tree
= mbc_tree
;
3350 #endif /* not RE_ENABLE_I18N */
3352 #ifdef RE_ENABLE_I18N
3353 free_charset (mbcset
);
3355 /* Build a tree for simple bracket. */
3356 br_token
.type
= SIMPLE_BRACKET
;
3357 br_token
.opr
.sbcset
= sbcset
;
3358 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3359 if (BE (work_tree
== NULL
, 0))
3360 goto parse_bracket_exp_espace
;
3364 parse_bracket_exp_espace
:
3366 parse_bracket_exp_free_return
:
3368 #ifdef RE_ENABLE_I18N
3369 free_charset (mbcset
);
3370 #endif /* RE_ENABLE_I18N */
3374 /* Parse an element in the bracket expression. */
3376 static reg_errcode_t
3377 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3378 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3379 reg_syntax_t syntax
, bool accept_hyphen
)
3381 #ifdef RE_ENABLE_I18N
3383 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3384 if (cur_char_size
> 1)
3386 elem
->type
= MB_CHAR
;
3387 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3388 re_string_skip_bytes (regexp
, cur_char_size
);
3391 #endif /* RE_ENABLE_I18N */
3392 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3393 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3394 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3395 return parse_bracket_symbol (elem
, regexp
, token
);
3396 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3398 /* A '-' must only appear as anything but a range indicator before
3399 the closing bracket. Everything else is an error. */
3401 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3402 if (token2
.type
!= OP_CLOSE_BRACKET
)
3403 /* The actual error value is not standardized since this whole
3404 case is undefined. But ERANGE makes good sense. */
3407 elem
->type
= SB_CHAR
;
3408 elem
->opr
.ch
= token
->opr
.c
;
3412 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3413 such as [:<character_class>:], [.<collating_element>.], and
3414 [=<equivalent_class>=]. */
3416 static reg_errcode_t
3417 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3420 unsigned char ch
, delim
= token
->opr
.c
;
3422 if (re_string_eoi(regexp
))
3426 if (i
>= BRACKET_NAME_BUF_SIZE
)
3428 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3429 ch
= re_string_fetch_byte_case (regexp
);
3431 ch
= re_string_fetch_byte (regexp
);
3432 if (re_string_eoi(regexp
))
3434 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3436 elem
->opr
.name
[i
] = ch
;
3438 re_string_skip_bytes (regexp
, 1);
3439 elem
->opr
.name
[i
] = '\0';
3440 switch (token
->type
)
3442 case OP_OPEN_COLL_ELEM
:
3443 elem
->type
= COLL_SYM
;
3445 case OP_OPEN_EQUIV_CLASS
:
3446 elem
->type
= EQUIV_CLASS
;
3448 case OP_OPEN_CHAR_CLASS
:
3449 elem
->type
= CHAR_CLASS
;
3457 /* Helper function for parse_bracket_exp.
3458 Build the equivalence class which is represented by NAME.
3459 The result are written to MBCSET and SBCSET.
3460 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461 is a pointer argument since we may update it. */
3463 static reg_errcode_t
3464 #ifdef RE_ENABLE_I18N
3465 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3466 Idx
*equiv_class_alloc
, const unsigned char *name
)
3467 #else /* not RE_ENABLE_I18N */
3468 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3469 #endif /* not RE_ENABLE_I18N */
3472 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3475 const int32_t *table
, *indirect
;
3476 const unsigned char *weights
, *extra
, *cp
;
3477 unsigned char char_buf
[2];
3481 /* This #include defines a local function! */
3482 # include <locale/weight.h>
3483 /* Calculate the index for equivalence class. */
3485 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3486 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3487 _NL_COLLATE_WEIGHTMB
);
3488 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3489 _NL_COLLATE_EXTRAMB
);
3490 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3491 _NL_COLLATE_INDIRECTMB
);
3492 idx1
= findidx (&cp
, -1);
3493 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3494 /* This isn't a valid character. */
3495 return REG_ECOLLATE
;
3497 /* Build single byte matching table for this equivalence class. */
3498 len
= weights
[idx1
& 0xffffff];
3499 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3503 idx2
= findidx (&cp
, 1);
3508 /* This isn't a valid character. */
3510 /* Compare only if the length matches and the collation rule
3511 index is the same. */
3512 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3516 while (cnt
<= len
&&
3517 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3518 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3522 bitset_set (sbcset
, ch
);
3525 /* Check whether the array has enough space. */
3526 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nequiv_classes is 0. */
3530 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3531 /* Use realloc since the array is NULL if *alloc == 0. */
3532 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3534 new_equiv_class_alloc
);
3535 if (BE (new_equiv_classes
== NULL
, 0))
3537 mbcset
->equiv_classes
= new_equiv_classes
;
3538 *equiv_class_alloc
= new_equiv_class_alloc
;
3540 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3545 if (BE (strlen ((const char *) name
) != 1, 0))
3546 return REG_ECOLLATE
;
3547 bitset_set (sbcset
, *name
);
3552 /* Helper function for parse_bracket_exp.
3553 Build the character class which is represented by NAME.
3554 The result are written to MBCSET and SBCSET.
3555 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556 is a pointer argument since we may update it. */
3558 static reg_errcode_t
3559 #ifdef RE_ENABLE_I18N
3560 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3561 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3562 const char *class_name
, reg_syntax_t syntax
)
3563 #else /* not RE_ENABLE_I18N */
3564 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3565 const char *class_name
, reg_syntax_t syntax
)
3566 #endif /* not RE_ENABLE_I18N */
3569 const char *name
= class_name
;
3571 /* In case of REG_ICASE "upper" and "lower" match the both of
3572 upper and lower cases. */
3573 if ((syntax
& RE_ICASE
)
3574 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3577 #ifdef RE_ENABLE_I18N
3578 /* Check the space of the arrays. */
3579 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3581 /* Not enough, realloc it. */
3582 /* +1 in case of mbcset->nchar_classes is 0. */
3583 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3584 /* Use realloc since array is NULL if *alloc == 0. */
3585 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3586 new_char_class_alloc
);
3587 if (BE (new_char_classes
== NULL
, 0))
3589 mbcset
->char_classes
= new_char_classes
;
3590 *char_class_alloc
= new_char_class_alloc
;
3592 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3593 #endif /* RE_ENABLE_I18N */
3595 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3597 if (BE (trans != NULL, 0)) \
3599 for (i = 0; i < SBC_MAX; ++i) \
3600 if (ctype_func (i)) \
3601 bitset_set (sbcset, trans[i]); \
3605 for (i = 0; i < SBC_MAX; ++i) \
3606 if (ctype_func (i)) \
3607 bitset_set (sbcset, i); \
3611 if (strcmp (name
, "alnum") == 0)
3612 BUILD_CHARCLASS_LOOP (isalnum
);
3613 else if (strcmp (name
, "cntrl") == 0)
3614 BUILD_CHARCLASS_LOOP (iscntrl
);
3615 else if (strcmp (name
, "lower") == 0)
3616 BUILD_CHARCLASS_LOOP (islower
);
3617 else if (strcmp (name
, "space") == 0)
3618 BUILD_CHARCLASS_LOOP (isspace
);
3619 else if (strcmp (name
, "alpha") == 0)
3620 BUILD_CHARCLASS_LOOP (isalpha
);
3621 else if (strcmp (name
, "digit") == 0)
3622 BUILD_CHARCLASS_LOOP (isdigit
);
3623 else if (strcmp (name
, "print") == 0)
3624 BUILD_CHARCLASS_LOOP (isprint
);
3625 else if (strcmp (name
, "upper") == 0)
3626 BUILD_CHARCLASS_LOOP (isupper
);
3627 else if (strcmp (name
, "blank") == 0)
3628 BUILD_CHARCLASS_LOOP (isblank
);
3629 else if (strcmp (name
, "graph") == 0)
3630 BUILD_CHARCLASS_LOOP (isgraph
);
3631 else if (strcmp (name
, "punct") == 0)
3632 BUILD_CHARCLASS_LOOP (ispunct
);
3633 else if (strcmp (name
, "xdigit") == 0)
3634 BUILD_CHARCLASS_LOOP (isxdigit
);
3642 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3643 const char *class_name
,
3644 const char *extra
, bool non_match
,
3647 re_bitset_ptr_t sbcset
;
3648 #ifdef RE_ENABLE_I18N
3649 re_charset_t
*mbcset
;
3651 #endif /* not RE_ENABLE_I18N */
3653 re_token_t br_token
;
3656 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3657 #ifdef RE_ENABLE_I18N
3658 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3659 #endif /* RE_ENABLE_I18N */
3661 #ifdef RE_ENABLE_I18N
3662 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3663 #else /* not RE_ENABLE_I18N */
3664 if (BE (sbcset
== NULL
, 0))
3665 #endif /* not RE_ENABLE_I18N */
3673 #ifdef RE_ENABLE_I18N
3674 mbcset
->non_match
= 1;
3675 #endif /* not RE_ENABLE_I18N */
3678 /* We don't care the syntax in this case. */
3679 ret
= build_charclass (trans
, sbcset
,
3680 #ifdef RE_ENABLE_I18N
3682 #endif /* RE_ENABLE_I18N */
3685 if (BE (ret
!= REG_NOERROR
, 0))
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset
);
3690 #endif /* RE_ENABLE_I18N */
3694 /* \w match '_' also. */
3695 for (; *extra
; extra
++)
3696 bitset_set (sbcset
, *extra
);
3698 /* If it is non-matching list. */
3700 bitset_not (sbcset
);
3702 #ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa
->mb_cur_max
> 1)
3705 bitset_mask (sbcset
, dfa
->sb_char
);
3708 /* Build a tree for simple bracket. */
3709 br_token
.type
= SIMPLE_BRACKET
;
3710 br_token
.opr
.sbcset
= sbcset
;
3711 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3712 if (BE (tree
== NULL
, 0))
3713 goto build_word_op_espace
;
3715 #ifdef RE_ENABLE_I18N
3716 if (dfa
->mb_cur_max
> 1)
3718 bin_tree_t
*mbc_tree
;
3719 /* Build a tree for complex bracket. */
3720 br_token
.type
= COMPLEX_BRACKET
;
3721 br_token
.opr
.mbcset
= mbcset
;
3722 dfa
->has_mb_node
= 1;
3723 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3724 if (BE (mbc_tree
== NULL
, 0))
3725 goto build_word_op_espace
;
3726 /* Then join them by ALT node. */
3727 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3728 if (BE (mbc_tree
!= NULL
, 1))
3733 free_charset (mbcset
);
3736 #else /* not RE_ENABLE_I18N */
3738 #endif /* not RE_ENABLE_I18N */
3740 build_word_op_espace
:
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset
);
3744 #endif /* RE_ENABLE_I18N */
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from 'input', and return the number.
3751 Return REG_MISSING if the number field is empty like "{,1}".
3752 Return RE_DUP_MAX + 1 if the number field is too large.
3753 Return REG_ERROR if an error occurred. */
3756 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3758 Idx num
= REG_MISSING
;
3762 fetch_token (token
, input
, syntax
);
3764 if (BE (token
->type
== END_OF_RE
, 0))
3766 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3768 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
3769 || num
== REG_ERROR
)
3771 : num
== REG_MISSING
3773 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3778 #ifdef RE_ENABLE_I18N
3780 free_charset (re_charset_t
*cset
)
3782 re_free (cset
->mbchars
);
3784 re_free (cset
->coll_syms
);
3785 re_free (cset
->equiv_classes
);
3786 re_free (cset
->range_starts
);
3787 re_free (cset
->range_ends
);
3789 re_free (cset
->char_classes
);
3792 #endif /* RE_ENABLE_I18N */
3794 /* Functions for binary tree operation. */
3796 /* Create a tree node. */
3799 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3800 re_token_type_t type
)
3804 return create_token_tree (dfa
, left
, right
, &t
);
3808 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3809 const re_token_t
*token
)
3812 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3814 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3816 if (storage
== NULL
)
3818 storage
->next
= dfa
->str_tree_storage
;
3819 dfa
->str_tree_storage
= storage
;
3820 dfa
->str_tree_storage_idx
= 0;
3822 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3824 tree
->parent
= NULL
;
3826 tree
->right
= right
;
3827 tree
->token
= *token
;
3828 tree
->token
.duplicated
= 0;
3829 tree
->token
.opt_subexp
= 0;
3832 tree
->node_idx
= REG_MISSING
;
3835 left
->parent
= tree
;
3837 right
->parent
= tree
;
3841 /* Mark the tree SRC as an optional subexpression.
3842 To be called from preorder or postorder. */
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3847 Idx idx
= (uintptr_t) extra
;
3848 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3849 node
->token
.opt_subexp
= 1;
3854 /* Free the allocated memory inside NODE. */
3857 free_token (re_token_t
*node
)
3859 #ifdef RE_ENABLE_I18N
3860 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3861 free_charset (node
->opr
.mbcset
);
3863 #endif /* RE_ENABLE_I18N */
3864 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3865 re_free (node
->opr
.sbcset
);
3868 /* Worker function for tree walking. Free the allocated memory inside NODE
3869 and its children. */
3871 static reg_errcode_t
3872 free_tree (void *extra
, bin_tree_t
*node
)
3874 free_token (&node
->token
);
3879 /* Duplicate the node SRC, and return new node. This is a preorder
3880 visit similar to the one implemented by the generic visitor, but
3881 we need more infrastructure to maintain two parallel trees --- so,
3882 it's easier to duplicate. */
3885 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3887 const bin_tree_t
*node
;
3888 bin_tree_t
*dup_root
;
3889 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3891 for (node
= root
; ; )
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3897 (*p_new
)->parent
= dup_node
;
3898 (*p_new
)->token
.duplicated
= 1;
3901 /* Go to the left node, or up and to the right. */
3905 p_new
= &dup_node
->left
;
3909 const bin_tree_t
*prev
= NULL
;
3910 while (node
->right
== prev
|| node
->right
== NULL
)
3913 node
= node
->parent
;
3914 dup_node
= dup_node
->parent
;
3919 p_new
= &dup_node
->right
;