1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
31 static void free_charset (re_charset_t
*cset
);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t
*preg
);
34 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
36 static void optimize_utf8 (re_dfa_t
*dfa
);
38 static reg_errcode_t
analyze (regex_t
*preg
);
39 static reg_errcode_t
preorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
postorder (bin_tree_t
*root
,
43 reg_errcode_t (fn (void *, bin_tree_t
*)),
45 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
47 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
49 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
52 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
53 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
54 unsigned int constraint
);
55 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
56 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
58 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
59 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
61 static int peek_token (re_token_t
*token
, re_string_t
*input
,
62 reg_syntax_t syntax
) internal_function
;
63 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
64 reg_syntax_t syntax
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
75 re_token_t
*token
, reg_syntax_t syntax
,
76 Idx nest
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
78 re_dfa_t
*dfa
, re_token_t
*token
,
79 reg_syntax_t syntax
, reg_errcode_t
*err
);
80 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
81 re_token_t
*token
, reg_syntax_t syntax
,
83 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
85 re_token_t
*token
, int token_len
,
89 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
93 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
95 Idx
*equiv_class_alloc
,
96 const unsigned char *name
);
97 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
100 Idx
*char_class_alloc
,
101 const char *class_name
,
102 reg_syntax_t syntax
);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
105 const unsigned char *name
);
106 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 const char *class_name
,
109 reg_syntax_t syntax
);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
112 RE_TRANSLATE_TYPE trans
,
113 const char *class_name
,
115 bool non_match
, reg_errcode_t
*err
);
116 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 re_token_type_t type
);
119 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
120 bin_tree_t
*left
, bin_tree_t
*right
,
121 const re_token_t
*token
);
122 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
123 static void free_token (re_token_t
*node
);
124 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
125 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid
[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx
[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
218 re_compile_pattern (pattern
, length
, bufp
)
221 struct re_pattern_buffer
*bufp
;
222 #else /* size_t might promote */
224 re_compile_pattern (const char *pattern
, size_t length
,
225 struct re_pattern_buffer
*bufp
)
230 /* And GNU code determines whether or not to get register information
231 by passing null for the REGS argument to re_match, etc., not by
232 setting no_sub, unless RE_NO_SUB is set. */
233 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
235 /* Match anchors at newline. */
236 bufp
->newline_anchor
= 1;
238 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
242 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
245 weak_alias (__re_compile_pattern
, re_compile_pattern
)
248 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
249 also be assigned to arbitrarily: each pattern buffer stores its own
250 syntax, so it can be changed between regex compilations. */
251 /* This has no initializer because initialized variables in Emacs
252 become read-only after dumping. */
253 reg_syntax_t re_syntax_options
;
256 /* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
260 The argument SYNTAX is a bit mask comprised of the various bits
261 defined in regex.h. We return the old syntax. */
264 re_set_syntax (syntax
)
267 reg_syntax_t ret
= re_syntax_options
;
269 re_syntax_options
= syntax
;
273 weak_alias (__re_set_syntax
, re_set_syntax
)
277 re_compile_fastmap (bufp
)
278 struct re_pattern_buffer
*bufp
;
280 re_dfa_t
*dfa
= bufp
->buffer
;
281 char *fastmap
= bufp
->fastmap
;
283 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
284 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
285 if (dfa
->init_state
!= dfa
->init_state_word
)
286 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
287 if (dfa
->init_state
!= dfa
->init_state_nl
)
288 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
289 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
290 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
291 bufp
->fastmap_accurate
= 1;
295 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
299 __attribute__ ((always_inline
))
300 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
304 fastmap
[tolower (ch
)] = 1;
307 /* Helper function for re_compile_fastmap.
308 Compile fastmap for the initial_state INIT_STATE. */
311 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
314 re_dfa_t
*dfa
= bufp
->buffer
;
316 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
317 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
319 Idx node
= init_state
->nodes
.elems
[node_cnt
];
320 re_token_type_t type
= dfa
->nodes
[node
].type
;
322 if (type
== CHARACTER
)
324 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
325 #ifdef RE_ENABLE_I18N
326 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
328 unsigned char buf
[MB_LEN_MAX
];
334 *p
++ = dfa
->nodes
[node
].opr
.c
;
335 while (++node
< dfa
->nodes_len
336 && dfa
->nodes
[node
].type
== CHARACTER
337 && dfa
->nodes
[node
].mb_partial
)
338 *p
++ = dfa
->nodes
[node
].opr
.c
;
339 memset (&state
, '\0', sizeof (state
));
340 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
342 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
344 re_set_fastmap (fastmap
, false, buf
[0]);
348 else if (type
== SIMPLE_BRACKET
)
351 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
354 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
355 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
356 if (w
& ((bitset_word_t
) 1 << j
))
357 re_set_fastmap (fastmap
, icase
, ch
);
360 #ifdef RE_ENABLE_I18N
361 else if (type
== COMPLEX_BRACKET
)
363 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
367 /* See if we have to try all bytes which start multiple collation
369 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
370 collation element, and don't catch 'b' since 'b' is
371 the only collation element which starts from 'b' (and
372 it is caught by SIMPLE_BRACKET). */
373 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
374 && (cset
->ncoll_syms
|| cset
->nranges
))
376 const int32_t *table
= (const int32_t *)
377 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
378 for (i
= 0; i
< SBC_MAX
; ++i
)
380 re_set_fastmap (fastmap
, icase
, i
);
384 /* See if we have to start the match at all multibyte characters,
385 i.e. where we would not find an invalid sequence. This only
386 applies to multibyte character sets; for single byte character
387 sets, the SIMPLE_BRACKET again suffices. */
388 if (dfa
->mb_cur_max
> 1
389 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
391 || cset
->nequiv_classes
399 memset (&mbs
, 0, sizeof (mbs
));
400 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
401 re_set_fastmap (fastmap
, false, (int) c
);
408 /* ... Else catch all bytes which can start the mbchars. */
409 for (i
= 0; i
< cset
->nmbchars
; ++i
)
413 memset (&state
, '\0', sizeof (state
));
414 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
415 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
416 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
418 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
420 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
425 #endif /* RE_ENABLE_I18N */
426 else if (type
== OP_PERIOD
427 #ifdef RE_ENABLE_I18N
428 || type
== OP_UTF8_PERIOD
429 #endif /* RE_ENABLE_I18N */
430 || type
== END_OF_RE
)
432 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
433 if (type
== END_OF_RE
)
434 bufp
->can_be_null
= 1;
440 /* Entry point for POSIX code. */
441 /* regcomp takes a regular expression as a string and compiles it.
443 PREG is a regex_t *. We do not expect any fields to be initialized,
444 since POSIX says we shouldn't. Thus, we set
446 'buffer' to the compiled pattern;
447 'used' to the length of the compiled pattern;
448 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
449 REG_EXTENDED bit in CFLAGS is set; otherwise, to
450 RE_SYNTAX_POSIX_BASIC;
451 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
452 'fastmap' to an allocated space for the fastmap;
453 'fastmap_accurate' to zero;
454 're_nsub' to the number of subexpressions in PATTERN.
456 PATTERN is the address of the pattern string.
458 CFLAGS is a series of bits which affect compilation.
460 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
461 use POSIX basic syntax.
463 If REG_NEWLINE is set, then . and [^...] don't match newline.
464 Also, regexec will try a match beginning after every newline.
466 If REG_ICASE is set, then we considers upper- and lowercase
467 versions of letters to be equivalent when matching.
469 If REG_NOSUB is set, then when PREG is passed to regexec, that
470 routine will report only success or failure, and nothing about the
473 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
474 the return codes and their meanings.) */
477 regcomp (preg
, pattern
, cflags
)
478 regex_t
*_Restrict_ preg
;
479 const char *_Restrict_ pattern
;
483 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
484 : RE_SYNTAX_POSIX_BASIC
);
490 /* Try to allocate space for the fastmap. */
491 preg
->fastmap
= re_malloc (char, SBC_MAX
);
492 if (BE (preg
->fastmap
== NULL
, 0))
495 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
497 /* If REG_NEWLINE is set, newlines are treated differently. */
498 if (cflags
& REG_NEWLINE
)
499 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
500 syntax
&= ~RE_DOT_NEWLINE
;
501 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
502 /* It also changes the matching behavior. */
503 preg
->newline_anchor
= 1;
506 preg
->newline_anchor
= 0;
507 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
508 preg
->translate
= NULL
;
510 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
512 /* POSIX doesn't distinguish between an unmatched open-group and an
513 unmatched close-group: both are REG_EPAREN. */
514 if (ret
== REG_ERPAREN
)
517 /* We have already checked preg->fastmap != NULL. */
518 if (BE (ret
== REG_NOERROR
, 1))
519 /* Compute the fastmap now, since regexec cannot modify the pattern
520 buffer. This function never fails in this implementation. */
521 (void) re_compile_fastmap (preg
);
524 /* Some error occurred while compiling the expression. */
525 re_free (preg
->fastmap
);
526 preg
->fastmap
= NULL
;
532 weak_alias (__regcomp
, regcomp
)
535 /* Returns a message corresponding to an error code, ERRCODE, returned
536 from either regcomp or regexec. We don't use PREG here. */
540 regerror (errcode
, preg
, errbuf
, errbuf_size
)
542 const regex_t
*_Restrict_ preg
;
543 char *_Restrict_ errbuf
;
545 #else /* size_t might promote */
547 regerror (int errcode
, const regex_t
*_Restrict_ preg
,
548 char *_Restrict_ errbuf
, size_t errbuf_size
)
555 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
556 / sizeof (__re_error_msgid_idx
[0])), 0))
557 /* Only error codes returned by the rest of the code should be passed
558 to this routine. If we are given anything else, or if other regex
559 code generates an invalid error code, then the program has a bug.
560 Dump core so we can fix it. */
563 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
565 msg_size
= strlen (msg
) + 1; /* Includes the null. */
567 if (BE (errbuf_size
!= 0, 1))
569 size_t cpy_size
= msg_size
;
570 if (BE (msg_size
> errbuf_size
, 0))
572 cpy_size
= errbuf_size
- 1;
573 errbuf
[cpy_size
] = '\0';
575 memcpy (errbuf
, msg
, cpy_size
);
581 weak_alias (__regerror
, regerror
)
585 #ifdef RE_ENABLE_I18N
586 /* This static array is used for the map to single-byte characters when
587 UTF-8 is used. Otherwise we would allocate memory just to initialize
588 it the same all the time. UTF-8 is the preferred encoding so this is
589 a worthwhile optimization. */
590 static const bitset_t utf8_sb_map
=
592 /* Set the first 128 bits. */
593 # if defined __GNUC__ && !defined __STRICT_ANSI__
594 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
596 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
597 # error "bitset_word_t is narrower than 32 bits"
598 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
600 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
601 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
602 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
606 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
608 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
615 free_dfa_content (re_dfa_t
*dfa
)
620 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
621 free_token (dfa
->nodes
+ i
);
622 re_free (dfa
->nexts
);
623 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
625 if (dfa
->eclosures
!= NULL
)
626 re_node_set_free (dfa
->eclosures
+ i
);
627 if (dfa
->inveclosures
!= NULL
)
628 re_node_set_free (dfa
->inveclosures
+ i
);
629 if (dfa
->edests
!= NULL
)
630 re_node_set_free (dfa
->edests
+ i
);
632 re_free (dfa
->edests
);
633 re_free (dfa
->eclosures
);
634 re_free (dfa
->inveclosures
);
635 re_free (dfa
->nodes
);
637 if (dfa
->state_table
)
638 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
640 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
641 for (j
= 0; j
< entry
->num
; ++j
)
643 re_dfastate_t
*state
= entry
->array
[j
];
646 re_free (entry
->array
);
648 re_free (dfa
->state_table
);
649 #ifdef RE_ENABLE_I18N
650 if (dfa
->sb_char
!= utf8_sb_map
)
651 re_free (dfa
->sb_char
);
653 re_free (dfa
->subexp_map
);
655 re_free (dfa
->re_str
);
662 /* Free dynamically allocated space used by PREG. */
668 re_dfa_t
*dfa
= preg
->buffer
;
669 if (BE (dfa
!= NULL
, 1))
671 lock_fini (dfa
->lock
);
672 free_dfa_content (dfa
);
677 re_free (preg
->fastmap
);
678 preg
->fastmap
= NULL
;
680 re_free (preg
->translate
);
681 preg
->translate
= NULL
;
684 weak_alias (__regfree
, regfree
)
687 /* Entry points compatible with 4.2 BSD regex library. We don't define
688 them unless specifically requested. */
690 #if defined _REGEX_RE_COMP || defined _LIBC
692 /* BSD has one and only one pattern buffer. */
693 static struct re_pattern_buffer re_comp_buf
;
697 /* Make these definitions weak in libc, so POSIX programs can redefine
698 these names if they don't use our functions, and still use
699 regcomp/regexec above without link errors. */
710 if (!re_comp_buf
.buffer
)
711 return gettext ("No previous regular expression");
715 if (re_comp_buf
.buffer
)
717 fastmap
= re_comp_buf
.fastmap
;
718 re_comp_buf
.fastmap
= NULL
;
719 __regfree (&re_comp_buf
);
720 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
721 re_comp_buf
.fastmap
= fastmap
;
724 if (re_comp_buf
.fastmap
== NULL
)
726 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
727 if (re_comp_buf
.fastmap
== NULL
)
728 return (char *) gettext (__re_error_msgid
729 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
732 /* Since 're_exec' always passes NULL for the 'regs' argument, we
733 don't need to initialize the pattern buffer fields which affect it. */
735 /* Match anchors at newlines. */
736 re_comp_buf
.newline_anchor
= 1;
738 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
743 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
744 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
748 libc_freeres_fn (free_mem
)
750 __regfree (&re_comp_buf
);
754 #endif /* _REGEX_RE_COMP */
756 /* Internal entry point.
757 Compile the regular expression PATTERN, whose length is LENGTH.
758 SYNTAX indicate regular expression's syntax. */
761 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
764 reg_errcode_t err
= REG_NOERROR
;
768 /* Initialize the pattern buffer. */
769 preg
->fastmap_accurate
= 0;
770 preg
->syntax
= syntax
;
771 preg
->not_bol
= preg
->not_eol
= 0;
774 preg
->can_be_null
= 0;
775 preg
->regs_allocated
= REGS_UNALLOCATED
;
777 /* Initialize the dfa. */
779 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
781 /* If zero allocated, but buffer is non-null, try to realloc
782 enough space. This loses if buffer's address is bogus, but
783 that is the user's responsibility. If ->buffer is NULL this
784 is a simple allocation. */
785 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
788 preg
->allocated
= sizeof (re_dfa_t
);
791 preg
->used
= sizeof (re_dfa_t
);
793 err
= init_dfa (dfa
, length
);
794 if (BE (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0, 0))
796 if (BE (err
!= REG_NOERROR
, 0))
798 free_dfa_content (dfa
);
804 /* Note: length+1 will not overflow since it is checked in init_dfa. */
805 dfa
->re_str
= re_malloc (char, length
+ 1);
806 strncpy (dfa
->re_str
, pattern
, length
+ 1);
809 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
810 (syntax
& RE_ICASE
) != 0, dfa
);
811 if (BE (err
!= REG_NOERROR
, 0))
813 re_compile_internal_free_return
:
814 free_workarea_compile (preg
);
815 re_string_destruct (®exp
);
816 lock_fini (dfa
->lock
);
817 free_dfa_content (dfa
);
823 /* Parse the regular expression, and build a structure tree. */
825 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
826 if (BE (dfa
->str_tree
== NULL
, 0))
827 goto re_compile_internal_free_return
;
829 /* Analyze the tree and create the nfa. */
830 err
= analyze (preg
);
831 if (BE (err
!= REG_NOERROR
, 0))
832 goto re_compile_internal_free_return
;
834 #ifdef RE_ENABLE_I18N
835 /* If possible, do searching in single byte encoding to speed things up. */
836 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
840 /* Then create the initial state of the dfa. */
841 err
= create_initial_state (dfa
);
843 /* Release work areas. */
844 free_workarea_compile (preg
);
845 re_string_destruct (®exp
);
847 if (BE (err
!= REG_NOERROR
, 0))
849 lock_fini (dfa
->lock
);
850 free_dfa_content (dfa
);
858 /* Initialize DFA. We use the length of the regular expression PAT_LEN
859 as the initial length of some arrays. */
862 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
864 __re_size_t table_size
;
866 const char *codeset_name
;
868 #ifdef RE_ENABLE_I18N
869 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
871 size_t max_i18n_object_size
= 0;
873 size_t max_object_size
=
874 MAX (sizeof (struct re_state_table_entry
),
875 MAX (sizeof (re_token_t
),
876 MAX (sizeof (re_node_set
),
877 MAX (sizeof (regmatch_t
),
878 max_i18n_object_size
))));
880 memset (dfa
, '\0', sizeof (re_dfa_t
));
882 /* Force allocation of str_tree_storage the first time. */
883 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
885 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
886 calculation below, and for similar doubling calculations
887 elsewhere. And it's <= rather than <, because some of the
888 doubling calculations add 1 afterwards. */
889 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2 <= pat_len
, 0))
892 dfa
->nodes_alloc
= pat_len
+ 1;
893 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
895 /* table_size = 2 ^ ceil(log pat_len) */
896 for (table_size
= 1; ; table_size
<<= 1)
897 if (table_size
> pat_len
)
900 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
901 dfa
->state_hash_mask
= table_size
- 1;
903 dfa
->mb_cur_max
= MB_CUR_MAX
;
905 if (dfa
->mb_cur_max
== 6
906 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
908 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
911 codeset_name
= nl_langinfo (CODESET
);
912 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
913 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
914 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
915 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
918 /* We check exhaustively in the loop below if this charset is a
919 superset of ASCII. */
920 dfa
->map_notascii
= 0;
923 #ifdef RE_ENABLE_I18N
924 if (dfa
->mb_cur_max
> 1)
927 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
932 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
933 if (BE (dfa
->sb_char
== NULL
, 0))
936 /* Set the bits corresponding to single byte chars. */
937 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
938 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
940 wint_t wch
= __btowc (ch
);
942 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
944 if (isascii (ch
) && wch
!= ch
)
945 dfa
->map_notascii
= 1;
952 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
957 /* Initialize WORD_CHAR table, which indicate which character is
958 "word". In this case "word" means that it is the word construction
959 character used by some operators like "\<", "\>", etc. */
963 init_word_char (re_dfa_t
*dfa
)
968 dfa
->word_ops_used
= 1;
969 if (BE (dfa
->map_notascii
== 0, 1))
971 bitset_word_t bits0
= 0x00000000;
972 bitset_word_t bits1
= 0x03ff0000;
973 bitset_word_t bits2
= 0x87fffffe;
974 bitset_word_t bits3
= 0x07fffffe;
975 if (BITSET_WORD_BITS
== 64)
977 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
978 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
981 else if (BITSET_WORD_BITS
== 32)
983 dfa
->word_char
[0] = bits0
;
984 dfa
->word_char
[1] = bits1
;
985 dfa
->word_char
[2] = bits2
;
986 dfa
->word_char
[3] = bits3
;
993 if (BE (dfa
->is_utf8
, 1))
995 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
1001 for (; i
< BITSET_WORDS
; ++i
)
1002 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
1003 if (isalnum (ch
) || ch
== '_')
1004 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
1007 /* Free the work area which are only used while compiling. */
1010 free_workarea_compile (regex_t
*preg
)
1012 re_dfa_t
*dfa
= preg
->buffer
;
1013 bin_tree_storage_t
*storage
, *next
;
1014 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
1016 next
= storage
->next
;
1019 dfa
->str_tree_storage
= NULL
;
1020 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
1021 dfa
->str_tree
= NULL
;
1022 re_free (dfa
->org_indices
);
1023 dfa
->org_indices
= NULL
;
1026 /* Create initial states for all contexts. */
1028 static reg_errcode_t
1029 create_initial_state (re_dfa_t
*dfa
)
1033 re_node_set init_nodes
;
1035 /* Initial states have the epsilon closure of the node which is
1036 the first node of the regular expression. */
1037 first
= dfa
->str_tree
->first
->node_idx
;
1038 dfa
->init_node
= first
;
1039 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1040 if (BE (err
!= REG_NOERROR
, 0))
1043 /* The back-references which are in initial states can epsilon transit,
1044 since in this case all of the subexpressions can be null.
1045 Then we add epsilon closures of the nodes which are the next nodes of
1046 the back-references. */
1047 if (dfa
->nbackref
> 0)
1048 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1050 Idx node_idx
= init_nodes
.elems
[i
];
1051 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1054 if (type
!= OP_BACK_REF
)
1056 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1058 re_token_t
*clexp_node
;
1059 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1060 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1061 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1064 if (clexp_idx
== init_nodes
.nelem
)
1067 if (type
== OP_BACK_REF
)
1069 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1070 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1072 reg_errcode_t merge_err
1073 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1074 if (merge_err
!= REG_NOERROR
)
1081 /* It must be the first time to invoke acquire_state. */
1082 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1083 /* We don't check ERR here, since the initial state must not be NULL. */
1084 if (BE (dfa
->init_state
== NULL
, 0))
1086 if (dfa
->init_state
->has_constraint
)
1088 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1090 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1092 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1096 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1097 || dfa
->init_state_begbuf
== NULL
, 0))
1101 dfa
->init_state_word
= dfa
->init_state_nl
1102 = dfa
->init_state_begbuf
= dfa
->init_state
;
1104 re_node_set_free (&init_nodes
);
1108 #ifdef RE_ENABLE_I18N
1109 /* If it is possible to do searching in single byte encoding instead of UTF-8
1110 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1111 DFA nodes where needed. */
1114 optimize_utf8 (re_dfa_t
*dfa
)
1118 bool mb_chars
= false;
1119 bool has_period
= false;
1121 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1122 switch (dfa
->nodes
[node
].type
)
1125 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1129 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1137 /* Word anchors etc. cannot be handled. It's okay to test
1138 opr.ctx_type since constraints (for all DFA nodes) are
1139 created by ORing one or more opr.ctx_type values. */
1149 case OP_DUP_ASTERISK
:
1150 case OP_OPEN_SUBEXP
:
1151 case OP_CLOSE_SUBEXP
:
1153 case COMPLEX_BRACKET
:
1155 case SIMPLE_BRACKET
:
1156 /* Just double check. */
1158 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1160 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1161 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1163 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1173 if (mb_chars
|| has_period
)
1174 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1176 if (dfa
->nodes
[node
].type
== CHARACTER
1177 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1178 dfa
->nodes
[node
].mb_partial
= 0;
1179 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1180 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1183 /* The search can be in single byte locale. */
1184 dfa
->mb_cur_max
= 1;
1186 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1190 /* Analyze the structure tree, and calculate "first", "next", "edest",
1191 "eclosure", and "inveclosure". */
1193 static reg_errcode_t
1194 analyze (regex_t
*preg
)
1196 re_dfa_t
*dfa
= preg
->buffer
;
1199 /* Allocate arrays. */
1200 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1201 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1202 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1203 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1204 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1205 || dfa
->eclosures
== NULL
, 0))
1208 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1209 if (dfa
->subexp_map
!= NULL
)
1212 for (i
= 0; i
< preg
->re_nsub
; i
++)
1213 dfa
->subexp_map
[i
] = i
;
1214 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1215 for (i
= 0; i
< preg
->re_nsub
; i
++)
1216 if (dfa
->subexp_map
[i
] != i
)
1218 if (i
== preg
->re_nsub
)
1220 free (dfa
->subexp_map
);
1221 dfa
->subexp_map
= NULL
;
1225 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1226 if (BE (ret
!= REG_NOERROR
, 0))
1228 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1229 if (BE (ret
!= REG_NOERROR
, 0))
1231 preorder (dfa
->str_tree
, calc_next
, dfa
);
1232 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1233 if (BE (ret
!= REG_NOERROR
, 0))
1235 ret
= calc_eclosure (dfa
);
1236 if (BE (ret
!= REG_NOERROR
, 0))
1239 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1240 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1241 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1244 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1245 if (BE (dfa
->inveclosures
== NULL
, 0))
1247 ret
= calc_inveclosure (dfa
);
1253 /* Our parse trees are very unbalanced, so we cannot use a stack to
1254 implement parse tree visits. Instead, we use parent pointers and
1255 some hairy code in these two functions. */
1256 static reg_errcode_t
1257 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1260 bin_tree_t
*node
, *prev
;
1262 for (node
= root
; ; )
1264 /* Descend down the tree, preferably to the left (or to the right
1265 if that's the only child). */
1266 while (node
->left
|| node
->right
)
1274 reg_errcode_t err
= fn (extra
, node
);
1275 if (BE (err
!= REG_NOERROR
, 0))
1277 if (node
->parent
== NULL
)
1280 node
= node
->parent
;
1282 /* Go up while we have a node that is reached from the right. */
1283 while (node
->right
== prev
|| node
->right
== NULL
);
1288 static reg_errcode_t
1289 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1294 for (node
= root
; ; )
1296 reg_errcode_t err
= fn (extra
, node
);
1297 if (BE (err
!= REG_NOERROR
, 0))
1300 /* Go to the left node, or up and to the right. */
1305 bin_tree_t
*prev
= NULL
;
1306 while (node
->right
== prev
|| node
->right
== NULL
)
1309 node
= node
->parent
;
1318 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1319 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1320 backreferences as well. Requires a preorder visit. */
1321 static reg_errcode_t
1322 optimize_subexps (void *extra
, bin_tree_t
*node
)
1324 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1326 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1328 int idx
= node
->token
.opr
.idx
;
1329 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1330 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1333 else if (node
->token
.type
== SUBEXP
1334 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1336 Idx other_idx
= node
->left
->token
.opr
.idx
;
1338 node
->left
= node
->left
->left
;
1340 node
->left
->parent
= node
;
1342 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1343 if (other_idx
< BITSET_WORD_BITS
)
1344 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1350 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1351 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1352 static reg_errcode_t
1353 lower_subexps (void *extra
, bin_tree_t
*node
)
1355 regex_t
*preg
= (regex_t
*) extra
;
1356 reg_errcode_t err
= REG_NOERROR
;
1358 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1360 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1362 node
->left
->parent
= node
;
1364 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1366 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1368 node
->right
->parent
= node
;
1375 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1377 re_dfa_t
*dfa
= preg
->buffer
;
1378 bin_tree_t
*body
= node
->left
;
1379 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1382 /* We do not optimize empty subexpressions, because otherwise we may
1383 have bad CONCAT nodes with NULL children. This is obviously not
1384 very common, so we do not lose much. An example that triggers
1385 this case is the sed "script" /\(\)/x. */
1386 && node
->left
!= NULL
1387 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1388 || !(dfa
->used_bkref_map
1389 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1392 /* Convert the SUBEXP node to the concatenation of an
1393 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1394 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1395 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1396 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1397 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1398 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1404 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1405 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1409 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1410 nodes. Requires a postorder visit. */
1411 static reg_errcode_t
1412 calc_first (void *extra
, bin_tree_t
*node
)
1414 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1415 if (node
->token
.type
== CONCAT
)
1417 node
->first
= node
->left
->first
;
1418 node
->node_idx
= node
->left
->node_idx
;
1423 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1424 if (BE (node
->node_idx
== REG_MISSING
, 0))
1426 if (node
->token
.type
== ANCHOR
)
1427 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1432 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1433 static reg_errcode_t
1434 calc_next (void *extra
, bin_tree_t
*node
)
1436 switch (node
->token
.type
)
1438 case OP_DUP_ASTERISK
:
1439 node
->left
->next
= node
;
1442 node
->left
->next
= node
->right
->first
;
1443 node
->right
->next
= node
->next
;
1447 node
->left
->next
= node
->next
;
1449 node
->right
->next
= node
->next
;
1455 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1456 static reg_errcode_t
1457 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1459 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1460 Idx idx
= node
->node_idx
;
1461 reg_errcode_t err
= REG_NOERROR
;
1463 switch (node
->token
.type
)
1469 assert (node
->next
== NULL
);
1472 case OP_DUP_ASTERISK
:
1476 dfa
->has_plural_match
= 1;
1477 if (node
->left
!= NULL
)
1478 left
= node
->left
->first
->node_idx
;
1480 left
= node
->next
->node_idx
;
1481 if (node
->right
!= NULL
)
1482 right
= node
->right
->first
->node_idx
;
1484 right
= node
->next
->node_idx
;
1485 assert (REG_VALID_INDEX (left
));
1486 assert (REG_VALID_INDEX (right
));
1487 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1492 case OP_OPEN_SUBEXP
:
1493 case OP_CLOSE_SUBEXP
:
1494 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1498 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1499 if (node
->token
.type
== OP_BACK_REF
)
1500 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1504 assert (!IS_EPSILON_NODE (node
->token
.type
));
1505 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1512 /* Duplicate the epsilon closure of the node ROOT_NODE.
1513 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1514 to their own constraint. */
1516 static reg_errcode_t
1518 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1519 Idx root_node
, unsigned int init_constraint
)
1521 Idx org_node
, clone_node
;
1523 unsigned int constraint
= init_constraint
;
1524 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1526 Idx org_dest
, clone_dest
;
1527 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1529 /* If the back reference epsilon-transit, its destination must
1530 also have the constraint. Then duplicate the epsilon closure
1531 of the destination of the back reference, and store it in
1532 edests of the back reference. */
1533 org_dest
= dfa
->nexts
[org_node
];
1534 re_node_set_empty (dfa
->edests
+ clone_node
);
1535 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1536 if (BE (clone_dest
== REG_MISSING
, 0))
1538 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1539 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1543 else if (dfa
->edests
[org_node
].nelem
== 0)
1545 /* In case of the node can't epsilon-transit, don't duplicate the
1546 destination and store the original destination as the
1547 destination of the node. */
1548 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1551 else if (dfa
->edests
[org_node
].nelem
== 1)
1553 /* In case of the node can epsilon-transit, and it has only one
1555 org_dest
= dfa
->edests
[org_node
].elems
[0];
1556 re_node_set_empty (dfa
->edests
+ clone_node
);
1557 /* If the node is root_node itself, it means the epsilon closure
1558 has a loop. Then tie it to the destination of the root_node. */
1559 if (org_node
== root_node
&& clone_node
!= org_node
)
1561 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1566 /* In case the node has another constraint, append it. */
1567 constraint
|= dfa
->nodes
[org_node
].constraint
;
1568 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1569 if (BE (clone_dest
== REG_MISSING
, 0))
1571 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1575 else /* dfa->edests[org_node].nelem == 2 */
1577 /* In case of the node can epsilon-transit, and it has two
1578 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1579 org_dest
= dfa
->edests
[org_node
].elems
[0];
1580 re_node_set_empty (dfa
->edests
+ clone_node
);
1581 /* Search for a duplicated node which satisfies the constraint. */
1582 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1583 if (clone_dest
== REG_MISSING
)
1585 /* There is no such duplicated node, create a new one. */
1587 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1588 if (BE (clone_dest
== REG_MISSING
, 0))
1590 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1593 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1594 root_node
, constraint
);
1595 if (BE (err
!= REG_NOERROR
, 0))
1600 /* There is a duplicated node which satisfies the constraint,
1601 use it to avoid infinite loop. */
1602 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1607 org_dest
= dfa
->edests
[org_node
].elems
[1];
1608 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1609 if (BE (clone_dest
== REG_MISSING
, 0))
1611 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1615 org_node
= org_dest
;
1616 clone_node
= clone_dest
;
1621 /* Search for a node which is duplicated from the node ORG_NODE, and
1622 satisfies the constraint CONSTRAINT. */
1625 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1626 unsigned int constraint
)
1629 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1631 if (org_node
== dfa
->org_indices
[idx
]
1632 && constraint
== dfa
->nodes
[idx
].constraint
)
1633 return idx
; /* Found. */
1635 return REG_MISSING
; /* Not found. */
1638 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1639 Return the index of the new node, or REG_MISSING if insufficient storage is
1643 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1645 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1646 if (BE (dup_idx
!= REG_MISSING
, 1))
1648 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1649 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1650 dfa
->nodes
[dup_idx
].duplicated
= 1;
1652 /* Store the index of the original node. */
1653 dfa
->org_indices
[dup_idx
] = org_idx
;
1658 static reg_errcode_t
1659 calc_inveclosure (re_dfa_t
*dfa
)
1663 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1664 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1666 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1668 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1669 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1671 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1680 /* Calculate "eclosure" for all the node in DFA. */
1682 static reg_errcode_t
1683 calc_eclosure (re_dfa_t
*dfa
)
1688 assert (dfa
->nodes_len
> 0);
1691 /* For each nodes, calculate epsilon closure. */
1692 for (node_idx
= 0; ; ++node_idx
)
1695 re_node_set eclosure_elem
;
1696 if (node_idx
== dfa
->nodes_len
)
1705 assert (dfa
->eclosures
[node_idx
].nelem
!= REG_MISSING
);
1708 /* If we have already calculated, skip it. */
1709 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1711 /* Calculate epsilon closure of 'node_idx'. */
1712 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1713 if (BE (err
!= REG_NOERROR
, 0))
1716 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1719 re_node_set_free (&eclosure_elem
);
1725 /* Calculate epsilon closure of NODE. */
1727 static reg_errcode_t
1728 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1732 re_node_set eclosure
;
1734 bool incomplete
= false;
1735 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1736 if (BE (err
!= REG_NOERROR
, 0))
1739 /* This indicates that we are calculating this node now.
1740 We reference this value to avoid infinite loop. */
1741 dfa
->eclosures
[node
].nelem
= REG_MISSING
;
1743 /* If the current node has constraints, duplicate all nodes
1744 since they must inherit the constraints. */
1745 if (dfa
->nodes
[node
].constraint
1746 && dfa
->edests
[node
].nelem
1747 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1749 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1750 dfa
->nodes
[node
].constraint
);
1751 if (BE (err
!= REG_NOERROR
, 0))
1755 /* Expand each epsilon destination nodes. */
1756 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1757 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1759 re_node_set eclosure_elem
;
1760 Idx edest
= dfa
->edests
[node
].elems
[i
];
1761 /* If calculating the epsilon closure of 'edest' is in progress,
1762 return intermediate result. */
1763 if (dfa
->eclosures
[edest
].nelem
== REG_MISSING
)
1768 /* If we haven't calculated the epsilon closure of 'edest' yet,
1769 calculate now. Otherwise use calculated epsilon closure. */
1770 if (dfa
->eclosures
[edest
].nelem
== 0)
1772 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1773 if (BE (err
!= REG_NOERROR
, 0))
1777 eclosure_elem
= dfa
->eclosures
[edest
];
1778 /* Merge the epsilon closure of 'edest'. */
1779 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1780 if (BE (err
!= REG_NOERROR
, 0))
1782 /* If the epsilon closure of 'edest' is incomplete,
1783 the epsilon closure of this node is also incomplete. */
1784 if (dfa
->eclosures
[edest
].nelem
== 0)
1787 re_node_set_free (&eclosure_elem
);
1791 /* An epsilon closure includes itself. */
1792 ok
= re_node_set_insert (&eclosure
, node
);
1795 if (incomplete
&& !root
)
1796 dfa
->eclosures
[node
].nelem
= 0;
1798 dfa
->eclosures
[node
] = eclosure
;
1799 *new_set
= eclosure
;
1803 /* Functions for token which are used in the parser. */
1805 /* Fetch a token from INPUT.
1806 We must not use this function inside bracket expressions. */
1810 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1812 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1815 /* Peek a token from INPUT, and return the length of the token.
1816 We must not use this function inside bracket expressions. */
1820 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1824 if (re_string_eoi (input
))
1826 token
->type
= END_OF_RE
;
1830 c
= re_string_peek_byte (input
, 0);
1833 token
->word_char
= 0;
1834 #ifdef RE_ENABLE_I18N
1835 token
->mb_partial
= 0;
1836 if (input
->mb_cur_max
> 1 &&
1837 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1839 token
->type
= CHARACTER
;
1840 token
->mb_partial
= 1;
1847 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1849 token
->type
= BACK_SLASH
;
1853 c2
= re_string_peek_byte_case (input
, 1);
1855 token
->type
= CHARACTER
;
1856 #ifdef RE_ENABLE_I18N
1857 if (input
->mb_cur_max
> 1)
1859 wint_t wc
= re_string_wchar_at (input
,
1860 re_string_cur_idx (input
) + 1);
1861 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1865 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1870 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1871 token
->type
= OP_ALT
;
1873 case '1': case '2': case '3': case '4': case '5':
1874 case '6': case '7': case '8': case '9':
1875 if (!(syntax
& RE_NO_BK_REFS
))
1877 token
->type
= OP_BACK_REF
;
1878 token
->opr
.idx
= c2
- '1';
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= ANCHOR
;
1885 token
->opr
.ctx_type
= WORD_FIRST
;
1889 if (!(syntax
& RE_NO_GNU_OPS
))
1891 token
->type
= ANCHOR
;
1892 token
->opr
.ctx_type
= WORD_LAST
;
1896 if (!(syntax
& RE_NO_GNU_OPS
))
1898 token
->type
= ANCHOR
;
1899 token
->opr
.ctx_type
= WORD_DELIM
;
1903 if (!(syntax
& RE_NO_GNU_OPS
))
1905 token
->type
= ANCHOR
;
1906 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1910 if (!(syntax
& RE_NO_GNU_OPS
))
1911 token
->type
= OP_WORD
;
1914 if (!(syntax
& RE_NO_GNU_OPS
))
1915 token
->type
= OP_NOTWORD
;
1918 if (!(syntax
& RE_NO_GNU_OPS
))
1919 token
->type
= OP_SPACE
;
1922 if (!(syntax
& RE_NO_GNU_OPS
))
1923 token
->type
= OP_NOTSPACE
;
1926 if (!(syntax
& RE_NO_GNU_OPS
))
1928 token
->type
= ANCHOR
;
1929 token
->opr
.ctx_type
= BUF_FIRST
;
1933 if (!(syntax
& RE_NO_GNU_OPS
))
1935 token
->type
= ANCHOR
;
1936 token
->opr
.ctx_type
= BUF_LAST
;
1940 if (!(syntax
& RE_NO_BK_PARENS
))
1941 token
->type
= OP_OPEN_SUBEXP
;
1944 if (!(syntax
& RE_NO_BK_PARENS
))
1945 token
->type
= OP_CLOSE_SUBEXP
;
1948 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1949 token
->type
= OP_DUP_PLUS
;
1952 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1953 token
->type
= OP_DUP_QUESTION
;
1956 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1957 token
->type
= OP_OPEN_DUP_NUM
;
1960 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1961 token
->type
= OP_CLOSE_DUP_NUM
;
1969 token
->type
= CHARACTER
;
1970 #ifdef RE_ENABLE_I18N
1971 if (input
->mb_cur_max
> 1)
1973 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1974 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1978 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1983 if (syntax
& RE_NEWLINE_ALT
)
1984 token
->type
= OP_ALT
;
1987 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1988 token
->type
= OP_ALT
;
1991 token
->type
= OP_DUP_ASTERISK
;
1994 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1995 token
->type
= OP_DUP_PLUS
;
1998 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1999 token
->type
= OP_DUP_QUESTION
;
2002 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
2003 token
->type
= OP_OPEN_DUP_NUM
;
2006 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
2007 token
->type
= OP_CLOSE_DUP_NUM
;
2010 if (syntax
& RE_NO_BK_PARENS
)
2011 token
->type
= OP_OPEN_SUBEXP
;
2014 if (syntax
& RE_NO_BK_PARENS
)
2015 token
->type
= OP_CLOSE_SUBEXP
;
2018 token
->type
= OP_OPEN_BRACKET
;
2021 token
->type
= OP_PERIOD
;
2024 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
2025 re_string_cur_idx (input
) != 0)
2027 char prev
= re_string_peek_byte (input
, -1);
2028 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
2031 token
->type
= ANCHOR
;
2032 token
->opr
.ctx_type
= LINE_FIRST
;
2035 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2036 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2039 re_string_skip_bytes (input
, 1);
2040 peek_token (&next
, input
, syntax
);
2041 re_string_skip_bytes (input
, -1);
2042 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2045 token
->type
= ANCHOR
;
2046 token
->opr
.ctx_type
= LINE_LAST
;
2054 /* Peek a token from INPUT, and return the length of the token.
2055 We must not use this function out of bracket expressions. */
2059 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2062 if (re_string_eoi (input
))
2064 token
->type
= END_OF_RE
;
2067 c
= re_string_peek_byte (input
, 0);
2070 #ifdef RE_ENABLE_I18N
2071 if (input
->mb_cur_max
> 1 &&
2072 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2074 token
->type
= CHARACTER
;
2077 #endif /* RE_ENABLE_I18N */
2079 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2080 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2082 /* In this case, '\' escape a character. */
2084 re_string_skip_bytes (input
, 1);
2085 c2
= re_string_peek_byte (input
, 0);
2087 token
->type
= CHARACTER
;
2090 if (c
== '[') /* '[' is a special char in a bracket exps. */
2094 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2095 c2
= re_string_peek_byte (input
, 1);
2103 token
->type
= OP_OPEN_COLL_ELEM
;
2106 token
->type
= OP_OPEN_EQUIV_CLASS
;
2109 if (syntax
& RE_CHAR_CLASSES
)
2111 token
->type
= OP_OPEN_CHAR_CLASS
;
2114 /* else fall through. */
2116 token
->type
= CHARACTER
;
2126 token
->type
= OP_CHARSET_RANGE
;
2129 token
->type
= OP_CLOSE_BRACKET
;
2132 token
->type
= OP_NON_MATCH_LIST
;
2135 token
->type
= CHARACTER
;
2140 /* Functions for parser. */
2142 /* Entry point of the parser.
2143 Parse the regular expression REGEXP and return the structure tree.
2144 If an error occurs, ERR is set by error code, and return NULL.
2145 This function build the following tree, from regular expression <reg_exp>:
2151 CAT means concatenation.
2152 EOR means end of regular expression. */
2155 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2158 re_dfa_t
*dfa
= preg
->buffer
;
2159 bin_tree_t
*tree
, *eor
, *root
;
2160 re_token_t current_token
;
2161 dfa
->syntax
= syntax
;
2162 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2163 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2164 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2166 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2168 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2171 if (BE (eor
== NULL
|| root
== NULL
, 0))
2179 /* This function build the following tree, from regular expression
2180 <branch1>|<branch2>:
2186 ALT means alternative, which represents the operator '|'. */
2189 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2190 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2192 re_dfa_t
*dfa
= preg
->buffer
;
2193 bin_tree_t
*tree
, *branch
= NULL
;
2194 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2195 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2196 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2199 while (token
->type
== OP_ALT
)
2201 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2202 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2203 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2205 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2206 dfa
->completed_bkref_map
= initial_bkref_map
;
2207 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2208 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2211 postorder (tree
, free_tree
, NULL
);
2214 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2218 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2219 if (BE (tree
== NULL
, 0))
2228 /* This function build the following tree, from regular expression
2235 CAT means concatenation. */
2238 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2239 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2241 bin_tree_t
*tree
, *expr
;
2242 re_dfa_t
*dfa
= preg
->buffer
;
2243 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2244 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2247 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2248 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2250 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2251 if (BE (*err
!= REG_NOERROR
&& expr
== NULL
, 0))
2254 postorder (tree
, free_tree
, NULL
);
2257 if (tree
!= NULL
&& expr
!= NULL
)
2259 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2260 if (newtree
== NULL
)
2262 postorder (expr
, free_tree
, NULL
);
2263 postorder (tree
, free_tree
, NULL
);
2269 else if (tree
== NULL
)
2271 /* Otherwise expr == NULL, we don't need to create new tree. */
2276 /* This function build the following tree, from regular expression a*:
2283 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2284 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2286 re_dfa_t
*dfa
= preg
->buffer
;
2288 switch (token
->type
)
2291 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2292 if (BE (tree
== NULL
, 0))
2297 #ifdef RE_ENABLE_I18N
2298 if (dfa
->mb_cur_max
> 1)
2300 while (!re_string_eoi (regexp
)
2301 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2303 bin_tree_t
*mbc_remain
;
2304 fetch_token (token
, regexp
, syntax
);
2305 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2306 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2307 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2316 case OP_OPEN_SUBEXP
:
2317 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2318 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2321 case OP_OPEN_BRACKET
:
2322 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2323 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2327 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2332 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2333 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2334 if (BE (tree
== NULL
, 0))
2340 dfa
->has_mb_node
= 1;
2342 case OP_OPEN_DUP_NUM
:
2343 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2349 case OP_DUP_ASTERISK
:
2351 case OP_DUP_QUESTION
:
2352 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2357 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2359 fetch_token (token
, regexp
, syntax
);
2360 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2362 /* else fall through */
2363 case OP_CLOSE_SUBEXP
:
2364 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2365 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2370 /* else fall through */
2371 case OP_CLOSE_DUP_NUM
:
2372 /* We treat it as a normal character. */
2374 /* Then we can these characters as normal characters. */
2375 token
->type
= CHARACTER
;
2376 /* mb_partial and word_char bits should be initialized already
2378 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2379 if (BE (tree
== NULL
, 0))
2386 if ((token
->opr
.ctx_type
2387 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2388 && dfa
->word_ops_used
== 0)
2389 init_word_char (dfa
);
2390 if (token
->opr
.ctx_type
== WORD_DELIM
2391 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2393 bin_tree_t
*tree_first
, *tree_last
;
2394 if (token
->opr
.ctx_type
== WORD_DELIM
)
2396 token
->opr
.ctx_type
= WORD_FIRST
;
2397 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2398 token
->opr
.ctx_type
= WORD_LAST
;
2402 token
->opr
.ctx_type
= INSIDE_WORD
;
2403 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2404 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2406 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2407 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2408 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2416 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2417 if (BE (tree
== NULL
, 0))
2423 /* We must return here, since ANCHORs can't be followed
2424 by repetition operators.
2425 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2426 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2427 fetch_token (token
, regexp
, syntax
);
2430 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2431 if (BE (tree
== NULL
, 0))
2436 if (dfa
->mb_cur_max
> 1)
2437 dfa
->has_mb_node
= 1;
2441 tree
= build_charclass_op (dfa
, regexp
->trans
,
2444 token
->type
== OP_NOTWORD
, err
);
2445 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2450 tree
= build_charclass_op (dfa
, regexp
->trans
,
2453 token
->type
== OP_NOTSPACE
, err
);
2454 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2464 /* Must not happen? */
2470 fetch_token (token
, regexp
, syntax
);
2472 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2473 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2475 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2477 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2480 postorder (tree
, free_tree
, NULL
);
2484 /* In BRE consecutive duplications are not allowed. */
2485 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2486 && (token
->type
== OP_DUP_ASTERISK
2487 || token
->type
== OP_OPEN_DUP_NUM
))
2490 postorder (tree
, free_tree
, NULL
);
2499 /* This function build the following tree, from regular expression
2507 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2508 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2510 re_dfa_t
*dfa
= preg
->buffer
;
2513 cur_nsub
= preg
->re_nsub
++;
2515 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2517 /* The subexpression may be a null string. */
2518 if (token
->type
== OP_CLOSE_SUBEXP
)
2522 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2523 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2526 postorder (tree
, free_tree
, NULL
);
2529 if (BE (*err
!= REG_NOERROR
, 0))
2533 if (cur_nsub
<= '9' - '1')
2534 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2536 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2537 if (BE (tree
== NULL
, 0))
2542 tree
->token
.opr
.idx
= cur_nsub
;
2546 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2549 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2550 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2552 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2553 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2554 re_token_t start_token
= *token
;
2556 if (token
->type
== OP_OPEN_DUP_NUM
)
2559 start
= fetch_number (regexp
, token
, syntax
);
2560 if (start
== REG_MISSING
)
2562 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2563 start
= 0; /* We treat "{,m}" as "{0,m}". */
2566 *err
= REG_BADBR
; /* <re>{} is invalid. */
2570 if (BE (start
!= REG_ERROR
, 1))
2572 /* We treat "{n}" as "{n,n}". */
2573 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2574 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2575 ? fetch_number (regexp
, token
, syntax
) : REG_ERROR
));
2577 if (BE (start
== REG_ERROR
|| end
== REG_ERROR
, 0))
2579 /* Invalid sequence. */
2580 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2582 if (token
->type
== END_OF_RE
)
2590 /* If the syntax bit is set, rollback. */
2591 re_string_set_index (regexp
, start_idx
);
2592 *token
= start_token
;
2593 token
->type
= CHARACTER
;
2594 /* mb_partial and word_char bits should be already initialized by
2599 if (BE ((end
!= REG_MISSING
&& start
> end
)
2600 || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2602 /* First number greater than second. */
2607 if (BE (RE_DUP_MAX
< (end
== REG_MISSING
? start
: end
), 0))
2615 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2616 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : REG_MISSING
;
2619 fetch_token (token
, regexp
, syntax
);
2621 if (BE (elem
== NULL
, 0))
2623 if (BE (start
== 0 && end
== 0, 0))
2625 postorder (elem
, free_tree
, NULL
);
2629 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2630 if (BE (start
> 0, 0))
2633 for (i
= 2; i
<= start
; ++i
)
2635 elem
= duplicate_tree (elem
, dfa
);
2636 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2637 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2638 goto parse_dup_op_espace
;
2644 /* Duplicate ELEM before it is marked optional. */
2645 elem
= duplicate_tree (elem
, dfa
);
2646 if (BE (elem
== NULL
, 0))
2647 goto parse_dup_op_espace
;
2653 if (elem
->token
.type
== SUBEXP
)
2655 uintptr_t subidx
= elem
->token
.opr
.idx
;
2656 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2659 tree
= create_tree (dfa
, elem
, NULL
,
2660 (end
== REG_MISSING
? OP_DUP_ASTERISK
: OP_ALT
));
2661 if (BE (tree
== NULL
, 0))
2662 goto parse_dup_op_espace
;
2664 /* From gnulib's "intprops.h":
2665 True if the arithmetic type T is signed. */
2666 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2668 /* This loop is actually executed only when end != REG_MISSING,
2669 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2670 already created the start+1-th copy. */
2671 if (TYPE_SIGNED (Idx
) || end
!= REG_MISSING
)
2672 for (i
= start
+ 2; i
<= end
; ++i
)
2674 elem
= duplicate_tree (elem
, dfa
);
2675 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2676 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2677 goto parse_dup_op_espace
;
2679 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2680 if (BE (tree
== NULL
, 0))
2681 goto parse_dup_op_espace
;
2685 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2689 parse_dup_op_espace
:
2694 /* Size of the names for collating symbol/equivalence_class/character_class.
2695 I'm not sure, but maybe enough. */
2696 #define BRACKET_NAME_BUF_SIZE 32
2699 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700 Build the range expression which starts from START_ELEM, and ends
2701 at END_ELEM. The result are written to MBCSET and SBCSET.
2702 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703 mbcset->range_ends, is a pointer argument since we may
2706 static reg_errcode_t
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax
,
2711 re_charset_t
*mbcset
,
2713 const bracket_elem_t
*start_elem
,
2714 const bracket_elem_t
*end_elem
)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax
,
2718 const bracket_elem_t
*start_elem
,
2719 const bracket_elem_t
*end_elem
)
2720 # endif /* not RE_ENABLE_I18N */
2722 unsigned int start_ch
, end_ch
;
2723 /* Equivalence Classes and Character Classes can't be a range start/end. */
2724 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2725 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2729 /* We can handle no multi character collating elements without libc
2731 if (BE ((start_elem
->type
== COLL_SYM
2732 && strlen ((char *) start_elem
->opr
.name
) > 1)
2733 || (end_elem
->type
== COLL_SYM
2734 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2735 return REG_ECOLLATE
;
2737 # ifdef RE_ENABLE_I18N
2743 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2744 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2746 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2747 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2749 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2750 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2751 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2752 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2753 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2754 return REG_ECOLLATE
;
2755 else if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_wc
> end_wc
, 0))
2758 /* Got valid collation sequence values, add them as a new entry.
2759 However, for !_LIBC we have no collation elements: if the
2760 character set is single byte, the single byte character set
2761 that we build below suffices. parse_bracket_exp passes
2762 no MBCSET if dfa->mb_cur_max == 1. */
2765 /* Check the space of the arrays. */
2766 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2768 /* There is not enough space, need realloc. */
2769 wchar_t *new_array_start
, *new_array_end
;
2772 /* +1 in case of mbcset->nranges is 0. */
2773 new_nranges
= 2 * mbcset
->nranges
+ 1;
2774 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2775 are NULL if *range_alloc == 0. */
2776 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2778 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2781 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2784 mbcset
->range_starts
= new_array_start
;
2785 mbcset
->range_ends
= new_array_end
;
2786 *range_alloc
= new_nranges
;
2789 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2790 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2793 /* Build the table for single byte characters. */
2794 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2796 if (start_wc
<= wc
&& wc
<= end_wc
)
2797 bitset_set (sbcset
, wc
);
2800 # else /* not RE_ENABLE_I18N */
2803 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2804 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2806 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2807 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2809 if (start_ch
> end_ch
)
2811 /* Build the table for single byte characters. */
2812 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2813 if (start_ch
<= ch
&& ch
<= end_ch
)
2814 bitset_set (sbcset
, ch
);
2816 # endif /* not RE_ENABLE_I18N */
2819 #endif /* not _LIBC */
2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2823 Build the collating element which is represented by NAME.
2824 The result are written to MBCSET and SBCSET.
2825 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2826 pointer argument since we may update it. */
2828 static reg_errcode_t
2830 # ifdef RE_ENABLE_I18N
2831 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2832 Idx
*coll_sym_alloc
, const unsigned char *name
)
2833 # else /* not RE_ENABLE_I18N */
2834 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2835 # endif /* not RE_ENABLE_I18N */
2837 size_t name_len
= strlen ((const char *) name
);
2838 if (BE (name_len
!= 1, 0))
2839 return REG_ECOLLATE
;
2842 bitset_set (sbcset
, name
[0]);
2846 #endif /* not _LIBC */
2848 /* This function parse bracket expression like "[abc]", "[a-c]",
2852 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2853 reg_syntax_t syntax
, reg_errcode_t
*err
)
2856 const unsigned char *collseqmb
;
2857 const char *collseqwc
;
2860 const int32_t *symb_table
;
2861 const unsigned char *extra
;
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
2869 __attribute__ ((always_inline
))
2870 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2874 for (elem
= 0; elem
< table_size
; elem
++)
2875 if (symb_table
[2 * elem
] != 0)
2877 int32_t idx
= symb_table
[2 * elem
+ 1];
2878 /* Skip the name of collating element name. */
2879 idx
+= 1 + extra
[idx
];
2880 if (/* Compare the length of the name. */
2881 name_len
== extra
[idx
]
2882 /* Compare the name. */
2883 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2884 /* Yep, this is the entry. */
2890 /* Local function for parse_bracket_exp used in _LIBC environment.
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2894 auto inline unsigned int
2895 __attribute__ ((always_inline
))
2896 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2898 if (br_elem
->type
== SB_CHAR
)
2901 if (MB_CUR_MAX == 1)
2904 return collseqmb
[br_elem
->opr
.ch
];
2907 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2908 return __collseq_table_lookup (collseqwc
, wc
);
2911 else if (br_elem
->type
== MB_CHAR
)
2914 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2916 else if (br_elem
->type
== COLL_SYM
)
2918 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2922 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2926 /* We found the entry. */
2927 idx
= symb_table
[2 * elem
+ 1];
2928 /* Skip the name of collating element name. */
2929 idx
+= 1 + extra
[idx
];
2930 /* Skip the byte sequence of the collating element. */
2931 idx
+= 1 + extra
[idx
];
2932 /* Adjust for the alignment. */
2933 idx
= (idx
+ 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx
+= sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx
+= sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra
+ idx
));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra
+ idx
);
2942 else if (sym_name_len
== 1)
2944 /* No valid character. Match it as a single byte
2946 return collseqmb
[br_elem
->opr
.name
[0]];
2949 else if (sym_name_len
== 1)
2950 return collseqmb
[br_elem
->opr
.name
[0]];
2955 /* Local function for parse_bracket_exp used in _LIBC environment.
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959 mbcset->range_ends, is a pointer argument since we may
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline
))
2964 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2965 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2968 uint32_t start_collseq
;
2969 uint32_t end_collseq
;
2971 /* Equivalence Classes and Character Classes can't be a range
2973 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2974 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2978 /* FIXME: Implement rational ranges here, too. */
2979 start_collseq
= lookup_collation_sequence_value (start_elem
);
2980 end_collseq
= lookup_collation_sequence_value (end_elem
);
2981 /* Check start/end collation sequence values. */
2982 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2983 return REG_ECOLLATE
;
2984 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2987 /* Got valid collation sequence values, add them as a new entry.
2988 However, if we have no collation elements, and the character set
2989 is single byte, the single byte character set that we
2990 build below suffices. */
2991 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2993 /* Check the space of the arrays. */
2994 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2996 /* There is not enough space, need realloc. */
2997 uint32_t *new_array_start
;
2998 uint32_t *new_array_end
;
3001 /* +1 in case of mbcset->nranges is 0. */
3002 new_nranges
= 2 * mbcset
->nranges
+ 1;
3003 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
3005 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
3008 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
3011 mbcset
->range_starts
= new_array_start
;
3012 mbcset
->range_ends
= new_array_end
;
3013 *range_alloc
= new_nranges
;
3016 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
3017 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
3020 /* Build the table for single byte characters. */
3021 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3023 uint32_t ch_collseq
;
3025 if (MB_CUR_MAX == 1)
3028 ch_collseq
= collseqmb
[ch
];
3030 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3031 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3032 bitset_set (sbcset
, ch
);
3037 /* Local function for parse_bracket_exp used in _LIBC environment.
3038 Build the collating element which is represented by NAME.
3039 The result are written to MBCSET and SBCSET.
3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041 pointer argument since we may update it. */
3043 auto inline reg_errcode_t
3044 __attribute__ ((always_inline
))
3045 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3046 Idx
*coll_sym_alloc
, const unsigned char *name
)
3049 size_t name_len
= strlen ((const char *) name
);
3052 elem
= seek_collating_symbol_entry (name
, name_len
);
3055 /* We found the entry. */
3056 idx
= symb_table
[2 * elem
+ 1];
3057 /* Skip the name of collating element name. */
3058 idx
+= 1 + extra
[idx
];
3060 else if (name_len
== 1)
3062 /* No valid character, treat it as a normal
3064 bitset_set (sbcset
, name
[0]);
3068 return REG_ECOLLATE
;
3070 /* Got valid collation sequence, add it as a new entry. */
3071 /* Check the space of the arrays. */
3072 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->ncoll_syms is 0. */
3076 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3077 /* Use realloc since mbcset->coll_syms is NULL
3079 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3080 new_coll_sym_alloc
);
3081 if (BE (new_coll_syms
== NULL
, 0))
3083 mbcset
->coll_syms
= new_coll_syms
;
3084 *coll_sym_alloc
= new_coll_sym_alloc
;
3086 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3091 if (BE (name_len
!= 1, 0))
3092 return REG_ECOLLATE
;
3095 bitset_set (sbcset
, name
[0]);
3102 re_token_t br_token
;
3103 re_bitset_ptr_t sbcset
;
3104 #ifdef RE_ENABLE_I18N
3105 re_charset_t
*mbcset
;
3106 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3107 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3108 #endif /* not RE_ENABLE_I18N */
3109 bool non_match
= false;
3110 bin_tree_t
*work_tree
;
3112 bool first_round
= true;
3114 collseqmb
= (const unsigned char *)
3115 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3116 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3122 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3123 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3124 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3125 _NL_COLLATE_SYMB_TABLEMB
);
3126 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3127 _NL_COLLATE_SYMB_EXTRAMB
);
3130 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3131 #ifdef RE_ENABLE_I18N
3132 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3133 #endif /* RE_ENABLE_I18N */
3134 #ifdef RE_ENABLE_I18N
3135 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3137 if (BE (sbcset
== NULL
, 0))
3138 #endif /* RE_ENABLE_I18N */
3141 #ifdef RE_ENABLE_I18N
3148 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3149 if (BE (token
->type
== END_OF_RE
, 0))
3152 goto parse_bracket_exp_free_return
;
3154 if (token
->type
== OP_NON_MATCH_LIST
)
3156 #ifdef RE_ENABLE_I18N
3157 mbcset
->non_match
= 1;
3158 #endif /* not RE_ENABLE_I18N */
3160 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3161 bitset_set (sbcset
, '\n');
3162 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3163 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3164 if (BE (token
->type
== END_OF_RE
, 0))
3167 goto parse_bracket_exp_free_return
;
3171 /* We treat the first ']' as a normal character. */
3172 if (token
->type
== OP_CLOSE_BRACKET
)
3173 token
->type
= CHARACTER
;
3177 bracket_elem_t start_elem
, end_elem
;
3178 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3179 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3182 bool is_range_exp
= false;
3185 start_elem
.opr
.name
= start_name_buf
;
3186 start_elem
.type
= COLL_SYM
;
3187 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3188 syntax
, first_round
);
3189 if (BE (ret
!= REG_NOERROR
, 0))
3192 goto parse_bracket_exp_free_return
;
3194 first_round
= false;
3196 /* Get information about the next token. We need it in any case. */
3197 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3199 /* Do not check for ranges if we know they are not allowed. */
3200 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3202 if (BE (token
->type
== END_OF_RE
, 0))
3205 goto parse_bracket_exp_free_return
;
3207 if (token
->type
== OP_CHARSET_RANGE
)
3209 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3210 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3211 if (BE (token2
.type
== END_OF_RE
, 0))
3214 goto parse_bracket_exp_free_return
;
3216 if (token2
.type
== OP_CLOSE_BRACKET
)
3218 /* We treat the last '-' as a normal character. */
3219 re_string_skip_bytes (regexp
, -token_len
);
3220 token
->type
= CHARACTER
;
3223 is_range_exp
= true;
3227 if (is_range_exp
== true)
3229 end_elem
.opr
.name
= end_name_buf
;
3230 end_elem
.type
= COLL_SYM
;
3231 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3233 if (BE (ret
!= REG_NOERROR
, 0))
3236 goto parse_bracket_exp_free_return
;
3239 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3242 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3243 &start_elem
, &end_elem
);
3245 # ifdef RE_ENABLE_I18N
3246 *err
= build_range_exp (syntax
, sbcset
,
3247 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3248 &range_alloc
, &start_elem
, &end_elem
);
3250 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3252 #endif /* RE_ENABLE_I18N */
3253 if (BE (*err
!= REG_NOERROR
, 0))
3254 goto parse_bracket_exp_free_return
;
3258 switch (start_elem
.type
)
3261 bitset_set (sbcset
, start_elem
.opr
.ch
);
3263 #ifdef RE_ENABLE_I18N
3265 /* Check whether the array has enough space. */
3266 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3268 wchar_t *new_mbchars
;
3269 /* Not enough, realloc it. */
3270 /* +1 in case of mbcset->nmbchars is 0. */
3271 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3272 /* Use realloc since array is NULL if *alloc == 0. */
3273 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3275 if (BE (new_mbchars
== NULL
, 0))
3276 goto parse_bracket_exp_espace
;
3277 mbcset
->mbchars
= new_mbchars
;
3279 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3281 #endif /* RE_ENABLE_I18N */
3283 *err
= build_equiv_class (sbcset
,
3284 #ifdef RE_ENABLE_I18N
3285 mbcset
, &equiv_class_alloc
,
3286 #endif /* RE_ENABLE_I18N */
3287 start_elem
.opr
.name
);
3288 if (BE (*err
!= REG_NOERROR
, 0))
3289 goto parse_bracket_exp_free_return
;
3292 *err
= build_collating_symbol (sbcset
,
3293 #ifdef RE_ENABLE_I18N
3294 mbcset
, &coll_sym_alloc
,
3295 #endif /* RE_ENABLE_I18N */
3296 start_elem
.opr
.name
);
3297 if (BE (*err
!= REG_NOERROR
, 0))
3298 goto parse_bracket_exp_free_return
;
3301 *err
= build_charclass (regexp
->trans
, sbcset
,
3302 #ifdef RE_ENABLE_I18N
3303 mbcset
, &char_class_alloc
,
3304 #endif /* RE_ENABLE_I18N */
3305 (const char *) start_elem
.opr
.name
,
3307 if (BE (*err
!= REG_NOERROR
, 0))
3308 goto parse_bracket_exp_free_return
;
3315 if (BE (token
->type
== END_OF_RE
, 0))
3318 goto parse_bracket_exp_free_return
;
3320 if (token
->type
== OP_CLOSE_BRACKET
)
3324 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3326 /* If it is non-matching list. */
3328 bitset_not (sbcset
);
3330 #ifdef RE_ENABLE_I18N
3331 /* Ensure only single byte characters are set. */
3332 if (dfa
->mb_cur_max
> 1)
3333 bitset_mask (sbcset
, dfa
->sb_char
);
3335 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3336 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3337 || mbcset
->non_match
)))
3339 bin_tree_t
*mbc_tree
;
3341 /* Build a tree for complex bracket. */
3342 dfa
->has_mb_node
= 1;
3343 br_token
.type
= COMPLEX_BRACKET
;
3344 br_token
.opr
.mbcset
= mbcset
;
3345 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3346 if (BE (mbc_tree
== NULL
, 0))
3347 goto parse_bracket_exp_espace
;
3348 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3349 if (sbcset
[sbc_idx
])
3351 /* If there are no bits set in sbcset, there is no point
3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3353 if (sbc_idx
< BITSET_WORDS
)
3355 /* Build a tree for simple bracket. */
3356 br_token
.type
= SIMPLE_BRACKET
;
3357 br_token
.opr
.sbcset
= sbcset
;
3358 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3359 if (BE (work_tree
== NULL
, 0))
3360 goto parse_bracket_exp_espace
;
3362 /* Then join them by ALT node. */
3363 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3364 if (BE (work_tree
== NULL
, 0))
3365 goto parse_bracket_exp_espace
;
3370 work_tree
= mbc_tree
;
3374 #endif /* not RE_ENABLE_I18N */
3376 #ifdef RE_ENABLE_I18N
3377 free_charset (mbcset
);
3379 /* Build a tree for simple bracket. */
3380 br_token
.type
= SIMPLE_BRACKET
;
3381 br_token
.opr
.sbcset
= sbcset
;
3382 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3383 if (BE (work_tree
== NULL
, 0))
3384 goto parse_bracket_exp_espace
;
3388 parse_bracket_exp_espace
:
3390 parse_bracket_exp_free_return
:
3392 #ifdef RE_ENABLE_I18N
3393 free_charset (mbcset
);
3394 #endif /* RE_ENABLE_I18N */
3398 /* Parse an element in the bracket expression. */
3400 static reg_errcode_t
3401 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3402 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3403 reg_syntax_t syntax
, bool accept_hyphen
)
3405 #ifdef RE_ENABLE_I18N
3407 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3408 if (cur_char_size
> 1)
3410 elem
->type
= MB_CHAR
;
3411 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3412 re_string_skip_bytes (regexp
, cur_char_size
);
3415 #endif /* RE_ENABLE_I18N */
3416 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3417 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3418 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3419 return parse_bracket_symbol (elem
, regexp
, token
);
3420 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3422 /* A '-' must only appear as anything but a range indicator before
3423 the closing bracket. Everything else is an error. */
3425 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3426 if (token2
.type
!= OP_CLOSE_BRACKET
)
3427 /* The actual error value is not standardized since this whole
3428 case is undefined. But ERANGE makes good sense. */
3431 elem
->type
= SB_CHAR
;
3432 elem
->opr
.ch
= token
->opr
.c
;
3436 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3437 such as [:<character_class>:], [.<collating_element>.], and
3438 [=<equivalent_class>=]. */
3440 static reg_errcode_t
3441 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3444 unsigned char ch
, delim
= token
->opr
.c
;
3446 if (re_string_eoi(regexp
))
3450 if (i
>= BRACKET_NAME_BUF_SIZE
)
3452 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3453 ch
= re_string_fetch_byte_case (regexp
);
3455 ch
= re_string_fetch_byte (regexp
);
3456 if (re_string_eoi(regexp
))
3458 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3460 elem
->opr
.name
[i
] = ch
;
3462 re_string_skip_bytes (regexp
, 1);
3463 elem
->opr
.name
[i
] = '\0';
3464 switch (token
->type
)
3466 case OP_OPEN_COLL_ELEM
:
3467 elem
->type
= COLL_SYM
;
3469 case OP_OPEN_EQUIV_CLASS
:
3470 elem
->type
= EQUIV_CLASS
;
3472 case OP_OPEN_CHAR_CLASS
:
3473 elem
->type
= CHAR_CLASS
;
3481 /* Helper function for parse_bracket_exp.
3482 Build the equivalence class which is represented by NAME.
3483 The result are written to MBCSET and SBCSET.
3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485 is a pointer argument since we may update it. */
3487 static reg_errcode_t
3488 #ifdef RE_ENABLE_I18N
3489 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3490 Idx
*equiv_class_alloc
, const unsigned char *name
)
3491 #else /* not RE_ENABLE_I18N */
3492 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3493 #endif /* not RE_ENABLE_I18N */
3496 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3499 const int32_t *table
, *indirect
;
3500 const unsigned char *weights
, *extra
, *cp
;
3501 unsigned char char_buf
[2];
3505 /* Calculate the index for equivalence class. */
3507 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3508 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3509 _NL_COLLATE_WEIGHTMB
);
3510 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3511 _NL_COLLATE_EXTRAMB
);
3512 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3513 _NL_COLLATE_INDIRECTMB
);
3514 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3515 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE
;
3519 /* Build single byte matching table for this equivalence class. */
3520 len
= weights
[idx1
& 0xffffff];
3521 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3525 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3530 /* This isn't a valid character. */
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
3534 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3538 while (cnt
<= len
&&
3539 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3540 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3544 bitset_set (sbcset
, ch
);
3547 /* Check whether the array has enough space. */
3548 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nequiv_classes is 0. */
3552 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3553 /* Use realloc since the array is NULL if *alloc == 0. */
3554 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3556 new_equiv_class_alloc
);
3557 if (BE (new_equiv_classes
== NULL
, 0))
3559 mbcset
->equiv_classes
= new_equiv_classes
;
3560 *equiv_class_alloc
= new_equiv_class_alloc
;
3562 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3567 if (BE (strlen ((const char *) name
) != 1, 0))
3568 return REG_ECOLLATE
;
3569 bitset_set (sbcset
, *name
);
3574 /* Helper function for parse_bracket_exp.
3575 Build the character class which is represented by NAME.
3576 The result are written to MBCSET and SBCSET.
3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578 is a pointer argument since we may update it. */
3580 static reg_errcode_t
3581 #ifdef RE_ENABLE_I18N
3582 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3583 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3584 const char *class_name
, reg_syntax_t syntax
)
3585 #else /* not RE_ENABLE_I18N */
3586 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3587 const char *class_name
, reg_syntax_t syntax
)
3588 #endif /* not RE_ENABLE_I18N */
3591 const char *name
= class_name
;
3593 /* In case of REG_ICASE "upper" and "lower" match the both of
3594 upper and lower cases. */
3595 if ((syntax
& RE_ICASE
)
3596 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3599 #ifdef RE_ENABLE_I18N
3600 /* Check the space of the arrays. */
3601 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3603 /* Not enough, realloc it. */
3604 /* +1 in case of mbcset->nchar_classes is 0. */
3605 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3606 /* Use realloc since array is NULL if *alloc == 0. */
3607 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3608 new_char_class_alloc
);
3609 if (BE (new_char_classes
== NULL
, 0))
3611 mbcset
->char_classes
= new_char_classes
;
3612 *char_class_alloc
= new_char_class_alloc
;
3614 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3615 #endif /* RE_ENABLE_I18N */
3617 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3619 if (BE (trans != NULL, 0)) \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, trans[i]); \
3627 for (i = 0; i < SBC_MAX; ++i) \
3628 if (ctype_func (i)) \
3629 bitset_set (sbcset, i); \
3633 if (strcmp (name
, "alnum") == 0)
3634 BUILD_CHARCLASS_LOOP (isalnum
);
3635 else if (strcmp (name
, "cntrl") == 0)
3636 BUILD_CHARCLASS_LOOP (iscntrl
);
3637 else if (strcmp (name
, "lower") == 0)
3638 BUILD_CHARCLASS_LOOP (islower
);
3639 else if (strcmp (name
, "space") == 0)
3640 BUILD_CHARCLASS_LOOP (isspace
);
3641 else if (strcmp (name
, "alpha") == 0)
3642 BUILD_CHARCLASS_LOOP (isalpha
);
3643 else if (strcmp (name
, "digit") == 0)
3644 BUILD_CHARCLASS_LOOP (isdigit
);
3645 else if (strcmp (name
, "print") == 0)
3646 BUILD_CHARCLASS_LOOP (isprint
);
3647 else if (strcmp (name
, "upper") == 0)
3648 BUILD_CHARCLASS_LOOP (isupper
);
3649 else if (strcmp (name
, "blank") == 0)
3650 BUILD_CHARCLASS_LOOP (isblank
);
3651 else if (strcmp (name
, "graph") == 0)
3652 BUILD_CHARCLASS_LOOP (isgraph
);
3653 else if (strcmp (name
, "punct") == 0)
3654 BUILD_CHARCLASS_LOOP (ispunct
);
3655 else if (strcmp (name
, "xdigit") == 0)
3656 BUILD_CHARCLASS_LOOP (isxdigit
);
3664 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3665 const char *class_name
,
3666 const char *extra
, bool non_match
,
3669 re_bitset_ptr_t sbcset
;
3670 #ifdef RE_ENABLE_I18N
3671 re_charset_t
*mbcset
;
3673 #endif /* not RE_ENABLE_I18N */
3675 re_token_t br_token
;
3678 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3679 #ifdef RE_ENABLE_I18N
3680 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3681 #endif /* RE_ENABLE_I18N */
3683 #ifdef RE_ENABLE_I18N
3684 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3685 #else /* not RE_ENABLE_I18N */
3686 if (BE (sbcset
== NULL
, 0))
3687 #endif /* not RE_ENABLE_I18N */
3695 #ifdef RE_ENABLE_I18N
3696 mbcset
->non_match
= 1;
3697 #endif /* not RE_ENABLE_I18N */
3700 /* We don't care the syntax in this case. */
3701 ret
= build_charclass (trans
, sbcset
,
3702 #ifdef RE_ENABLE_I18N
3704 #endif /* RE_ENABLE_I18N */
3707 if (BE (ret
!= REG_NOERROR
, 0))
3710 #ifdef RE_ENABLE_I18N
3711 free_charset (mbcset
);
3712 #endif /* RE_ENABLE_I18N */
3716 /* \w match '_' also. */
3717 for (; *extra
; extra
++)
3718 bitset_set (sbcset
, *extra
);
3720 /* If it is non-matching list. */
3722 bitset_not (sbcset
);
3724 #ifdef RE_ENABLE_I18N
3725 /* Ensure only single byte characters are set. */
3726 if (dfa
->mb_cur_max
> 1)
3727 bitset_mask (sbcset
, dfa
->sb_char
);
3730 /* Build a tree for simple bracket. */
3731 br_token
.type
= SIMPLE_BRACKET
;
3732 br_token
.opr
.sbcset
= sbcset
;
3733 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3734 if (BE (tree
== NULL
, 0))
3735 goto build_word_op_espace
;
3737 #ifdef RE_ENABLE_I18N
3738 if (dfa
->mb_cur_max
> 1)
3740 bin_tree_t
*mbc_tree
;
3741 /* Build a tree for complex bracket. */
3742 br_token
.type
= COMPLEX_BRACKET
;
3743 br_token
.opr
.mbcset
= mbcset
;
3744 dfa
->has_mb_node
= 1;
3745 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3746 if (BE (mbc_tree
== NULL
, 0))
3747 goto build_word_op_espace
;
3748 /* Then join them by ALT node. */
3749 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3750 if (BE (mbc_tree
!= NULL
, 1))
3755 free_charset (mbcset
);
3758 #else /* not RE_ENABLE_I18N */
3760 #endif /* not RE_ENABLE_I18N */
3762 build_word_op_espace
:
3764 #ifdef RE_ENABLE_I18N
3765 free_charset (mbcset
);
3766 #endif /* RE_ENABLE_I18N */
3771 /* This is intended for the expressions like "a{1,3}".
3772 Fetch a number from 'input', and return the number.
3773 Return REG_MISSING if the number field is empty like "{,1}".
3774 Return RE_DUP_MAX + 1 if the number field is too large.
3775 Return REG_ERROR if an error occurred. */
3778 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3780 Idx num
= REG_MISSING
;
3784 fetch_token (token
, input
, syntax
);
3786 if (BE (token
->type
== END_OF_RE
, 0))
3788 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3790 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
3791 || num
== REG_ERROR
)
3793 : num
== REG_MISSING
3795 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3800 #ifdef RE_ENABLE_I18N
3802 free_charset (re_charset_t
*cset
)
3804 re_free (cset
->mbchars
);
3806 re_free (cset
->coll_syms
);
3807 re_free (cset
->equiv_classes
);
3808 re_free (cset
->range_starts
);
3809 re_free (cset
->range_ends
);
3811 re_free (cset
->char_classes
);
3814 #endif /* RE_ENABLE_I18N */
3816 /* Functions for binary tree operation. */
3818 /* Create a tree node. */
3821 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3822 re_token_type_t type
)
3826 return create_token_tree (dfa
, left
, right
, &t
);
3830 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3831 const re_token_t
*token
)
3834 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3836 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3838 if (storage
== NULL
)
3840 storage
->next
= dfa
->str_tree_storage
;
3841 dfa
->str_tree_storage
= storage
;
3842 dfa
->str_tree_storage_idx
= 0;
3844 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3846 tree
->parent
= NULL
;
3848 tree
->right
= right
;
3849 tree
->token
= *token
;
3850 tree
->token
.duplicated
= 0;
3851 tree
->token
.opt_subexp
= 0;
3854 tree
->node_idx
= REG_MISSING
;
3857 left
->parent
= tree
;
3859 right
->parent
= tree
;
3863 /* Mark the tree SRC as an optional subexpression.
3864 To be called from preorder or postorder. */
3866 static reg_errcode_t
3867 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3869 Idx idx
= (uintptr_t) extra
;
3870 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3871 node
->token
.opt_subexp
= 1;
3876 /* Free the allocated memory inside NODE. */
3879 free_token (re_token_t
*node
)
3881 #ifdef RE_ENABLE_I18N
3882 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3883 free_charset (node
->opr
.mbcset
);
3885 #endif /* RE_ENABLE_I18N */
3886 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3887 re_free (node
->opr
.sbcset
);
3890 /* Worker function for tree walking. Free the allocated memory inside NODE
3891 and its children. */
3893 static reg_errcode_t
3894 free_tree (void *extra
, bin_tree_t
*node
)
3896 free_token (&node
->token
);
3901 /* Duplicate the node SRC, and return new node. This is a preorder
3902 visit similar to the one implemented by the generic visitor, but
3903 we need more infrastructure to maintain two parallel trees --- so,
3904 it's easier to duplicate. */
3907 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3909 const bin_tree_t
*node
;
3910 bin_tree_t
*dup_root
;
3911 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3913 for (node
= root
; ; )
3915 /* Create a new tree and link it back to the current parent. */
3916 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3919 (*p_new
)->parent
= dup_node
;
3920 (*p_new
)->token
.duplicated
= 1;
3923 /* Go to the left node, or up and to the right. */
3927 p_new
= &dup_node
->left
;
3931 const bin_tree_t
*prev
= NULL
;
3932 while (node
->right
== prev
|| node
->right
== NULL
)
3935 node
= node
->parent
;
3936 dup_node
= dup_node
->parent
;
3941 p_new
= &dup_node
->right
;