1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
23 # include <locale/weight.h>
26 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
27 size_t length
, reg_syntax_t syntax
);
28 static void re_compile_fastmap_iter (regex_t
*bufp
,
29 const re_dfastate_t
*init_state
,
31 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
33 static void free_charset (re_charset_t
*cset
);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t
*preg
);
36 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static void optimize_utf8 (re_dfa_t
*dfa
);
40 static reg_errcode_t
analyze (regex_t
*preg
);
41 static reg_errcode_t
preorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
postorder (bin_tree_t
*root
,
45 reg_errcode_t (fn (void *, bin_tree_t
*)),
47 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
49 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
51 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
52 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
53 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
54 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
55 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
56 unsigned int constraint
);
57 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
58 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
60 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
61 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
63 static int peek_token (re_token_t
*token
, re_string_t
*input
,
65 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
66 reg_syntax_t syntax
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
80 re_dfa_t
*dfa
, re_token_t
*token
,
81 reg_syntax_t syntax
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
83 re_token_t
*token
, reg_syntax_t syntax
,
85 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
87 re_token_t
*token
, int token_len
,
91 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
95 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
97 int *equiv_class_alloc
,
98 const unsigned char *name
);
99 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
101 re_charset_t
*mbcset
,
102 int *char_class_alloc
,
103 const unsigned char *class_name
,
104 reg_syntax_t syntax
);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
107 const unsigned char *name
);
108 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 reg_syntax_t syntax
);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
114 RE_TRANSLATE_TYPE trans
,
115 const unsigned char *class_name
,
116 const unsigned char *extra
,
117 int non_match
, reg_errcode_t
*err
);
118 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
119 bin_tree_t
*left
, bin_tree_t
*right
,
120 re_token_type_t type
);
121 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
122 bin_tree_t
*left
, bin_tree_t
*right
,
123 const re_token_t
*token
);
124 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
125 static void free_token (re_token_t
*node
);
126 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
127 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be the same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid
[] attribute_hidden
=
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx
[] attribute_hidden
=
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
/* GNU entry point: compile PATTERN (LENGTH bytes) into BUFP using the
   global re_syntax_options.  Only part of the body is visible in this
   excerpt; the comments below describe the visible statements.  */
219 re_compile_pattern (const char *pattern
, size_t length
,
220 struct re_pattern_buffer
*bufp
)
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
/* The !! collapses the masked RE_NO_SUB bit to exactly 0 or 1 before it
   is stored in no_sub.  */
227 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
229 /* Match anchors at newline. */
230 bufp
->newline_anchor
= 1;
/* The real work is delegated to re_compile_internal, which receives the
   same global syntax bits that were consulted above.  */
232 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
/* __re_error_msgid_idx[ret] is added to the base of the __re_error_msgid
   character array to select the message for RET; gettext then
   translates it.  */
236 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
239 weak_alias (__re_compile_pattern
, re_compile_pattern
)
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options
;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
/* Replace the global re_syntax_options with SYNTAX.  The previous value
   is captured in RET first so that it is still available after the
   global has been overwritten.  */
258 re_set_syntax (reg_syntax_t syntax
)
/* Snapshot the current global syntax before it is overwritten.  */
260 reg_syntax_t ret
= re_syntax_options
;
/* Install the caller's syntax; re_compile_pattern and re_comp read this
   global when they call re_compile_internal.  */
262 re_syntax_options
= syntax
;
266 weak_alias (__re_set_syntax
, re_set_syntax
)
/* Build BUFP->fastmap from the initial states of the compiled DFA stored
   in BUFP->buffer, then mark the fastmap as accurate.  */
270 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
/* The pattern buffer's opaque `buffer' field holds the re_dfa_t.  */
272 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
273 char *fastmap
= bufp
->fastmap
;
/* Start from an all-zero map of SBC_MAX entries; the helper calls below
   only ever set entries.  */
275 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
/* Always process the context-free initial state.  */
276 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
/* The three context-specific initial states are processed only when
   they differ from init_state.  create_initial_state makes all three
   alias init_state when the initial state has no constraint, so these
   comparisons avoid repeating identical work.  */
277 if (dfa
->init_state
!= dfa
->init_state_word
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_nl
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
/* Record that the fastmap now reflects the compiled pattern
   (re_compile_internal resets this flag to 0 on every compilation).  */
283 bufp
->fastmap_accurate
= 1;
287 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
291 __attribute__ ((always_inline
))
292 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
296 fastmap
[tolower (ch
)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
303 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
306 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
308 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
309 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
311 int node
= init_state
->nodes
.elems
[node_cnt
];
312 re_token_type_t type
= dfa
->nodes
[node
].type
;
314 if (type
== CHARACTER
)
316 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
320 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
325 *p
++ = dfa
->nodes
[node
].opr
.c
;
326 while (++node
< dfa
->nodes_len
327 && dfa
->nodes
[node
].type
== CHARACTER
328 && dfa
->nodes
[node
].mb_partial
)
329 *p
++ = dfa
->nodes
[node
].opr
.c
;
330 memset (&state
, '\0', sizeof (state
));
331 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
333 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
335 re_set_fastmap (fastmap
, 0, buf
[0]);
339 else if (type
== SIMPLE_BRACKET
)
342 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
345 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
346 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
347 if (w
& ((bitset_word_t
) 1 << j
))
348 re_set_fastmap (fastmap
, icase
, ch
);
351 #ifdef RE_ENABLE_I18N
352 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
358 /* See if we have to try all bytes which start multiple collation elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
365 && (cset
->ncoll_syms
|| cset
->nranges
))
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa
->mb_cur_max
> 1
380 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
382 || cset
->nequiv_classes
390 memset (&mbs
, 0, sizeof (mbs
));
391 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
392 re_set_fastmap (fastmap
, false, (int) c
);
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i
= 0; i
< cset
->nmbchars
; ++i
)
404 memset (&state
, '\0', sizeof (state
));
405 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
406 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
407 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
409 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
411 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
416 #endif /* RE_ENABLE_I18N */
417 else if (type
== OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type
== OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type
== END_OF_RE
)
423 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
424 if (type
== END_OF_RE
)
425 bufp
->can_be_null
= 1;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we consider upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t
*__restrict preg
, const char *__restrict pattern
, int cflags
)
471 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC
);
478 /* Try to allocate space for the fastmap. */
479 preg
->fastmap
= re_malloc (char, SBC_MAX
);
480 if (BE (preg
->fastmap
== NULL
, 0))
483 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags
& REG_NEWLINE
)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax
&= ~RE_DOT_NEWLINE
;
489 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
490 /* It also changes the matching behavior. */
491 preg
->newline_anchor
= 1;
494 preg
->newline_anchor
= 0;
495 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
496 preg
->translate
= NULL
;
498 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret
== REG_ERPAREN
)
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret
== REG_NOERROR
, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg
);
512 /* Some error occurred while compiling the expression. */
513 re_free (preg
->fastmap
);
514 preg
->fastmap
= NULL
;
520 libc_hidden_def (__regcomp
)
521 weak_alias (__regcomp
, regcomp
)
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
528 regerror (int errcode
, const regex_t
*__restrict preg
, char *__restrict errbuf
,
535 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
536 / sizeof (__re_error_msgid_idx
[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
545 msg_size
= strlen (msg
) + 1; /* Includes the null. */
547 if (BE (errbuf_size
!= 0, 1))
549 if (BE (msg_size
> errbuf_size
, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
554 memcpy (errbuf
, msg
, errbuf_size
- 1);
555 errbuf
[errbuf_size
- 1] = 0;
559 memcpy (errbuf
, msg
, msg_size
);
565 weak_alias (__regerror
, regerror
)
569 #ifdef RE_ENABLE_I18N
570 /* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574 static const bitset_t utf8_sb_map
=
576 /* Set the first 128 bits. */
577 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
/* Release the memory owned by DFA: the per-node arrays, the state hash
   table, the single-byte character map, the subexpression map and the
   saved pattern string.  Some statements of the original body are not
   visible in this excerpt; the comments describe only what is shown.  */
583 free_dfa_content (re_dfa_t
*dfa
)
/* Release the contents of every node token in use (nodes_len of them).  */
588 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
589 free_token (dfa
->nodes
+ i
);
590 re_free (dfa
->nexts
);
/* Each node may own up to three node sets.  The arrays holding them are
   tested against NULL individually because init_dfa zero-fills the DFA
   and a given array may never have been allocated.  */
591 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
593 if (dfa
->eclosures
!= NULL
)
594 re_node_set_free (dfa
->eclosures
+ i
);
595 if (dfa
->inveclosures
!= NULL
)
596 re_node_set_free (dfa
->inveclosures
+ i
);
597 if (dfa
->edests
!= NULL
)
598 re_node_set_free (dfa
->edests
+ i
);
/* With the per-node sets released, free the arrays themselves.  */
600 re_free (dfa
->edests
);
601 re_free (dfa
->eclosures
);
602 re_free (dfa
->inveclosures
);
603 re_free (dfa
->nodes
);
/* Walk every bucket of the state hash table.  The bound is inclusive
   because init_dfa stores table_size - 1 in state_hash_mask.  */
605 if (dfa
->state_table
)
606 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
608 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
/* Visit each state stored in this bucket.
   NOTE(review): the statement that disposes of STATE is not visible in
   this excerpt.  */
609 for (j
= 0; j
< entry
->num
; ++j
)
611 re_dfastate_t
*state
= entry
->array
[j
];
614 re_free (entry
->array
);
616 re_free (dfa
->state_table
);
617 #ifdef RE_ENABLE_I18N
/* init_dfa may point sb_char at the static utf8_sb_map instead of heap
   memory; that shared table must not be passed to re_free.  */
618 if (dfa
->sb_char
!= utf8_sb_map
)
619 re_free (dfa
->sb_char
);
621 re_free (dfa
->subexp_map
);
/* re_str is the copy of the pattern allocated in re_compile_internal.  */
623 re_free (dfa
->re_str
);
630 /* Free dynamically allocated space used by PREG. */
/* POSIX entry point: release what PREG owns -- the compiled DFA's
   contents, the fastmap and the translate table -- and clear the
   fastmap and translate pointers.  */
633 regfree (regex_t
*preg
)
/* The compiled DFA is kept in the pattern buffer's `buffer' field.  */
635 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
/* BE (..., 1) marks the non-NULL case as the expected one; a NULL
   buffer is simply skipped.  */
636 if (BE (dfa
!= NULL
, 1))
637 free_dfa_content (dfa
);
/* Free the fastmap and reset the pointer so it cannot be reused or
   freed twice.  */
641 re_free (preg
->fastmap
);
642 preg
->fastmap
= NULL
;
/* Likewise for the translate table.  */
644 re_free (preg
->translate
);
645 preg
->translate
= NULL
;
648 libc_hidden_def (__regfree
)
649 weak_alias (__regfree
, regfree
)
652 /* Entry points compatible with 4.2 BSD regex library. We don't define
653 them unless specifically requested. */
655 #if defined _REGEX_RE_COMP || defined _LIBC
657 /* BSD has one and only one pattern buffer. */
658 static struct re_pattern_buffer re_comp_buf
;
662 /* Make these definitions weak in libc, so POSIX programs can redefine
663 these names if they don't use our functions, and still use
664 regcomp/regexec above without link errors. */
667 re_comp (const char *s
)
674 if (!re_comp_buf
.buffer
)
675 return gettext ("No previous regular expression");
679 if (re_comp_buf
.buffer
)
681 fastmap
= re_comp_buf
.fastmap
;
682 re_comp_buf
.fastmap
= NULL
;
683 __regfree (&re_comp_buf
);
684 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
685 re_comp_buf
.fastmap
= fastmap
;
688 if (re_comp_buf
.fastmap
== NULL
)
690 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
691 if (re_comp_buf
.fastmap
== NULL
)
692 return (char *) gettext (__re_error_msgid
693 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
696 /* Since 're_exec' always passes NULL for the 'regs' argument, we
697 don't need to initialize the pattern buffer fields which affect it. */
699 /* Match anchors at newlines. */
700 re_comp_buf
.newline_anchor
= 1;
702 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
707 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
708 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
712 libc_freeres_fn (free_mem
)
714 __regfree (&re_comp_buf
);
718 #endif /* _REGEX_RE_COMP */
720 /* Internal entry point.
721 Compile the regular expression PATTERN, whose length is LENGTH.
722 SYNTAX indicates the regular expression's syntax. */
725 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
728 reg_errcode_t err
= REG_NOERROR
;
732 /* Initialize the pattern buffer. */
733 preg
->fastmap_accurate
= 0;
734 preg
->syntax
= syntax
;
735 preg
->not_bol
= preg
->not_eol
= 0;
738 preg
->can_be_null
= 0;
739 preg
->regs_allocated
= REGS_UNALLOCATED
;
741 /* Initialize the dfa. */
742 dfa
= (re_dfa_t
*) preg
->buffer
;
743 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
752 preg
->allocated
= sizeof (re_dfa_t
);
753 preg
->buffer
= (unsigned char *) dfa
;
755 preg
->used
= sizeof (re_dfa_t
);
757 err
= init_dfa (dfa
, length
);
758 if (BE (err
!= REG_NOERROR
, 0))
760 free_dfa_content (dfa
);
766 /* Note: length+1 will not overflow since it is checked in init_dfa. */
767 dfa
->re_str
= re_malloc (char, length
+ 1);
768 strncpy (dfa
->re_str
, pattern
, length
+ 1);
771 __libc_lock_init (dfa
->lock
);
773 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
774 syntax
& RE_ICASE
, dfa
);
775 if (BE (err
!= REG_NOERROR
, 0))
777 re_compile_internal_free_return
:
778 free_workarea_compile (preg
);
779 re_string_destruct (®exp
);
780 free_dfa_content (dfa
);
786 /* Parse the regular expression, and build a structure tree. */
788 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
789 if (BE (dfa
->str_tree
== NULL
, 0))
790 goto re_compile_internal_free_return
;
792 /* Analyze the tree and create the nfa. */
793 err
= analyze (preg
);
794 if (BE (err
!= REG_NOERROR
, 0))
795 goto re_compile_internal_free_return
;
797 #ifdef RE_ENABLE_I18N
798 /* If possible, do searching in single byte encoding to speed things up. */
799 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
803 /* Then create the initial state of the dfa. */
804 err
= create_initial_state (dfa
);
806 /* Release work areas. */
807 free_workarea_compile (preg
);
808 re_string_destruct (®exp
);
810 if (BE (err
!= REG_NOERROR
, 0))
812 free_dfa_content (dfa
);
820 /* Initialize DFA. We use the length of the regular expression PAT_LEN
821 as the initial length of some arrays. */
824 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
826 unsigned int table_size
;
831 memset (dfa
, '\0', sizeof (re_dfa_t
));
833 /* Force allocation of str_tree_storage the first time. */
834 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
836 /* Avoid overflows. */
837 if (pat_len
== SIZE_MAX
)
840 dfa
->nodes_alloc
= pat_len
+ 1;
841 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
843 /* table_size = the smallest power of two strictly greater than pat_len */
844 for (table_size
= 1; ; table_size
<<= 1)
845 if (table_size
> pat_len
)
848 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
849 dfa
->state_hash_mask
= table_size
- 1;
851 dfa
->mb_cur_max
= MB_CUR_MAX
;
853 if (dfa
->mb_cur_max
== 6
854 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
856 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
859 # ifdef HAVE_LANGINFO_CODESET
860 codeset_name
= nl_langinfo (CODESET
);
862 codeset_name
= getenv ("LC_ALL");
863 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
864 codeset_name
= getenv ("LC_CTYPE");
865 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
866 codeset_name
= getenv ("LANG");
867 if (codeset_name
== NULL
)
869 else if (strchr (codeset_name
, '.') != NULL
)
870 codeset_name
= strchr (codeset_name
, '.') + 1;
873 if (strcasecmp (codeset_name
, "UTF-8") == 0
874 || strcasecmp (codeset_name
, "UTF8") == 0)
877 /* We check exhaustively in the loop below if this charset is a
878 superset of ASCII. */
879 dfa
->map_notascii
= 0;
882 #ifdef RE_ENABLE_I18N
883 if (dfa
->mb_cur_max
> 1)
886 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
891 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
892 if (BE (dfa
->sb_char
== NULL
, 0))
895 /* Set the bits corresponding to single byte chars. */
896 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
897 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
899 wint_t wch
= __btowc (ch
);
901 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
903 if (isascii (ch
) && wch
!= ch
)
904 dfa
->map_notascii
= 1;
911 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
916 /* Initialize the WORD_CHAR table, which indicates which characters are
917 "word" characters. Here "word" means a word-constituent
918 character, as used by operators like "\<", "\>", etc. */
/* Fill DFA->word_char, a bitset with one bit per single-byte character,
   setting the bit for every alphanumeric character and for '_'.  A
   hard-coded ASCII table is used when possible; otherwise each byte is
   tested individually.  */
921 init_word_char (re_dfa_t
*dfa
)
/* NOTE(review): presumably records that the word-character table is in
   use -- confirm at the readers of word_ops_used.  */
923 dfa
->word_ops_used
= 1;
/* init_dfa sets map_notascii to 1 when some ASCII byte converts to a
   different wide character.  When it is 0 the ASCII half of the table
   can be written directly from constants.  */
926 if (BE (dfa
->map_notascii
== 0, 1))
928 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
929 them, an issue when this code is used in Gnulib. */
/* Four 32-bit slices of the ASCII word-character bitmap:
     bits0: characters   0..31  -> none
     bits1: characters  32..63  -> bits 16..25 = '0'..'9'
     bits2: characters  64..95  -> bits 1..26 = 'A'..'Z', bit 31 = '_'
     bits3: characters  96..127 -> bits 1..26 = 'a'..'z'  */
930 bitset_word_t bits0
= 0x00000000;
931 bitset_word_t bits1
= 0x03ff0000;
932 bitset_word_t bits2
= 0x87fffffe;
933 bitset_word_t bits3
= 0x07fffffe;
/* With 64-bit words two slices are packed per word: the odd-numbered
   slice goes into the upper 32 bits, the even-numbered one into the
   lower 32 bits.  */
934 if (BITSET_WORD_BITS
== 64)
936 /* Pacify gcc -Woverflow on 32-bit platforms.  The shift by 32 is
   written as << 31 << 1 so that no single shift count reaches the
   width of a 32-bit bitset_word_t.  */
937 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
938 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
/* With 32-bit words each slice occupies its own word.  */
941 else if (BITSET_WORD_BITS
== 32)
943 dfa
->word_char
[0] = bits0
;
944 dfa
->word_char
[1] = bits1
;
945 dfa
->word_char
[2] = bits2
;
946 dfa
->word_char
[3] = bits3
;
/* For UTF-8 the rest of the table, from word I onward, is cleared:
   (SBC_MAX - ch) / 8 is the number of bytes needed for the remaining
   SBC_MAX - ch bits.
   NOTE(review): the assignments that give I and CH their values at this
   point are not visible in this excerpt.  */
953 if (BE (dfa
->is_utf8
, 1))
955 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
/* Generic path: test every remaining byte value CH with isalnum (plus
   '_') and set bit J of word I for each word character.  CH advances in
   lockstep with J, so it always names the character for the current
   bit.  */
961 for (; i
< BITSET_WORDS
; ++i
)
962 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
963 if (isalnum (ch
) || ch
== '_')
964 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
967 /* Free the work areas which are only used while compiling. */
/* Discard the compile-time-only parts of the DFA held in PREG->buffer:
   the parse-tree storage chain, the parse tree pointer and the
   org_indices array.  The corresponding fields are reset afterwards.  */
970 free_workarea_compile (regex_t
*preg
)
972 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
973 bin_tree_storage_t
*storage
, *next
;
/* Walk the singly linked chain of tree-storage chunks.  The successor is
   read into NEXT at the top of each iteration, before anything else is
   done with the current chunk, and is then used to advance the loop.
   NOTE(review): the statement that releases STORAGE is not visible in
   this excerpt.  */
974 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
976 next
= storage
->next
;
/* Reset the storage bookkeeping.  init_dfa uses the same
   BIN_TREE_STORAGE_SIZE index value to force a fresh allocation of
   str_tree_storage on first use.  */
979 dfa
->str_tree_storage
= NULL
;
980 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
/* Clear the parse-tree pointer along with the storage pointer.  */
981 dfa
->str_tree
= NULL
;
/* org_indices is allocated in analyze; free it here and clear the
   pointer.  */
982 re_free (dfa
->org_indices
);
983 dfa
->org_indices
= NULL
;
986 /* Create initial states for all contexts. */
989 create_initial_state (re_dfa_t
*dfa
)
993 re_node_set init_nodes
;
995 /* Initial states have the epsilon closure of the node which is
996 the first node of the regular expression. */
997 first
= dfa
->str_tree
->first
->node_idx
;
998 dfa
->init_node
= first
;
999 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1000 if (BE (err
!= REG_NOERROR
, 0))
1003 /* The back-references which are in initial states can epsilon transit,
1004 since in this case all of the subexpressions can be null.
1005 Then we add epsilon closures of the nodes which are the next nodes of
1006 the back-references. */
1007 if (dfa
->nbackref
> 0)
1008 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1010 int node_idx
= init_nodes
.elems
[i
];
1011 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1014 if (type
!= OP_BACK_REF
)
1016 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1018 re_token_t
*clexp_node
;
1019 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1020 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1021 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1024 if (clexp_idx
== init_nodes
.nelem
)
1027 if (type
== OP_BACK_REF
)
1029 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1030 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1032 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1035 if (err
!= REG_NOERROR
)
1042 /* It must be the first time to invoke acquire_state. */
1043 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1044 /* We don't check ERR here, since the initial state must not be NULL. */
1045 if (BE (dfa
->init_state
== NULL
, 0))
1047 if (dfa
->init_state
->has_constraint
)
1049 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1051 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1053 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1057 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1058 || dfa
->init_state_begbuf
== NULL
, 0))
1062 dfa
->init_state_word
= dfa
->init_state_nl
1063 = dfa
->init_state_begbuf
= dfa
->init_state
;
1065 re_node_set_free (&init_nodes
);
1069 #ifdef RE_ENABLE_I18N
1070 /* If it is possible to do searching in single byte encoding instead of UTF-8
1071 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1072 DFA nodes where needed. */
1075 optimize_utf8 (re_dfa_t
*dfa
)
1077 int node
, i
, mb_chars
= 0, has_period
= 0;
1079 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1080 switch (dfa
->nodes
[node
].type
)
1083 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1087 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1095 /* Word anchors etc. cannot be handled. It's okay to test
1096 opr.ctx_type since constraints (for all DFA nodes) are
1097 created by ORing one or more opr.ctx_type values. */
1107 case OP_DUP_ASTERISK
:
1108 case OP_OPEN_SUBEXP
:
1109 case OP_CLOSE_SUBEXP
:
1111 case COMPLEX_BRACKET
:
1113 case SIMPLE_BRACKET
:
1114 /* Just double check. The non-ASCII range starts at 0x80. */
1115 assert (0x80 % BITSET_WORD_BITS
== 0);
1116 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1117 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1124 if (mb_chars
|| has_period
)
1125 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1127 if (dfa
->nodes
[node
].type
== CHARACTER
1128 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1129 dfa
->nodes
[node
].mb_partial
= 0;
1130 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1131 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1134 /* The search can be in single byte locale. */
1135 dfa
->mb_cur_max
= 1;
1137 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1141 /* Analyze the structure tree, and calculate "first", "next", "edest",
1142 "eclosure", and "inveclosure". */
1144 static reg_errcode_t
1145 analyze (regex_t
*preg
)
1147 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1150 /* Allocate arrays. */
1151 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1152 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1153 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1154 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1155 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1156 || dfa
->eclosures
== NULL
, 0))
1159 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1160 if (dfa
->subexp_map
!= NULL
)
1163 for (i
= 0; i
< preg
->re_nsub
; i
++)
1164 dfa
->subexp_map
[i
] = i
;
1165 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1166 for (i
= 0; i
< preg
->re_nsub
; i
++)
1167 if (dfa
->subexp_map
[i
] != i
)
1169 if (i
== preg
->re_nsub
)
1171 free (dfa
->subexp_map
);
1172 dfa
->subexp_map
= NULL
;
1176 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1177 if (BE (ret
!= REG_NOERROR
, 0))
1179 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1180 if (BE (ret
!= REG_NOERROR
, 0))
1182 preorder (dfa
->str_tree
, calc_next
, dfa
);
1183 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1184 if (BE (ret
!= REG_NOERROR
, 0))
1186 ret
= calc_eclosure (dfa
);
1187 if (BE (ret
!= REG_NOERROR
, 0))
1190 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1192 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1195 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1196 if (BE (dfa
->inveclosures
== NULL
, 0))
1198 ret
= calc_inveclosure (dfa
);
1204 /* Our parse trees are very unbalanced, so we cannot use a stack to
1205 implement parse tree visits. Instead, we use parent pointers and
1206 some hairy code in these two functions. */
1207 static reg_errcode_t
1208 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1211 bin_tree_t
*node
, *prev
;
1213 for (node
= root
; ; )
1215 /* Descend down the tree, preferably to the left (or to the right
1216 if that's the only child). */
1217 while (node
->left
|| node
->right
)
1225 reg_errcode_t err
= fn (extra
, node
);
1226 if (BE (err
!= REG_NOERROR
, 0))
1228 if (node
->parent
== NULL
)
1231 node
= node
->parent
;
1233 /* Go up while we have a node that is reached from the right. */
1234 while (node
->right
== prev
|| node
->right
== NULL
);
1239 static reg_errcode_t
1240 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1245 for (node
= root
; ; )
1247 reg_errcode_t err
= fn (extra
, node
);
1248 if (BE (err
!= REG_NOERROR
, 0))
1251 /* Go to the left node, or up and to the right. */
1256 bin_tree_t
*prev
= NULL
;
1257 while (node
->right
== prev
|| node
->right
== NULL
)
1260 node
= node
->parent
;
1269 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1271 backreferences as well. Requires a preorder visit. */
1272 static reg_errcode_t
1273 optimize_subexps (void *extra
, bin_tree_t
*node
)
1275 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1277 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1279 int idx
= node
->token
.opr
.idx
;
1280 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1281 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1284 else if (node
->token
.type
== SUBEXP
1285 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1287 int other_idx
= node
->left
->token
.opr
.idx
;
1289 node
->left
= node
->left
->left
;
1291 node
->left
->parent
= node
;
1293 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1294 if (other_idx
< BITSET_WORD_BITS
)
1295 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1301 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1303 static reg_errcode_t
1304 lower_subexps (void *extra
, bin_tree_t
*node
)
1306 regex_t
*preg
= (regex_t
*) extra
;
1307 reg_errcode_t err
= REG_NOERROR
;
1309 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1311 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1313 node
->left
->parent
= node
;
1315 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1317 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1319 node
->right
->parent
= node
;
1326 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1328 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1329 bin_tree_t
*body
= node
->left
;
1330 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1333 /* We do not optimize empty subexpressions, because otherwise we may
1334 have bad CONCAT nodes with NULL children. This is obviously not
1335 very common, so we do not lose much. An example that triggers
1336 this case is the sed "script" /\(\)/x. */
1337 && node
->left
!= NULL
1338 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1339 || !(dfa
->used_bkref_map
1340 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1343 /* Convert the SUBEXP node to the concatenation of an
1344 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1346 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1347 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1348 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1349 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1355 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1356 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1360 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361 nodes. Requires a postorder visit. */
1362 static reg_errcode_t
1363 calc_first (void *extra
, bin_tree_t
*node
)
1365 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1366 if (node
->token
.type
== CONCAT
)
1368 node
->first
= node
->left
->first
;
1369 node
->node_idx
= node
->left
->node_idx
;
1374 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1375 if (BE (node
->node_idx
== -1, 0))
1377 if (node
->token
.type
== ANCHOR
)
1378 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1383 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1384 static reg_errcode_t
1385 calc_next (void *extra
, bin_tree_t
*node
)
1387 switch (node
->token
.type
)
1389 case OP_DUP_ASTERISK
:
1390 node
->left
->next
= node
;
1393 node
->left
->next
= node
->right
->first
;
1394 node
->right
->next
= node
->next
;
1398 node
->left
->next
= node
->next
;
1400 node
->right
->next
= node
->next
;
1406 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1407 static reg_errcode_t
1408 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1410 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1411 int idx
= node
->node_idx
;
1412 reg_errcode_t err
= REG_NOERROR
;
1414 switch (node
->token
.type
)
1420 assert (node
->next
== NULL
);
1423 case OP_DUP_ASTERISK
:
1427 dfa
->has_plural_match
= 1;
1428 if (node
->left
!= NULL
)
1429 left
= node
->left
->first
->node_idx
;
1431 left
= node
->next
->node_idx
;
1432 if (node
->right
!= NULL
)
1433 right
= node
->right
->first
->node_idx
;
1435 right
= node
->next
->node_idx
;
1437 assert (right
> -1);
1438 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1443 case OP_OPEN_SUBEXP
:
1444 case OP_CLOSE_SUBEXP
:
1445 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1449 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1450 if (node
->token
.type
== OP_BACK_REF
)
1451 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1455 assert (!IS_EPSILON_NODE (node
->token
.type
));
1456 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1463 /* Duplicate the epsilon closure of the node ROOT_NODE.
1464 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465 to their own constraint. */
1467 static reg_errcode_t
1468 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1469 int root_node
, unsigned int init_constraint
)
1471 int org_node
, clone_node
, ret
;
1472 unsigned int constraint
= init_constraint
;
1473 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1475 int org_dest
, clone_dest
;
1476 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1478 /* If the back reference epsilon-transits, its destination must
1479 also have the constraint. Then duplicate the epsilon closure
1480 of the destination of the back reference, and store it in
1481 edests of the back reference. */
1482 org_dest
= dfa
->nexts
[org_node
];
1483 re_node_set_empty (dfa
->edests
+ clone_node
);
1484 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1485 if (BE (clone_dest
== -1, 0))
1487 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1488 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1489 if (BE (ret
< 0, 0))
1492 else if (dfa
->edests
[org_node
].nelem
== 0)
1494 /* In case the node can't epsilon-transit, don't duplicate the
1495 destination and store the original destination as the
1496 destination of the node. */
1497 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1500 else if (dfa
->edests
[org_node
].nelem
== 1)
1502 /* In case of the node can epsilon-transit, and it has only one
1504 org_dest
= dfa
->edests
[org_node
].elems
[0];
1505 re_node_set_empty (dfa
->edests
+ clone_node
);
1506 /* If the node is root_node itself, it means the epsilon closure
1507 has a loop. Then tie it to the destination of the root_node. */
1508 if (org_node
== root_node
&& clone_node
!= org_node
)
1510 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1511 if (BE (ret
< 0, 0))
1515 /* In case the node has another constraint, append it. */
1516 constraint
|= dfa
->nodes
[org_node
].constraint
;
1517 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1518 if (BE (clone_dest
== -1, 0))
1520 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1521 if (BE (ret
< 0, 0))
1524 else /* dfa->edests[org_node].nelem == 2 */
1526 /* In case of the node can epsilon-transit, and it has two
1527 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1528 org_dest
= dfa
->edests
[org_node
].elems
[0];
1529 re_node_set_empty (dfa
->edests
+ clone_node
);
1530 /* Search for a duplicated node which satisfies the constraint. */
1531 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1532 if (clone_dest
== -1)
1534 /* There is no such duplicated node, create a new one. */
1536 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1537 if (BE (clone_dest
== -1, 0))
1539 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1540 if (BE (ret
< 0, 0))
1542 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1543 root_node
, constraint
);
1544 if (BE (err
!= REG_NOERROR
, 0))
1549 /* There is a duplicated node which satisfies the constraint,
1550 use it to avoid infinite loop. */
1551 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1552 if (BE (ret
< 0, 0))
1556 org_dest
= dfa
->edests
[org_node
].elems
[1];
1557 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1558 if (BE (clone_dest
== -1, 0))
1560 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1561 if (BE (ret
< 0, 0))
1564 org_node
= org_dest
;
1565 clone_node
= clone_dest
;
1570 /* Search for a node which is duplicated from the node ORG_NODE, and
1571 satisfies the constraint CONSTRAINT. */
1574 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1575 unsigned int constraint
)
1578 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1580 if (org_node
== dfa
->org_indices
[idx
]
1581 && constraint
== dfa
->nodes
[idx
].constraint
)
1582 return idx
; /* Found. */
1584 return -1; /* Not found. */
1587 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1588 Return the index of the new node, or -1 if insufficient storage is
1592 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1594 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1595 if (BE (dup_idx
!= -1, 1))
1597 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1598 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1599 dfa
->nodes
[dup_idx
].duplicated
= 1;
1601 /* Store the index of the original node. */
1602 dfa
->org_indices
[dup_idx
] = org_idx
;
1607 static reg_errcode_t
1608 calc_inveclosure (re_dfa_t
*dfa
)
1611 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1612 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1614 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1616 int *elems
= dfa
->eclosures
[src
].elems
;
1617 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1619 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1620 if (BE (ret
== -1, 0))
1628 /* Calculate "eclosure" for all the nodes in the DFA. */
1630 static reg_errcode_t
1631 calc_eclosure (re_dfa_t
*dfa
)
1633 int node_idx
, incomplete
;
1635 assert (dfa
->nodes_len
> 0);
1638 /* For each node, calculate its epsilon closure. */
1639 for (node_idx
= 0; ; ++node_idx
)
1642 re_node_set eclosure_elem
;
1643 if (node_idx
== dfa
->nodes_len
)
1652 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1655 /* If we have already calculated, skip it. */
1656 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1658 /* Calculate epsilon closure of 'node_idx'. */
1659 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1660 if (BE (err
!= REG_NOERROR
, 0))
1663 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1666 re_node_set_free (&eclosure_elem
);
1672 /* Calculate epsilon closure of NODE. */
1674 static reg_errcode_t
1675 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1679 re_node_set eclosure
;
1682 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1683 if (BE (err
!= REG_NOERROR
, 0))
1686 /* This indicates that we are calculating this node now.
1687 We reference this value to avoid infinite loop. */
1688 dfa
->eclosures
[node
].nelem
= -1;
1690 /* If the current node has constraints, duplicate all nodes
1691 since they must inherit the constraints. */
1692 if (dfa
->nodes
[node
].constraint
1693 && dfa
->edests
[node
].nelem
1694 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1696 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1697 dfa
->nodes
[node
].constraint
);
1698 if (BE (err
!= REG_NOERROR
, 0))
1702 /* Expand each epsilon destination node. */
1703 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1704 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1706 re_node_set eclosure_elem
;
1707 int edest
= dfa
->edests
[node
].elems
[i
];
1708 /* If calculating the epsilon closure of `edest' is in progress,
1709 return intermediate result. */
1710 if (dfa
->eclosures
[edest
].nelem
== -1)
1715 /* If we haven't calculated the epsilon closure of `edest' yet,
1716 calculate now. Otherwise use calculated epsilon closure. */
1717 if (dfa
->eclosures
[edest
].nelem
== 0)
1719 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1720 if (BE (err
!= REG_NOERROR
, 0))
1724 eclosure_elem
= dfa
->eclosures
[edest
];
1725 /* Merge the epsilon closure of 'edest'. */
1726 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1727 if (BE (err
!= REG_NOERROR
, 0))
1729 /* If the epsilon closure of 'edest' is incomplete,
1730 the epsilon closure of this node is also incomplete. */
1731 if (dfa
->eclosures
[edest
].nelem
== 0)
1734 re_node_set_free (&eclosure_elem
);
1738 /* An epsilon closure includes itself. */
1739 ret
= re_node_set_insert (&eclosure
, node
);
1740 if (BE (ret
< 0, 0))
1742 if (incomplete
&& !root
)
1743 dfa
->eclosures
[node
].nelem
= 0;
1745 dfa
->eclosures
[node
] = eclosure
;
1746 *new_set
= eclosure
;
1750 /* Functions for token which are used in the parser. */
1752 /* Fetch a token from INPUT.
1753 We must not use this function inside bracket expressions. */
1756 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1758 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1761 /* Peek a token from INPUT, and return the length of the token.
1762 We must not use this function inside bracket expressions. */
1765 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1769 if (re_string_eoi (input
))
1771 token
->type
= END_OF_RE
;
1775 c
= re_string_peek_byte (input
, 0);
1778 token
->word_char
= 0;
1779 #ifdef RE_ENABLE_I18N
1780 token
->mb_partial
= 0;
1781 if (input
->mb_cur_max
> 1 &&
1782 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1784 token
->type
= CHARACTER
;
1785 token
->mb_partial
= 1;
1792 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1794 token
->type
= BACK_SLASH
;
1798 c2
= re_string_peek_byte_case (input
, 1);
1800 token
->type
= CHARACTER
;
1801 #ifdef RE_ENABLE_I18N
1802 if (input
->mb_cur_max
> 1)
1804 wint_t wc
= re_string_wchar_at (input
,
1805 re_string_cur_idx (input
) + 1);
1806 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1810 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1815 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1816 token
->type
= OP_ALT
;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax
& RE_NO_BK_REFS
))
1822 token
->type
= OP_BACK_REF
;
1823 token
->opr
.idx
= c2
- '1';
1827 if (!(syntax
& RE_NO_GNU_OPS
))
1829 token
->type
= ANCHOR
;
1830 token
->opr
.ctx_type
= WORD_FIRST
;
1834 if (!(syntax
& RE_NO_GNU_OPS
))
1836 token
->type
= ANCHOR
;
1837 token
->opr
.ctx_type
= WORD_LAST
;
1841 if (!(syntax
& RE_NO_GNU_OPS
))
1843 token
->type
= ANCHOR
;
1844 token
->opr
.ctx_type
= WORD_DELIM
;
1848 if (!(syntax
& RE_NO_GNU_OPS
))
1850 token
->type
= ANCHOR
;
1851 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1855 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= OP_WORD
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= OP_NOTWORD
;
1863 if (!(syntax
& RE_NO_GNU_OPS
))
1864 token
->type
= OP_SPACE
;
1867 if (!(syntax
& RE_NO_GNU_OPS
))
1868 token
->type
= OP_NOTSPACE
;
1871 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= ANCHOR
;
1874 token
->opr
.ctx_type
= BUF_FIRST
;
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1880 token
->type
= ANCHOR
;
1881 token
->opr
.ctx_type
= BUF_LAST
;
1885 if (!(syntax
& RE_NO_BK_PARENS
))
1886 token
->type
= OP_OPEN_SUBEXP
;
1889 if (!(syntax
& RE_NO_BK_PARENS
))
1890 token
->type
= OP_CLOSE_SUBEXP
;
1893 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1894 token
->type
= OP_DUP_PLUS
;
1897 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1898 token
->type
= OP_DUP_QUESTION
;
1901 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1902 token
->type
= OP_OPEN_DUP_NUM
;
1905 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1906 token
->type
= OP_CLOSE_DUP_NUM
;
1914 token
->type
= CHARACTER
;
1915 #ifdef RE_ENABLE_I18N
1916 if (input
->mb_cur_max
> 1)
1918 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1919 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1923 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1928 if (syntax
& RE_NEWLINE_ALT
)
1929 token
->type
= OP_ALT
;
1932 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1933 token
->type
= OP_ALT
;
1936 token
->type
= OP_DUP_ASTERISK
;
1939 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1940 token
->type
= OP_DUP_PLUS
;
1943 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1944 token
->type
= OP_DUP_QUESTION
;
1947 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1948 token
->type
= OP_OPEN_DUP_NUM
;
1951 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1952 token
->type
= OP_CLOSE_DUP_NUM
;
1955 if (syntax
& RE_NO_BK_PARENS
)
1956 token
->type
= OP_OPEN_SUBEXP
;
1959 if (syntax
& RE_NO_BK_PARENS
)
1960 token
->type
= OP_CLOSE_SUBEXP
;
1963 token
->type
= OP_OPEN_BRACKET
;
1966 token
->type
= OP_PERIOD
;
1969 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1970 re_string_cur_idx (input
) != 0)
1972 char prev
= re_string_peek_byte (input
, -1);
1973 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1976 token
->type
= ANCHOR
;
1977 token
->opr
.ctx_type
= LINE_FIRST
;
1980 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1981 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1984 re_string_skip_bytes (input
, 1);
1985 peek_token (&next
, input
, syntax
);
1986 re_string_skip_bytes (input
, -1);
1987 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1990 token
->type
= ANCHOR
;
1991 token
->opr
.ctx_type
= LINE_LAST
;
1999 /* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function outside of bracket expressions. */
2003 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2006 if (re_string_eoi (input
))
2008 token
->type
= END_OF_RE
;
2011 c
= re_string_peek_byte (input
, 0);
2014 #ifdef RE_ENABLE_I18N
2015 if (input
->mb_cur_max
> 1 &&
2016 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2018 token
->type
= CHARACTER
;
2021 #endif /* RE_ENABLE_I18N */
2023 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2024 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2026 /* In this case, '\' escapes a character. */
2028 re_string_skip_bytes (input
, 1);
2029 c2
= re_string_peek_byte (input
, 0);
2031 token
->type
= CHARACTER
;
2034 if (c
== '[') /* '[' is a special char in a bracket exps. */
2038 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2039 c2
= re_string_peek_byte (input
, 1);
2047 token
->type
= OP_OPEN_COLL_ELEM
;
2050 token
->type
= OP_OPEN_EQUIV_CLASS
;
2053 if (syntax
& RE_CHAR_CLASSES
)
2055 token
->type
= OP_OPEN_CHAR_CLASS
;
2058 /* else fall through. */
2060 token
->type
= CHARACTER
;
2070 token
->type
= OP_CHARSET_RANGE
;
2073 token
->type
= OP_CLOSE_BRACKET
;
2076 token
->type
= OP_NON_MATCH_LIST
;
2079 token
->type
= CHARACTER
;
2084 /* Functions for parser. */
2086 /* Entry point of the parser.
2087 Parse the regular expression REGEXP and return the structure tree.
2088 If an error occurs, ERR is set by error code, and return NULL.
2089 This function builds the following tree, from regular expression <reg_exp>:
2095 CAT means concatenation.
2096 EOR means end of regular expression. */
2099 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2102 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2103 bin_tree_t
*tree
, *eor
, *root
;
2104 re_token_t current_token
;
2105 dfa
->syntax
= syntax
;
2106 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2107 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2108 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2110 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2112 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2115 if (BE (eor
== NULL
|| root
== NULL
, 0))
2123 /* This function build the following tree, from regular expression
2124 <branch1>|<branch2>:
2130 ALT means alternative, which represents the operator '|'. */
2133 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2134 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2136 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2137 bin_tree_t
*tree
, *branch
= NULL
;
2138 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2139 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2142 while (token
->type
== OP_ALT
)
2144 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2145 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2146 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2148 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2149 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2152 postorder (tree
, free_tree
, NULL
);
2158 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2159 if (BE (tree
== NULL
, 0))
2168 /* This function build the following tree, from regular expression
2175 CAT means concatenation. */
2178 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2179 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2181 bin_tree_t
*tree
, *exp
;
2182 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2183 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2184 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2187 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2188 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2190 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2191 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2194 postorder (tree
, free_tree
, NULL
);
2197 if (tree
!= NULL
&& exp
!= NULL
)
2199 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2200 if (newtree
== NULL
)
2202 postorder (exp
, free_tree
, NULL
);
2203 postorder (tree
, free_tree
, NULL
);
2209 else if (tree
== NULL
)
2211 /* Otherwise exp == NULL, we don't need to create new tree. */
2216 /* This function build the following tree, from regular expression a*:
2223 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2224 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2226 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2228 switch (token
->type
)
2231 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2232 if (BE (tree
== NULL
, 0))
2237 #ifdef RE_ENABLE_I18N
2238 if (dfa
->mb_cur_max
> 1)
2240 while (!re_string_eoi (regexp
)
2241 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2243 bin_tree_t
*mbc_remain
;
2244 fetch_token (token
, regexp
, syntax
);
2245 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2246 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2247 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2256 case OP_OPEN_SUBEXP
:
2257 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2258 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2261 case OP_OPEN_BRACKET
:
2262 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2263 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2267 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2272 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2273 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2274 if (BE (tree
== NULL
, 0))
2280 dfa
->has_mb_node
= 1;
2282 case OP_OPEN_DUP_NUM
:
2283 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2289 case OP_DUP_ASTERISK
:
2291 case OP_DUP_QUESTION
:
2292 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2297 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2299 fetch_token (token
, regexp
, syntax
);
2300 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2302 /* else fall through */
2303 case OP_CLOSE_SUBEXP
:
2304 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2305 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2310 /* else fall through */
2311 case OP_CLOSE_DUP_NUM
:
2312 /* We treat it as a normal character. */
2314 /* Then we can treat these characters as normal characters. */
2315 token
->type
= CHARACTER
;
2316 /* mb_partial and word_char bits should be initialized already
2318 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2319 if (BE (tree
== NULL
, 0))
2326 if ((token
->opr
.ctx_type
2327 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2328 && dfa
->word_ops_used
== 0)
2329 init_word_char (dfa
);
2330 if (token
->opr
.ctx_type
== WORD_DELIM
2331 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2333 bin_tree_t
*tree_first
, *tree_last
;
2334 if (token
->opr
.ctx_type
== WORD_DELIM
)
2336 token
->opr
.ctx_type
= WORD_FIRST
;
2337 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2338 token
->opr
.ctx_type
= WORD_LAST
;
2342 token
->opr
.ctx_type
= INSIDE_WORD
;
2343 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2344 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2346 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2347 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2348 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2356 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2357 if (BE (tree
== NULL
, 0))
2363 /* We must return here, since ANCHORs can't be followed
2364 by repetition operators.
2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2367 fetch_token (token
, regexp
, syntax
);
2370 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2371 if (BE (tree
== NULL
, 0))
2376 if (dfa
->mb_cur_max
> 1)
2377 dfa
->has_mb_node
= 1;
2381 tree
= build_charclass_op (dfa
, regexp
->trans
,
2382 (const unsigned char *) "alnum",
2383 (const unsigned char *) "_",
2384 token
->type
== OP_NOTWORD
, err
);
2385 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2390 tree
= build_charclass_op (dfa
, regexp
->trans
,
2391 (const unsigned char *) "space",
2392 (const unsigned char *) "",
2393 token
->type
== OP_NOTSPACE
, err
);
2394 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2404 /* Must not happen? */
2410 fetch_token (token
, regexp
, syntax
);
2412 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2413 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2415 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2416 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2419 postorder (tree
, free_tree
, NULL
);
2423 /* In BRE consecutive duplications are not allowed. */
2424 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2425 && (token
->type
== OP_DUP_ASTERISK
2426 || token
->type
== OP_OPEN_DUP_NUM
))
2429 postorder (tree
, free_tree
, NULL
);
2438 /* This function build the following tree, from regular expression
2446 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2447 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2449 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2452 cur_nsub
= preg
->re_nsub
++;
2454 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2456 /* The subexpression may be a null string. */
2457 if (token
->type
== OP_CLOSE_SUBEXP
)
2461 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2462 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2465 postorder (tree
, free_tree
, NULL
);
2468 if (BE (*err
!= REG_NOERROR
, 0))
2472 if (cur_nsub
<= '9' - '1')
2473 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2475 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2476 if (BE (tree
== NULL
, 0))
2481 tree
->token
.opr
.idx
= cur_nsub
;
2485 /* This function parses repetition operators like "*", "+", "{1,3}" etc. */
2488 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2489 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2491 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2492 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2493 re_token_t start_token
= *token
;
2495 if (token
->type
== OP_OPEN_DUP_NUM
)
2498 start
= fetch_number (regexp
, token
, syntax
);
2501 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2502 start
= 0; /* We treat "{,m}" as "{0,m}". */
2505 *err
= REG_BADBR
; /* <re>{} is invalid. */
2509 if (BE (start
!= -2, 1))
2511 /* We treat "{n}" as "{n,n}". */
2512 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2513 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2514 ? fetch_number (regexp
, token
, syntax
) : -2));
2516 if (BE (start
== -2 || end
== -2, 0))
2518 /* Invalid sequence. */
2519 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2521 if (token
->type
== END_OF_RE
)
2529 /* If the syntax bit is set, rollback. */
2530 re_string_set_index (regexp
, start_idx
);
2531 *token
= start_token
;
2532 token
->type
= CHARACTER
;
2533 /* mb_partial and word_char bits should be already initialized by
2538 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2540 /* First number greater than second. */
2547 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2548 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2551 fetch_token (token
, regexp
, syntax
);
2553 if (BE (elem
== NULL
, 0))
2555 if (BE (start
== 0 && end
== 0, 0))
2557 postorder (elem
, free_tree
, NULL
);
2561 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2562 if (BE (start
> 0, 0))
2565 for (i
= 2; i
<= start
; ++i
)
2567 elem
= duplicate_tree (elem
, dfa
);
2568 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2569 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2570 goto parse_dup_op_espace
;
2576 /* Duplicate ELEM before it is marked optional. */
2577 elem
= duplicate_tree (elem
, dfa
);
2578 if (BE (elem
== NULL
, 0))
2579 goto parse_dup_op_espace
;
2585 if (elem
->token
.type
== SUBEXP
)
2586 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2588 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2589 if (BE (tree
== NULL
, 0))
2590 goto parse_dup_op_espace
;
2592 /* This loop is actually executed only when end != -1,
2593 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2594 already created the start+1-th copy. */
2595 for (i
= start
+ 2; i
<= end
; ++i
)
2597 elem
= duplicate_tree (elem
, dfa
);
2598 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2599 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2600 goto parse_dup_op_espace
;
2602 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2603 if (BE (tree
== NULL
, 0))
2604 goto parse_dup_op_espace
;
2608 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2612 parse_dup_op_espace
:
2617 /* Size of the names for collating symbol/equivalence_class/character_class.
2618 I'm not sure, but maybe enough. */
2619 #define BRACKET_NAME_BUF_SIZE 32
2622 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2623 Build the range expression which starts from START_ELEM, and ends
2624 at END_ELEM. The result are written to MBCSET and SBCSET.
2625 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2626 mbcset->range_ends, is a pointer argument since we may
2629 static reg_errcode_t
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2632 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2635 bracket_elem_t
*end_elem
)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch
, end_ch
;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2641 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2645 /* We can handle no multi character collating elements without libc
2647 if (BE ((start_elem
->type
== COLL_SYM
2648 && strlen ((char *) start_elem
->opr
.name
) > 1)
2649 || (end_elem
->type
== COLL_SYM
2650 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2651 return REG_ECOLLATE
;
2653 # ifdef RE_ENABLE_I18N
2658 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2660 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2661 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2663 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2664 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2666 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2667 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2668 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2669 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2670 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2671 return REG_ECOLLATE
;
2672 cmp_buf
[0] = start_wc
;
2673 cmp_buf
[4] = end_wc
;
2674 if (__wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2677 /* Got valid collation sequence values, add them as a new entry.
2678 However, for !_LIBC we have no collation elements: if the
2679 character set is single byte, the single byte character set
2680 that we build below suffices. parse_bracket_exp passes
2681 no MBCSET if dfa->mb_cur_max == 1. */
2684 /* Check the space of the arrays. */
2685 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2687 /* There is not enough space, need realloc. */
2688 wchar_t *new_array_start
, *new_array_end
;
2691 /* +1 in case of mbcset->nranges is 0. */
2692 new_nranges
= 2 * mbcset
->nranges
+ 1;
2693 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694 are NULL if *range_alloc == 0. */
2695 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2697 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2700 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2703 mbcset
->range_starts
= new_array_start
;
2704 mbcset
->range_ends
= new_array_end
;
2705 *range_alloc
= new_nranges
;
2708 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2709 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2712 /* Build the table for single byte characters. */
2713 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2716 if (__wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2717 && __wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2718 bitset_set (sbcset
, wc
);
2721 # else /* not RE_ENABLE_I18N */
2724 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2725 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2727 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2728 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2730 if (start_ch
> end_ch
)
2732 /* Build the table for single byte characters. */
2733 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2734 if (start_ch
<= ch
&& ch
<= end_ch
)
2735 bitset_set (sbcset
, ch
);
2737 # endif /* not RE_ENABLE_I18N */
2740 #endif /* not _LIBC */
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744 Build the collating element which is represented by NAME.
2745 The result are written to MBCSET and SBCSET.
2746 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747 pointer argument since we may update it. */
2749 static reg_errcode_t
2750 # ifdef RE_ENABLE_I18N
2751 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2752 int *coll_sym_alloc
, const unsigned char *name
)
2753 # else /* not RE_ENABLE_I18N */
2754 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2755 # endif /* not RE_ENABLE_I18N */
2757 size_t name_len
= strlen ((const char *) name
);
2758 if (BE (name_len
!= 1, 0))
2759 return REG_ECOLLATE
;
2762 bitset_set (sbcset
, name
[0]);
2766 #endif /* not _LIBC */
2768 /* This function parse bracket expression like "[abc]", "[a-c]",
2772 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2773 reg_syntax_t syntax
, reg_errcode_t
*err
)
2776 const unsigned char *collseqmb
;
2777 const char *collseqwc
;
2780 const int32_t *symb_table
;
2781 const unsigned char *extra
;
2783 /* Local function for parse_bracket_exp used in _LIBC environment.
2784 Seek the collating symbol entry corresponding to NAME.
2785 Return the index of the symbol in the SYMB_TABLE,
2786 or -1 if not found. */
2789 __attribute__ ((always_inline
))
2790 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2794 for (elem
= 0; elem
< table_size
; elem
++)
2795 if (symb_table
[2 * elem
] != 0)
2797 int32_t idx
= symb_table
[2 * elem
+ 1];
2798 /* Skip the name of collating element name. */
2799 idx
+= 1 + extra
[idx
];
2800 if (/* Compare the length of the name. */
2801 name_len
== extra
[idx
]
2802 /* Compare the name. */
2803 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2804 /* Yep, this is the entry. */
2810 /* Local function for parse_bracket_exp used in _LIBC environment.
2811 Look up the collation sequence value of BR_ELEM.
2812 Return the value if succeeded, UINT_MAX otherwise. */
2814 auto inline unsigned int
2815 __attribute__ ((always_inline
))
2816 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2818 if (br_elem
->type
== SB_CHAR
)
2821 if (MB_CUR_MAX == 1)
2824 return collseqmb
[br_elem
->opr
.ch
];
2827 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2828 return __collseq_table_lookup (collseqwc
, wc
);
2831 else if (br_elem
->type
== MB_CHAR
)
2834 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2836 else if (br_elem
->type
== COLL_SYM
)
2838 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2842 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2846 /* We found the entry. */
2847 idx
= symb_table
[2 * elem
+ 1];
2848 /* Skip the name of collating element name. */
2849 idx
+= 1 + extra
[idx
];
2850 /* Skip the byte sequence of the collating element. */
2851 idx
+= 1 + extra
[idx
];
2852 /* Adjust for the alignment. */
2853 idx
= (idx
+ 3) & ~3;
2854 /* Skip the multibyte collation sequence value. */
2855 idx
+= sizeof (unsigned int);
2856 /* Skip the wide char sequence of the collating element. */
2857 idx
+= sizeof (unsigned int) *
2858 (1 + *(unsigned int *) (extra
+ idx
));
2859 /* Return the collation sequence value. */
2860 return *(unsigned int *) (extra
+ idx
);
2862 else if (sym_name_len
== 1)
2864 /* No valid character. Match it as a single byte
2866 return collseqmb
[br_elem
->opr
.name
[0]];
2869 else if (sym_name_len
== 1)
2870 return collseqmb
[br_elem
->opr
.name
[0]];
2875 /* Local function for parse_bracket_exp used in _LIBC environment.
2876 Build the range expression which starts from START_ELEM, and ends
2877 at END_ELEM. The result are written to MBCSET and SBCSET.
2878 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2879 mbcset->range_ends, is a pointer argument since we may
2882 auto inline reg_errcode_t
2883 __attribute__ ((always_inline
))
2884 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2885 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2888 uint32_t start_collseq
;
2889 uint32_t end_collseq
;
2891 /* Equivalence Classes and Character Classes can't be a range
2893 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2894 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2898 start_collseq
= lookup_collation_sequence_value (start_elem
);
2899 end_collseq
= lookup_collation_sequence_value (end_elem
);
2900 /* Check start/end collation sequence values. */
2901 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2902 return REG_ECOLLATE
;
2903 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2906 /* Got valid collation sequence values, add them as a new entry.
2907 However, if we have no collation elements, and the character set
2908 is single byte, the single byte character set that we
2909 build below suffices. */
2910 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2912 /* Check the space of the arrays. */
2913 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2915 /* There is not enough space, need realloc. */
2916 uint32_t *new_array_start
;
2917 uint32_t *new_array_end
;
2920 /* +1 in case of mbcset->nranges is 0. */
2921 new_nranges
= 2 * mbcset
->nranges
+ 1;
2922 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2924 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2927 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2930 mbcset
->range_starts
= new_array_start
;
2931 mbcset
->range_ends
= new_array_end
;
2932 *range_alloc
= new_nranges
;
2935 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2936 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2939 /* Build the table for single byte characters. */
2940 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2942 uint32_t ch_collseq
;
2944 if (MB_CUR_MAX == 1)
2947 ch_collseq
= collseqmb
[ch
];
2949 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2950 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2951 bitset_set (sbcset
, ch
);
2956 /* Local function for parse_bracket_exp used in _LIBC environment.
2957 Build the collating element which is represented by NAME.
2958 The result are written to MBCSET and SBCSET.
2959 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2960 pointer argument since we may update it. */
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline
))
2964 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2965 int *coll_sym_alloc
, const unsigned char *name
)
2968 size_t name_len
= strlen ((const char *) name
);
2971 elem
= seek_collating_symbol_entry (name
, name_len
);
2974 /* We found the entry. */
2975 idx
= symb_table
[2 * elem
+ 1];
2976 /* Skip the name of collating element name. */
2977 idx
+= 1 + extra
[idx
];
2979 else if (name_len
== 1)
2981 /* No valid character, treat it as a normal
2983 bitset_set (sbcset
, name
[0]);
2987 return REG_ECOLLATE
;
2989 /* Got valid collation sequence, add it as a new entry. */
2990 /* Check the space of the arrays. */
2991 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2993 /* Not enough, realloc it. */
2994 /* +1 in case of mbcset->ncoll_syms is 0. */
2995 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2996 /* Use realloc since mbcset->coll_syms is NULL
2998 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2999 new_coll_sym_alloc
);
3000 if (BE (new_coll_syms
== NULL
, 0))
3002 mbcset
->coll_syms
= new_coll_syms
;
3003 *coll_sym_alloc
= new_coll_sym_alloc
;
3005 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3010 if (BE (name_len
!= 1, 0))
3011 return REG_ECOLLATE
;
3014 bitset_set (sbcset
, name
[0]);
3021 re_token_t br_token
;
3022 re_bitset_ptr_t sbcset
;
3023 #ifdef RE_ENABLE_I18N
3024 re_charset_t
*mbcset
;
3025 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3026 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3027 #endif /* not RE_ENABLE_I18N */
3029 bin_tree_t
*work_tree
;
3031 int first_round
= 1;
3033 collseqmb
= (const unsigned char *)
3034 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3035 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3041 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3042 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3043 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3044 _NL_COLLATE_SYMB_TABLEMB
);
3045 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3046 _NL_COLLATE_SYMB_EXTRAMB
);
3049 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3050 #ifdef RE_ENABLE_I18N
3051 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3052 #endif /* RE_ENABLE_I18N */
3053 #ifdef RE_ENABLE_I18N
3054 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3056 if (BE (sbcset
== NULL
, 0))
3057 #endif /* RE_ENABLE_I18N */
3060 #ifdef RE_ENABLE_I18N
3067 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3068 if (BE (token
->type
== END_OF_RE
, 0))
3071 goto parse_bracket_exp_free_return
;
3073 if (token
->type
== OP_NON_MATCH_LIST
)
3075 #ifdef RE_ENABLE_I18N
3076 mbcset
->non_match
= 1;
3077 #endif /* not RE_ENABLE_I18N */
3079 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3080 bitset_set (sbcset
, '\n');
3081 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3082 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3083 if (BE (token
->type
== END_OF_RE
, 0))
3086 goto parse_bracket_exp_free_return
;
3090 /* We treat the first ']' as a normal character. */
3091 if (token
->type
== OP_CLOSE_BRACKET
)
3092 token
->type
= CHARACTER
;
3096 bracket_elem_t start_elem
, end_elem
;
3097 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3098 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3100 int token_len2
= 0, is_range_exp
= 0;
3103 start_elem
.opr
.name
= start_name_buf
;
3104 start_elem
.type
= COLL_SYM
;
3105 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3106 syntax
, first_round
);
3107 if (BE (ret
!= REG_NOERROR
, 0))
3110 goto parse_bracket_exp_free_return
;
3114 /* Get information about the next token. We need it in any case. */
3115 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3117 /* Do not check for ranges if we know they are not allowed. */
3118 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3120 if (BE (token
->type
== END_OF_RE
, 0))
3123 goto parse_bracket_exp_free_return
;
3125 if (token
->type
== OP_CHARSET_RANGE
)
3127 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3128 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3129 if (BE (token2
.type
== END_OF_RE
, 0))
3132 goto parse_bracket_exp_free_return
;
3134 if (token2
.type
== OP_CLOSE_BRACKET
)
3136 /* We treat the last '-' as a normal character. */
3137 re_string_skip_bytes (regexp
, -token_len
);
3138 token
->type
= CHARACTER
;
3145 if (is_range_exp
== 1)
3147 end_elem
.opr
.name
= end_name_buf
;
3148 end_elem
.type
= COLL_SYM
;
3149 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3151 if (BE (ret
!= REG_NOERROR
, 0))
3154 goto parse_bracket_exp_free_return
;
3157 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3160 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3161 &start_elem
, &end_elem
);
3163 # ifdef RE_ENABLE_I18N
3164 *err
= build_range_exp (sbcset
,
3165 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3166 &range_alloc
, &start_elem
, &end_elem
);
3168 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3170 #endif /* RE_ENABLE_I18N */
3171 if (BE (*err
!= REG_NOERROR
, 0))
3172 goto parse_bracket_exp_free_return
;
3176 switch (start_elem
.type
)
3179 bitset_set (sbcset
, start_elem
.opr
.ch
);
3181 #ifdef RE_ENABLE_I18N
3183 /* Check whether the array has enough space. */
3184 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3186 wchar_t *new_mbchars
;
3187 /* Not enough, realloc it. */
3188 /* +1 in case of mbcset->nmbchars is 0. */
3189 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3190 /* Use realloc since array is NULL if *alloc == 0. */
3191 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3193 if (BE (new_mbchars
== NULL
, 0))
3194 goto parse_bracket_exp_espace
;
3195 mbcset
->mbchars
= new_mbchars
;
3197 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3199 #endif /* RE_ENABLE_I18N */
3201 *err
= build_equiv_class (sbcset
,
3202 #ifdef RE_ENABLE_I18N
3203 mbcset
, &equiv_class_alloc
,
3204 #endif /* RE_ENABLE_I18N */
3205 start_elem
.opr
.name
);
3206 if (BE (*err
!= REG_NOERROR
, 0))
3207 goto parse_bracket_exp_free_return
;
3210 *err
= build_collating_symbol (sbcset
,
3211 #ifdef RE_ENABLE_I18N
3212 mbcset
, &coll_sym_alloc
,
3213 #endif /* RE_ENABLE_I18N */
3214 start_elem
.opr
.name
);
3215 if (BE (*err
!= REG_NOERROR
, 0))
3216 goto parse_bracket_exp_free_return
;
3219 *err
= build_charclass (regexp
->trans
, sbcset
,
3220 #ifdef RE_ENABLE_I18N
3221 mbcset
, &char_class_alloc
,
3222 #endif /* RE_ENABLE_I18N */
3223 start_elem
.opr
.name
, syntax
);
3224 if (BE (*err
!= REG_NOERROR
, 0))
3225 goto parse_bracket_exp_free_return
;
3232 if (BE (token
->type
== END_OF_RE
, 0))
3235 goto parse_bracket_exp_free_return
;
3237 if (token
->type
== OP_CLOSE_BRACKET
)
3241 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3243 /* If it is non-matching list. */
3245 bitset_not (sbcset
);
3247 #ifdef RE_ENABLE_I18N
3248 /* Ensure only single byte characters are set. */
3249 if (dfa
->mb_cur_max
> 1)
3250 bitset_mask (sbcset
, dfa
->sb_char
);
3252 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3253 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3254 || mbcset
->non_match
)))
3256 bin_tree_t
*mbc_tree
;
3258 /* Build a tree for complex bracket. */
3259 dfa
->has_mb_node
= 1;
3260 br_token
.type
= COMPLEX_BRACKET
;
3261 br_token
.opr
.mbcset
= mbcset
;
3262 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3263 if (BE (mbc_tree
== NULL
, 0))
3264 goto parse_bracket_exp_espace
;
3265 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3266 if (sbcset
[sbc_idx
])
3268 /* If there are no bits set in sbcset, there is no point
3269 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3270 if (sbc_idx
< BITSET_WORDS
)
3272 /* Build a tree for simple bracket. */
3273 br_token
.type
= SIMPLE_BRACKET
;
3274 br_token
.opr
.sbcset
= sbcset
;
3275 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3276 if (BE (work_tree
== NULL
, 0))
3277 goto parse_bracket_exp_espace
;
3279 /* Then join them by ALT node. */
3280 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3281 if (BE (work_tree
== NULL
, 0))
3282 goto parse_bracket_exp_espace
;
3287 work_tree
= mbc_tree
;
3291 #endif /* not RE_ENABLE_I18N */
3293 #ifdef RE_ENABLE_I18N
3294 free_charset (mbcset
);
3296 /* Build a tree for simple bracket. */
3297 br_token
.type
= SIMPLE_BRACKET
;
3298 br_token
.opr
.sbcset
= sbcset
;
3299 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3300 if (BE (work_tree
== NULL
, 0))
3301 goto parse_bracket_exp_espace
;
3305 parse_bracket_exp_espace
:
3307 parse_bracket_exp_free_return
:
3309 #ifdef RE_ENABLE_I18N
3310 free_charset (mbcset
);
3311 #endif /* RE_ENABLE_I18N */
3315 /* Parse an element in the bracket expression. */
3317 static reg_errcode_t
3318 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3319 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3320 reg_syntax_t syntax
, int accept_hyphen
)
3322 #ifdef RE_ENABLE_I18N
3324 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3325 if (cur_char_size
> 1)
3327 elem
->type
= MB_CHAR
;
3328 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3329 re_string_skip_bytes (regexp
, cur_char_size
);
3332 #endif /* RE_ENABLE_I18N */
3333 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3334 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3335 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3336 return parse_bracket_symbol (elem
, regexp
, token
);
3337 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3339 /* A '-' must only appear as anything but a range indicator before
3340 the closing bracket. Everything else is an error. */
3342 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3343 if (token2
.type
!= OP_CLOSE_BRACKET
)
3344 /* The actual error value is not standardized since this whole
3345 case is undefined. But ERANGE makes good sense. */
3348 elem
->type
= SB_CHAR
;
3349 elem
->opr
.ch
= token
->opr
.c
;
3353 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3354 such as [:<character_class>:], [.<collating_element>.], and
3355 [=<equivalent_class>=]. */
3357 static reg_errcode_t
3358 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3361 unsigned char ch
, delim
= token
->opr
.c
;
3363 if (re_string_eoi(regexp
))
3367 if (i
>= BRACKET_NAME_BUF_SIZE
)
3369 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3370 ch
= re_string_fetch_byte_case (regexp
);
3372 ch
= re_string_fetch_byte (regexp
);
3373 if (re_string_eoi(regexp
))
3375 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3377 elem
->opr
.name
[i
] = ch
;
3379 re_string_skip_bytes (regexp
, 1);
3380 elem
->opr
.name
[i
] = '\0';
3381 switch (token
->type
)
3383 case OP_OPEN_COLL_ELEM
:
3384 elem
->type
= COLL_SYM
;
3386 case OP_OPEN_EQUIV_CLASS
:
3387 elem
->type
= EQUIV_CLASS
;
3389 case OP_OPEN_CHAR_CLASS
:
3390 elem
->type
= CHAR_CLASS
;
3398 /* Helper function for parse_bracket_exp.
3399 Build the equivalence class which is represented by NAME.
3400 The result are written to MBCSET and SBCSET.
3401 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3402 is a pointer argument since we may update it. */
3404 static reg_errcode_t
3405 #ifdef RE_ENABLE_I18N
3406 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3407 int *equiv_class_alloc
, const unsigned char *name
)
3408 #else /* not RE_ENABLE_I18N */
3409 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3410 #endif /* not RE_ENABLE_I18N */
3413 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3416 const int32_t *table
, *indirect
;
3417 const unsigned char *weights
, *extra
, *cp
;
3418 unsigned char char_buf
[2];
3422 /* Calculate the index for equivalence class. */
3424 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3425 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3426 _NL_COLLATE_WEIGHTMB
);
3427 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3428 _NL_COLLATE_EXTRAMB
);
3429 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3430 _NL_COLLATE_INDIRECTMB
);
3431 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3432 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3433 /* This isn't a valid character. */
3434 return REG_ECOLLATE
;
3436 /* Build single byte matcing table for this equivalence class. */
3437 len
= weights
[idx1
& 0xffffff];
3438 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3442 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3447 /* This isn't a valid character. */
3449 /* Compare only if the length matches and the collation rule
3450 index is the same. */
3451 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3455 while (cnt
<= len
&&
3456 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3457 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3461 bitset_set (sbcset
, ch
);
3464 /* Check whether the array has enough space. */
3465 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3467 /* Not enough, realloc it. */
3468 /* +1 in case of mbcset->nequiv_classes is 0. */
3469 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3470 /* Use realloc since the array is NULL if *alloc == 0. */
3471 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3473 new_equiv_class_alloc
);
3474 if (BE (new_equiv_classes
== NULL
, 0))
3476 mbcset
->equiv_classes
= new_equiv_classes
;
3477 *equiv_class_alloc
= new_equiv_class_alloc
;
3479 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3484 if (BE (strlen ((const char *) name
) != 1, 0))
3485 return REG_ECOLLATE
;
3486 bitset_set (sbcset
, *name
);
3491 /* Helper function for parse_bracket_exp.
3492 Build the character class which is represented by NAME.
3493 The result are written to MBCSET and SBCSET.
3494 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3495 is a pointer argument since we may update it. */
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3500 re_charset_t
*mbcset
, int *char_class_alloc
,
3501 const unsigned char *class_name
, reg_syntax_t syntax
)
3502 #else /* not RE_ENABLE_I18N */
3503 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3504 const unsigned char *class_name
, reg_syntax_t syntax
)
3505 #endif /* not RE_ENABLE_I18N */
3508 const char *name
= (const char *) class_name
;
3510 /* In case of REG_ICASE "upper" and "lower" match the both of
3511 upper and lower cases. */
3512 if ((syntax
& RE_ICASE
)
3513 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3516 #ifdef RE_ENABLE_I18N
3517 /* Check the space of the arrays. */
3518 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3520 /* Not enough, realloc it. */
3521 /* +1 in case of mbcset->nchar_classes is 0. */
3522 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3523 /* Use realloc since array is NULL if *alloc == 0. */
3524 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3525 new_char_class_alloc
);
3526 if (BE (new_char_classes
== NULL
, 0))
3528 mbcset
->char_classes
= new_char_classes
;
3529 *char_class_alloc
= new_char_class_alloc
;
3531 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3532 #endif /* RE_ENABLE_I18N */
3534 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3536 if (BE (trans != NULL, 0)) \
3538 for (i = 0; i < SBC_MAX; ++i) \
3539 if (ctype_func (i)) \
3540 bitset_set (sbcset, trans[i]); \
3544 for (i = 0; i < SBC_MAX; ++i) \
3545 if (ctype_func (i)) \
3546 bitset_set (sbcset, i); \
3550 if (strcmp (name
, "alnum") == 0)
3551 BUILD_CHARCLASS_LOOP (isalnum
);
3552 else if (strcmp (name
, "cntrl") == 0)
3553 BUILD_CHARCLASS_LOOP (iscntrl
);
3554 else if (strcmp (name
, "lower") == 0)
3555 BUILD_CHARCLASS_LOOP (islower
);
3556 else if (strcmp (name
, "space") == 0)
3557 BUILD_CHARCLASS_LOOP (isspace
);
3558 else if (strcmp (name
, "alpha") == 0)
3559 BUILD_CHARCLASS_LOOP (isalpha
);
3560 else if (strcmp (name
, "digit") == 0)
3561 BUILD_CHARCLASS_LOOP (isdigit
);
3562 else if (strcmp (name
, "print") == 0)
3563 BUILD_CHARCLASS_LOOP (isprint
);
3564 else if (strcmp (name
, "upper") == 0)
3565 BUILD_CHARCLASS_LOOP (isupper
);
3566 else if (strcmp (name
, "blank") == 0)
3567 BUILD_CHARCLASS_LOOP (isblank
);
3568 else if (strcmp (name
, "graph") == 0)
3569 BUILD_CHARCLASS_LOOP (isgraph
);
3570 else if (strcmp (name
, "punct") == 0)
3571 BUILD_CHARCLASS_LOOP (ispunct
);
3572 else if (strcmp (name
, "xdigit") == 0)
3573 BUILD_CHARCLASS_LOOP (isxdigit
);
3581 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3582 const unsigned char *class_name
,
3583 const unsigned char *extra
, int non_match
,
3586 re_bitset_ptr_t sbcset
;
3587 #ifdef RE_ENABLE_I18N
3588 re_charset_t
*mbcset
;
3590 #endif /* not RE_ENABLE_I18N */
3592 re_token_t br_token
;
3595 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3596 #ifdef RE_ENABLE_I18N
3597 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3598 #endif /* RE_ENABLE_I18N */
3600 #ifdef RE_ENABLE_I18N
3601 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3602 #else /* not RE_ENABLE_I18N */
3603 if (BE (sbcset
== NULL
, 0))
3604 #endif /* not RE_ENABLE_I18N */
3612 #ifdef RE_ENABLE_I18N
3613 mbcset
->non_match
= 1;
3614 #endif /* not RE_ENABLE_I18N */
3617 /* We don't care the syntax in this case. */
3618 ret
= build_charclass (trans
, sbcset
,
3619 #ifdef RE_ENABLE_I18N
3621 #endif /* RE_ENABLE_I18N */
3624 if (BE (ret
!= REG_NOERROR
, 0))
3627 #ifdef RE_ENABLE_I18N
3628 free_charset (mbcset
);
3629 #endif /* RE_ENABLE_I18N */
3633 /* \w match '_' also. */
3634 for (; *extra
; extra
++)
3635 bitset_set (sbcset
, *extra
);
3637 /* If it is non-matching list. */
3639 bitset_not (sbcset
);
3641 #ifdef RE_ENABLE_I18N
3642 /* Ensure only single byte characters are set. */
3643 if (dfa
->mb_cur_max
> 1)
3644 bitset_mask (sbcset
, dfa
->sb_char
);
3647 /* Build a tree for simple bracket. */
3648 br_token
.type
= SIMPLE_BRACKET
;
3649 br_token
.opr
.sbcset
= sbcset
;
3650 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3651 if (BE (tree
== NULL
, 0))
3652 goto build_word_op_espace
;
3654 #ifdef RE_ENABLE_I18N
3655 if (dfa
->mb_cur_max
> 1)
3657 bin_tree_t
*mbc_tree
;
3658 /* Build a tree for complex bracket. */
3659 br_token
.type
= COMPLEX_BRACKET
;
3660 br_token
.opr
.mbcset
= mbcset
;
3661 dfa
->has_mb_node
= 1;
3662 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3663 if (BE (mbc_tree
== NULL
, 0))
3664 goto build_word_op_espace
;
3665 /* Then join them by ALT node. */
3666 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3667 if (BE (mbc_tree
!= NULL
, 1))
3672 free_charset (mbcset
);
3675 #else /* not RE_ENABLE_I18N */
3677 #endif /* not RE_ENABLE_I18N */
3679 build_word_op_espace
:
3681 #ifdef RE_ENABLE_I18N
3682 free_charset (mbcset
);
3683 #endif /* RE_ENABLE_I18N */
3688 /* This is intended for the expressions like "a{1,3}".
3689 Fetch a number from `input', and return the number.
3690 Return -1, if the number field is empty like "{,1}".
3691 Return -2, If an error is occured. */
3694 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3700 fetch_token (token
, input
, syntax
);
3702 if (BE (token
->type
== END_OF_RE
, 0))
3704 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3706 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3707 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3708 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
#ifdef RE_ENABLE_I18N
/* Release every array owned by CSET and then CSET itself.  */

static void
free_charset (re_charset_t *cset)
{
  re_free (cset->mbchars);
# ifdef _LIBC
  re_free (cset->coll_syms);
  re_free (cset->equiv_classes);
# endif
  /* The range arrays are filled by build_range_exp both with and
     without _LIBC, so they have to be released in both cases.  */
  re_free (cset->range_starts);
  re_free (cset->range_ends);
  re_free (cset->char_classes);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3729 /* Functions for binary tree operation. */
3731 /* Create a tree node for the token type TYPE with children LEFT and
   RIGHT, using storage owned by DFA.  The work is delegated to
   create_token_tree and its result is returned unchanged.  */
3734 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3735 re_token_type_t type
)
/* NOTE(review): the declaration and setup of the local token `t' are not
   visible in this extract; presumably only its type field is set from
   TYPE -- confirm.  */
3739 return create_token_tree (dfa
, left
, right
, &t
);
/* Create a tree node holding a copy of *TOKEN, with children LEFT and
   RIGHT.  Nodes are carved out of fixed-size chunks
   (BIN_TREE_STORAGE_SIZE entries each) chained from
   DFA->str_tree_storage, so individual nodes are not allocated one by
   one.
   NOTE(review): the declaration of `tree', the action taken when the
   chunk allocation fails, the assignment of the left child, any NULL
   guards around the parent links, and the return statement are not
   visible in this extract.  */
3743 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3744 const re_token_t
*token
)
/* The current chunk is full (expected to be the uncommon case, hence
   BE (..., 0)): get a fresh chunk.  */
3747 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3749 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3751 if (storage
== NULL
)
/* Push the new chunk on the front of the chain and restart the slot
   index.  */
3753 storage
->next
= dfa
->str_tree_storage
;
3754 dfa
->str_tree_storage
= storage
;
3755 dfa
->str_tree_storage_idx
= 0;
/* Take the next free slot of the current chunk.  */
3757 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
/* Initialize the node: no parent yet, a copy of *TOKEN with the
   duplicated and opt_subexp flags cleared, and node_idx set to -1.  */
3759 tree
->parent
= NULL
;
3761 tree
->right
= right
;
3762 tree
->token
= *token
;
3763 tree
->token
.duplicated
= 0;
3764 tree
->token
.opt_subexp
= 0;
3767 tree
->node_idx
= -1;
/* Link the children back to the new node.  Callers in this file pass
   NULL for both children, so these stores are presumably guarded by
   NULL checks that this extract does not show -- confirm.  */
3770 left
->parent
= tree
;
3772 right
->parent
= tree
;
3776 /* Mark NODE as an optional subexpression when it is a SUBEXP token
3777 whose index equals the one carried in EXTRA.
   To be called from preorder or postorder.  */
3779 static reg_errcode_t
3780 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
/* EXTRA is not dereferenced; it carries the subexpression index as an
   integer smuggled through the void * argument.
   NOTE(review): the value goes pointer -> long -> int, which assumes
   the caller stored a small integer; long can be narrower than a
   pointer on some targets -- confirm this is acceptable.  */
3782 int idx
= (int) (long) extra
;
/* Nodes of any other type, or SUBEXP nodes with another index, are left
   untouched.  */
3783 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3784 node
->token
.opt_subexp
= 1;
3789 /* Free the allocated memory inside NODE.  Only bracket tokens own
   memory here: a COMPLEX_BRACKET releases its multibyte set through
   free_charset (when RE_ENABLE_I18N is defined) and a SIMPLE_BRACKET
   releases its bitset through re_free.  Tokens with a nonzero
   `duplicated' flag are skipped; duplicate_tree sets that flag on the
   copies it makes, presumably so that a set shared with the original
   is released only once -- confirm.  */
3792 free_token (re_token_t
*node
)
3794 #ifdef RE_ENABLE_I18N
3795 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3796 free_charset (node
->opr
.mbcset
);
/* NOTE(review): how this branch is joined to the test below (for
   example with an `else') is not visible in this extract.  */
3798 #endif /* RE_ENABLE_I18N */
3799 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3800 re_free (node
->opr
.sbcset
);
3803 /* Worker function for tree walking.  Free the allocated memory inside
3804 NODE's token by calling free_token.  The visible code touches only
   NODE itself and does not use EXTRA; the children are presumably
   handled when the walker visits them -- confirm against the caller.
   NOTE(review): the return statement is not visible in this extract.  */
3806 static reg_errcode_t
3807 free_tree (void *extra
, bin_tree_t
*node
)
3809 free_token (&node
->token
);
3814 /* Duplicate the node ROOT, and return the new node. This is a preorder
3815 visit similar to the one implemented by the generic visitor, but
3816 we need more infrastructure to maintain two parallel trees --- so,
3817 it's easier to duplicate. */
3820 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3822 const bin_tree_t
*node
;
3823 bin_tree_t
*dup_root
;
3824 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3826 for (node
= root
; ; )
3828 /* Create a new tree and link it back to the current parent. */
3829 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3832 (*p_new
)->parent
= dup_node
;
3833 (*p_new
)->token
.duplicated
= 1;
3836 /* Go to the left node, or up and to the right. */
3840 p_new
= &dup_node
->left
;
3844 const bin_tree_t
*prev
= NULL
;
3845 while (node
->right
== prev
|| node
->right
== NULL
)
3848 node
= node
->parent
;
3849 dup_node
= dup_node
->parent
;
3854 p_new
= &dup_node
->right
;