1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
23 # include <locale/weight.h>
26 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
27 size_t length
, reg_syntax_t syntax
);
28 static void re_compile_fastmap_iter (regex_t
*bufp
,
29 const re_dfastate_t
*init_state
,
31 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
33 static void free_charset (re_charset_t
*cset
);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t
*preg
);
36 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static void optimize_utf8 (re_dfa_t
*dfa
);
40 static reg_errcode_t
analyze (regex_t
*preg
);
41 static reg_errcode_t
preorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
postorder (bin_tree_t
*root
,
45 reg_errcode_t (fn (void *, bin_tree_t
*)),
47 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
49 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
51 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
52 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
53 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
54 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
55 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
56 unsigned int constraint
);
57 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
58 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
60 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
61 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
63 static int peek_token (re_token_t
*token
, re_string_t
*input
,
64 reg_syntax_t syntax
) internal_function
;
65 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
66 reg_syntax_t syntax
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
80 re_dfa_t
*dfa
, re_token_t
*token
,
81 reg_syntax_t syntax
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
83 re_token_t
*token
, reg_syntax_t syntax
,
85 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
87 re_token_t
*token
, int token_len
,
91 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
95 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
97 int *equiv_class_alloc
,
98 const unsigned char *name
);
99 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
101 re_charset_t
*mbcset
,
102 int *char_class_alloc
,
103 const unsigned char *class_name
,
104 reg_syntax_t syntax
);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
107 const unsigned char *name
);
108 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 reg_syntax_t syntax
);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
114 RE_TRANSLATE_TYPE trans
,
115 const unsigned char *class_name
,
116 const unsigned char *extra
,
117 int non_match
, reg_errcode_t
*err
);
118 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
119 bin_tree_t
*left
, bin_tree_t
*right
,
120 re_token_type_t type
);
121 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
122 bin_tree_t
*left
, bin_tree_t
*right
,
123 const re_token_t
*token
);
124 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
125 static void free_token (re_token_t
*node
);
126 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
127 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid
[] attribute_hidden
=
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx
[] attribute_hidden
=
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
219 re_compile_pattern (const char *pattern
, size_t length
,
220 struct re_pattern_buffer
*bufp
)
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
229 /* Match anchors at newline. */
230 bufp
->newline_anchor
= 1;
232 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
236 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
239 weak_alias (__re_compile_pattern
, re_compile_pattern
)
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options
;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
258 re_set_syntax (reg_syntax_t syntax
)
260 reg_syntax_t ret
= re_syntax_options
;
262 re_syntax_options
= syntax
;
266 weak_alias (__re_set_syntax
, re_set_syntax
)
270 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
272 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
273 char *fastmap
= bufp
->fastmap
;
275 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
276 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_word
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_nl
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
283 bufp
->fastmap_accurate
= 1;
287 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
291 __attribute__ ((always_inline
))
292 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
296 fastmap
[tolower (ch
)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
303 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
306 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
308 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
309 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
311 int node
= init_state
->nodes
.elems
[node_cnt
];
312 re_token_type_t type
= dfa
->nodes
[node
].type
;
314 if (type
== CHARACTER
)
316 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
320 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
325 *p
++ = dfa
->nodes
[node
].opr
.c
;
326 while (++node
< dfa
->nodes_len
327 && dfa
->nodes
[node
].type
== CHARACTER
328 && dfa
->nodes
[node
].mb_partial
)
329 *p
++ = dfa
->nodes
[node
].opr
.c
;
330 memset (&state
, '\0', sizeof (state
));
331 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
333 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
335 re_set_fastmap (fastmap
, 0, buf
[0]);
339 else if (type
== SIMPLE_BRACKET
)
342 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
345 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
346 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
347 if (w
& ((bitset_word_t
) 1 << j
))
348 re_set_fastmap (fastmap
, icase
, ch
);
351 #ifdef RE_ENABLE_I18N
352 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
358 /* See if we have to try all bytes which start multiple collation
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
365 && (cset
->ncoll_syms
|| cset
->nranges
))
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa
->mb_cur_max
> 1
380 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
382 || cset
->nequiv_classes
390 memset (&mbs
, 0, sizeof (mbs
));
391 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
392 re_set_fastmap (fastmap
, false, (int) c
);
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i
= 0; i
< cset
->nmbchars
; ++i
)
404 memset (&state
, '\0', sizeof (state
));
405 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
406 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
407 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
409 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
411 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
416 #endif /* RE_ENABLE_I18N */
417 else if (type
== OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type
== OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type
== END_OF_RE
)
423 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
424 if (type
== END_OF_RE
)
425 bufp
->can_be_null
= 1;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t
*__restrict preg
, const char *__restrict pattern
, int cflags
)
471 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC
);
478 /* Try to allocate space for the fastmap. */
479 preg
->fastmap
= re_malloc (char, SBC_MAX
);
480 if (BE (preg
->fastmap
== NULL
, 0))
483 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags
& REG_NEWLINE
)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax
&= ~RE_DOT_NEWLINE
;
489 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
490 /* It also changes the matching behavior. */
491 preg
->newline_anchor
= 1;
494 preg
->newline_anchor
= 0;
495 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
496 preg
->translate
= NULL
;
498 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret
== REG_ERPAREN
)
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret
== REG_NOERROR
, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg
);
512 /* Some error occurred while compiling the expression. */
513 re_free (preg
->fastmap
);
514 preg
->fastmap
= NULL
;
520 weak_alias (__regcomp
, regcomp
)
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
527 regerror (int errcode
, const regex_t
*__restrict preg
, char *__restrict errbuf
,
534 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
535 / sizeof (__re_error_msgid_idx
[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
542 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
544 msg_size
= strlen (msg
) + 1; /* Includes the null. */
546 if (BE (errbuf_size
!= 0, 1))
548 if (BE (msg_size
> errbuf_size
, 0))
550 #if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
553 memcpy (errbuf
, msg
, errbuf_size
- 1);
554 errbuf
[errbuf_size
- 1] = 0;
558 memcpy (errbuf
, msg
, msg_size
);
564 weak_alias (__regerror
, regerror
)
568 #ifdef RE_ENABLE_I18N
569 /* This static array is used for the map to single-byte characters when
570 UTF-8 is used. Otherwise we would allocate memory just to initialize
571 it the same all the time. UTF-8 is the preferred encoding so this is
572 a worthwhile optimization. */
573 static const bitset_t utf8_sb_map
=
575 /* Set the first 128 bits. */
576 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
582 free_dfa_content (re_dfa_t
*dfa
)
587 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
588 free_token (dfa
->nodes
+ i
);
589 re_free (dfa
->nexts
);
590 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
592 if (dfa
->eclosures
!= NULL
)
593 re_node_set_free (dfa
->eclosures
+ i
);
594 if (dfa
->inveclosures
!= NULL
)
595 re_node_set_free (dfa
->inveclosures
+ i
);
596 if (dfa
->edests
!= NULL
)
597 re_node_set_free (dfa
->edests
+ i
);
599 re_free (dfa
->edests
);
600 re_free (dfa
->eclosures
);
601 re_free (dfa
->inveclosures
);
602 re_free (dfa
->nodes
);
604 if (dfa
->state_table
)
605 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
607 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
608 for (j
= 0; j
< entry
->num
; ++j
)
610 re_dfastate_t
*state
= entry
->array
[j
];
613 re_free (entry
->array
);
615 re_free (dfa
->state_table
);
616 #ifdef RE_ENABLE_I18N
617 if (dfa
->sb_char
!= utf8_sb_map
)
618 re_free (dfa
->sb_char
);
620 re_free (dfa
->subexp_map
);
622 re_free (dfa
->re_str
);
629 /* Free dynamically allocated space used by PREG. */
632 regfree (regex_t
*preg
)
634 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
635 if (BE (dfa
!= NULL
, 1))
636 free_dfa_content (dfa
);
640 re_free (preg
->fastmap
);
641 preg
->fastmap
= NULL
;
643 re_free (preg
->translate
);
644 preg
->translate
= NULL
;
647 weak_alias (__regfree
, regfree
)
650 /* Entry points compatible with 4.2 BSD regex library. We don't define
651 them unless specifically requested. */
653 #if defined _REGEX_RE_COMP || defined _LIBC
655 /* BSD has one and only one pattern buffer. */
656 static struct re_pattern_buffer re_comp_buf
;
660 /* Make these definitions weak in libc, so POSIX programs can redefine
661 these names if they don't use our functions, and still use
662 regcomp/regexec above without link errors. */
665 re_comp (const char *s
)
672 if (!re_comp_buf
.buffer
)
673 return gettext ("No previous regular expression");
677 if (re_comp_buf
.buffer
)
679 fastmap
= re_comp_buf
.fastmap
;
680 re_comp_buf
.fastmap
= NULL
;
681 __regfree (&re_comp_buf
);
682 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
683 re_comp_buf
.fastmap
= fastmap
;
686 if (re_comp_buf
.fastmap
== NULL
)
688 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
689 if (re_comp_buf
.fastmap
== NULL
)
690 return (char *) gettext (__re_error_msgid
691 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
694 /* Since 're_exec' always passes NULL for the 'regs' argument, we
695 don't need to initialize the pattern buffer fields which affect it. */
697 /* Match anchors at newlines. */
698 re_comp_buf
.newline_anchor
= 1;
700 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
705 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
706 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
710 libc_freeres_fn (free_mem
)
712 __regfree (&re_comp_buf
);
716 #endif /* _REGEX_RE_COMP */
718 /* Internal entry point.
719 Compile the regular expression PATTERN, whose length is LENGTH.
720 SYNTAX indicate regular expression's syntax. */
723 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
726 reg_errcode_t err
= REG_NOERROR
;
730 /* Initialize the pattern buffer. */
731 preg
->fastmap_accurate
= 0;
732 preg
->syntax
= syntax
;
733 preg
->not_bol
= preg
->not_eol
= 0;
736 preg
->can_be_null
= 0;
737 preg
->regs_allocated
= REGS_UNALLOCATED
;
739 /* Initialize the dfa. */
740 dfa
= (re_dfa_t
*) preg
->buffer
;
741 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
743 /* If zero allocated, but buffer is non-null, try to realloc
744 enough space. This loses if buffer's address is bogus, but
745 that is the user's responsibility. If ->buffer is NULL this
746 is a simple allocation. */
747 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
750 preg
->allocated
= sizeof (re_dfa_t
);
751 preg
->buffer
= (unsigned char *) dfa
;
753 preg
->used
= sizeof (re_dfa_t
);
755 err
= init_dfa (dfa
, length
);
756 if (BE (err
!= REG_NOERROR
, 0))
758 free_dfa_content (dfa
);
764 /* Note: length+1 will not overflow since it is checked in init_dfa. */
765 dfa
->re_str
= re_malloc (char, length
+ 1);
766 strncpy (dfa
->re_str
, pattern
, length
+ 1);
769 __libc_lock_init (dfa
->lock
);
771 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
772 syntax
& RE_ICASE
, dfa
);
773 if (BE (err
!= REG_NOERROR
, 0))
775 re_compile_internal_free_return
:
776 free_workarea_compile (preg
);
777 re_string_destruct (®exp
);
778 free_dfa_content (dfa
);
784 /* Parse the regular expression, and build a structure tree. */
786 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
787 if (BE (dfa
->str_tree
== NULL
, 0))
788 goto re_compile_internal_free_return
;
790 /* Analyze the tree and create the nfa. */
791 err
= analyze (preg
);
792 if (BE (err
!= REG_NOERROR
, 0))
793 goto re_compile_internal_free_return
;
795 #ifdef RE_ENABLE_I18N
796 /* If possible, do searching in single byte encoding to speed things up. */
797 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
801 /* Then create the initial state of the dfa. */
802 err
= create_initial_state (dfa
);
804 /* Release work areas. */
805 free_workarea_compile (preg
);
806 re_string_destruct (®exp
);
808 if (BE (err
!= REG_NOERROR
, 0))
810 free_dfa_content (dfa
);
818 /* Initialize DFA. We use the length of the regular expression PAT_LEN
819 as the initial length of some arrays. */
822 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
824 unsigned int table_size
;
829 memset (dfa
, '\0', sizeof (re_dfa_t
));
831 /* Force allocation of str_tree_storage the first time. */
832 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
834 /* Avoid overflows. */
835 if (pat_len
== SIZE_MAX
)
838 dfa
->nodes_alloc
= pat_len
+ 1;
839 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
841 /* table_size = 2 ^ ceil(log pat_len) */
842 for (table_size
= 1; ; table_size
<<= 1)
843 if (table_size
> pat_len
)
846 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
847 dfa
->state_hash_mask
= table_size
- 1;
849 dfa
->mb_cur_max
= MB_CUR_MAX
;
851 if (dfa
->mb_cur_max
== 6
852 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
854 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
857 # ifdef HAVE_LANGINFO_CODESET
858 codeset_name
= nl_langinfo (CODESET
);
860 codeset_name
= getenv ("LC_ALL");
861 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
862 codeset_name
= getenv ("LC_CTYPE");
863 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
864 codeset_name
= getenv ("LANG");
865 if (codeset_name
== NULL
)
867 else if (strchr (codeset_name
, '.') != NULL
)
868 codeset_name
= strchr (codeset_name
, '.') + 1;
871 if (strcasecmp (codeset_name
, "UTF-8") == 0
872 || strcasecmp (codeset_name
, "UTF8") == 0)
875 /* We check exhaustively in the loop below if this charset is a
876 superset of ASCII. */
877 dfa
->map_notascii
= 0;
880 #ifdef RE_ENABLE_I18N
881 if (dfa
->mb_cur_max
> 1)
884 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
889 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
890 if (BE (dfa
->sb_char
== NULL
, 0))
893 /* Set the bits corresponding to single byte chars. */
894 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
895 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
897 wint_t wch
= __btowc (ch
);
899 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
901 if (isascii (ch
) && wch
!= ch
)
902 dfa
->map_notascii
= 1;
909 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
914 /* Initialize WORD_CHAR table, which indicate which character is
915 "word". In this case "word" means that it is the word construction
916 character used by some operators like "\<", "\>", etc. */
920 init_word_char (re_dfa_t
*dfa
)
922 dfa
->word_ops_used
= 1;
925 if (BE (dfa
->map_notascii
== 0, 1))
927 if (sizeof (dfa
->word_char
[0]) == 8)
929 /* The extra temporaries here avoid "implicitly truncated"
930 warnings in the case when this is dead code, i.e. 32-bit. */
931 const uint64_t wc0
= UINT64_C (0x03ff000000000000);
932 const uint64_t wc1
= UINT64_C (0x07fffffe87fffffe);
933 dfa
->word_char
[0] = wc0
;
934 dfa
->word_char
[1] = wc1
;
937 else if (sizeof (dfa
->word_char
[0]) == 4)
939 dfa
->word_char
[0] = UINT32_C (0x00000000);
940 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
941 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
942 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
949 if (BE (dfa
->is_utf8
, 1))
951 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
956 for (; i
< BITSET_WORDS
; ++i
)
957 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
958 if (isalnum (ch
) || ch
== '_')
959 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
962 /* Free the work area which are only used while compiling. */
965 free_workarea_compile (regex_t
*preg
)
967 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
968 bin_tree_storage_t
*storage
, *next
;
969 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
971 next
= storage
->next
;
974 dfa
->str_tree_storage
= NULL
;
975 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
976 dfa
->str_tree
= NULL
;
977 re_free (dfa
->org_indices
);
978 dfa
->org_indices
= NULL
;
981 /* Create initial states for all contexts. */
984 create_initial_state (re_dfa_t
*dfa
)
988 re_node_set init_nodes
;
990 /* Initial states have the epsilon closure of the node which is
991 the first node of the regular expression. */
992 first
= dfa
->str_tree
->first
->node_idx
;
993 dfa
->init_node
= first
;
994 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
995 if (BE (err
!= REG_NOERROR
, 0))
998 /* The back-references which are in initial states can epsilon transit,
999 since in this case all of the subexpressions can be null.
1000 Then we add epsilon closures of the nodes which are the next nodes of
1001 the back-references. */
1002 if (dfa
->nbackref
> 0)
1003 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1005 int node_idx
= init_nodes
.elems
[i
];
1006 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1009 if (type
!= OP_BACK_REF
)
1011 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1013 re_token_t
*clexp_node
;
1014 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1015 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1016 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1019 if (clexp_idx
== init_nodes
.nelem
)
1022 if (type
== OP_BACK_REF
)
1024 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1025 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1027 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1030 if (err
!= REG_NOERROR
)
1037 /* It must be the first time to invoke acquire_state. */
1038 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1039 /* We don't check ERR here, since the initial state must not be NULL. */
1040 if (BE (dfa
->init_state
== NULL
, 0))
1042 if (dfa
->init_state
->has_constraint
)
1044 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1046 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1048 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1052 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1053 || dfa
->init_state_begbuf
== NULL
, 0))
1057 dfa
->init_state_word
= dfa
->init_state_nl
1058 = dfa
->init_state_begbuf
= dfa
->init_state
;
1060 re_node_set_free (&init_nodes
);
1064 #ifdef RE_ENABLE_I18N
1065 /* If it is possible to do searching in single byte encoding instead of UTF-8
1066 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1067 DFA nodes where needed. */
1070 optimize_utf8 (re_dfa_t
*dfa
)
1072 int node
, i
, mb_chars
= 0, has_period
= 0;
1074 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1075 switch (dfa
->nodes
[node
].type
)
1078 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1082 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1090 /* Word anchors etc. cannot be handled. It's okay to test
1091 opr.ctx_type since constraints (for all DFA nodes) are
1092 created by ORing one or more opr.ctx_type values. */
1102 case OP_DUP_ASTERISK
:
1103 case OP_OPEN_SUBEXP
:
1104 case OP_CLOSE_SUBEXP
:
1106 case COMPLEX_BRACKET
:
1108 case SIMPLE_BRACKET
:
1109 /* Just double check. The non-ASCII range starts at 0x80. */
1110 assert (0x80 % BITSET_WORD_BITS
== 0);
1111 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1112 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1119 if (mb_chars
|| has_period
)
1120 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1122 if (dfa
->nodes
[node
].type
== CHARACTER
1123 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1124 dfa
->nodes
[node
].mb_partial
= 0;
1125 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1126 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1129 /* The search can be in single byte locale. */
1130 dfa
->mb_cur_max
= 1;
1132 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1136 /* Analyze the structure tree, and calculate "first", "next", "edest",
1137 "eclosure", and "inveclosure". */
1139 static reg_errcode_t
1140 analyze (regex_t
*preg
)
1142 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1145 /* Allocate arrays. */
1146 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1147 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1148 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1149 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1150 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1151 || dfa
->eclosures
== NULL
, 0))
1154 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1155 if (dfa
->subexp_map
!= NULL
)
1158 for (i
= 0; i
< preg
->re_nsub
; i
++)
1159 dfa
->subexp_map
[i
] = i
;
1160 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1161 for (i
= 0; i
< preg
->re_nsub
; i
++)
1162 if (dfa
->subexp_map
[i
] != i
)
1164 if (i
== preg
->re_nsub
)
1166 free (dfa
->subexp_map
);
1167 dfa
->subexp_map
= NULL
;
1171 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1172 if (BE (ret
!= REG_NOERROR
, 0))
1174 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1175 if (BE (ret
!= REG_NOERROR
, 0))
1177 preorder (dfa
->str_tree
, calc_next
, dfa
);
1178 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1179 if (BE (ret
!= REG_NOERROR
, 0))
1181 ret
= calc_eclosure (dfa
);
1182 if (BE (ret
!= REG_NOERROR
, 0))
1185 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1187 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1190 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1191 if (BE (dfa
->inveclosures
== NULL
, 0))
1193 ret
= calc_inveclosure (dfa
);
1199 /* Our parse trees are very unbalanced, so we cannot use a stack to
1200 implement parse tree visits. Instead, we use parent pointers and
1201 some hairy code in these two functions. */
1202 static reg_errcode_t
1203 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1206 bin_tree_t
*node
, *prev
;
1208 for (node
= root
; ; )
1210 /* Descend down the tree, preferably to the left (or to the right
1211 if that's the only child). */
1212 while (node
->left
|| node
->right
)
1220 reg_errcode_t err
= fn (extra
, node
);
1221 if (BE (err
!= REG_NOERROR
, 0))
1223 if (node
->parent
== NULL
)
1226 node
= node
->parent
;
1228 /* Go up while we have a node that is reached from the right. */
1229 while (node
->right
== prev
|| node
->right
== NULL
);
1234 static reg_errcode_t
1235 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1240 for (node
= root
; ; )
1242 reg_errcode_t err
= fn (extra
, node
);
1243 if (BE (err
!= REG_NOERROR
, 0))
1246 /* Go to the left node, or up and to the right. */
1251 bin_tree_t
*prev
= NULL
;
1252 while (node
->right
== prev
|| node
->right
== NULL
)
1255 node
= node
->parent
;
1264 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1266 backreferences as well. Requires a preorder visit. */
1267 static reg_errcode_t
1268 optimize_subexps (void *extra
, bin_tree_t
*node
)
1270 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1272 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1274 int idx
= node
->token
.opr
.idx
;
1275 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1276 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1279 else if (node
->token
.type
== SUBEXP
1280 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1282 int other_idx
= node
->left
->token
.opr
.idx
;
1284 node
->left
= node
->left
->left
;
1286 node
->left
->parent
= node
;
1288 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1289 if (other_idx
< BITSET_WORD_BITS
)
1290 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1296 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1297 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1298 static reg_errcode_t
1299 lower_subexps (void *extra
, bin_tree_t
*node
)
1301 regex_t
*preg
= (regex_t
*) extra
;
1302 reg_errcode_t err
= REG_NOERROR
;
1304 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1306 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1308 node
->left
->parent
= node
;
1310 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1312 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1314 node
->right
->parent
= node
;
1321 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1323 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1324 bin_tree_t
*body
= node
->left
;
1325 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1328 /* We do not optimize empty subexpressions, because otherwise we may
1329 have bad CONCAT nodes with NULL children. This is obviously not
1330 very common, so we do not lose much. An example that triggers
1331 this case is the sed "script" /\(\)/x. */
1332 && node
->left
!= NULL
1333 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1334 || !(dfa
->used_bkref_map
1335 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1338 /* Convert the SUBEXP node to the concatenation of an
1339 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1340 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1341 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1342 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1343 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1344 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1350 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1351 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356 nodes. Requires a postorder visit. */
1357 static reg_errcode_t
1358 calc_first (void *extra
, bin_tree_t
*node
)
1360 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1361 if (node
->token
.type
== CONCAT
)
1363 node
->first
= node
->left
->first
;
1364 node
->node_idx
= node
->left
->node_idx
;
1369 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1370 if (BE (node
->node_idx
== -1, 0))
1372 if (node
->token
.type
== ANCHOR
)
1373 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1378 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1379 static reg_errcode_t
1380 calc_next (void *extra
, bin_tree_t
*node
)
1382 switch (node
->token
.type
)
1384 case OP_DUP_ASTERISK
:
1385 node
->left
->next
= node
;
1388 node
->left
->next
= node
->right
->first
;
1389 node
->right
->next
= node
->next
;
1393 node
->left
->next
= node
->next
;
1395 node
->right
->next
= node
->next
;
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1402 static reg_errcode_t
1403 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1405 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1406 int idx
= node
->node_idx
;
1407 reg_errcode_t err
= REG_NOERROR
;
1409 switch (node
->token
.type
)
1415 assert (node
->next
== NULL
);
1418 case OP_DUP_ASTERISK
:
1422 dfa
->has_plural_match
= 1;
1423 if (node
->left
!= NULL
)
1424 left
= node
->left
->first
->node_idx
;
1426 left
= node
->next
->node_idx
;
1427 if (node
->right
!= NULL
)
1428 right
= node
->right
->first
->node_idx
;
1430 right
= node
->next
->node_idx
;
1432 assert (right
> -1);
1433 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1438 case OP_OPEN_SUBEXP
:
1439 case OP_CLOSE_SUBEXP
:
1440 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1444 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1445 if (node
->token
.type
== OP_BACK_REF
)
1446 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1450 assert (!IS_EPSILON_NODE (node
->token
.type
));
1451 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1458 /* Duplicate the epsilon closure of the node ROOT_NODE.
1459 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1460 to their own constraint. */
1462 static reg_errcode_t
1464 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1465 int root_node
, unsigned int init_constraint
)
1467 int org_node
, clone_node
, ret
;
1468 unsigned int constraint
= init_constraint
;
1469 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1471 int org_dest
, clone_dest
;
1472 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1474 /* If the back reference epsilon-transit, its destination must
1475 also have the constraint. Then duplicate the epsilon closure
1476 of the destination of the back reference, and store it in
1477 edests of the back reference. */
1478 org_dest
= dfa
->nexts
[org_node
];
1479 re_node_set_empty (dfa
->edests
+ clone_node
);
1480 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1481 if (BE (clone_dest
== -1, 0))
1483 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1484 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1485 if (BE (ret
< 0, 0))
1488 else if (dfa
->edests
[org_node
].nelem
== 0)
1490 /* In case of the node can't epsilon-transit, don't duplicate the
1491 destination and store the original destination as the
1492 destination of the node. */
1493 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1496 else if (dfa
->edests
[org_node
].nelem
== 1)
1498 /* In case of the node can epsilon-transit, and it has only one
1500 org_dest
= dfa
->edests
[org_node
].elems
[0];
1501 re_node_set_empty (dfa
->edests
+ clone_node
);
1502 /* If the node is root_node itself, it means the epsilon closure
1503 has a loop. Then tie it to the destination of the root_node. */
1504 if (org_node
== root_node
&& clone_node
!= org_node
)
1506 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1507 if (BE (ret
< 0, 0))
1511 /* In case the node has another constraint, append it. */
1512 constraint
|= dfa
->nodes
[org_node
].constraint
;
1513 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1514 if (BE (clone_dest
== -1, 0))
1516 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1517 if (BE (ret
< 0, 0))
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524 org_dest
= dfa
->edests
[org_node
].elems
[0];
1525 re_node_set_empty (dfa
->edests
+ clone_node
);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1528 if (clone_dest
== -1)
1530 /* There is no such duplicated node, create a new one. */
1532 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1533 if (BE (clone_dest
== -1, 0))
1535 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1536 if (BE (ret
< 0, 0))
1538 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1539 root_node
, constraint
);
1540 if (BE (err
!= REG_NOERROR
, 0))
1545 /* There is a duplicated node which satisfies the constraint,
1546 use it to avoid infinite loop. */
1547 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1548 if (BE (ret
< 0, 0))
1552 org_dest
= dfa
->edests
[org_node
].elems
[1];
1553 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1554 if (BE (clone_dest
== -1, 0))
1556 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1557 if (BE (ret
< 0, 0))
1560 org_node
= org_dest
;
1561 clone_node
= clone_dest
;
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567 satisfies the constraint CONSTRAINT. */
1570 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1571 unsigned int constraint
)
1574 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1576 if (org_node
== dfa
->org_indices
[idx
]
1577 && constraint
== dfa
->nodes
[idx
].constraint
)
1578 return idx
; /* Found. */
1580 return -1; /* Not found. */
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584 Return the index of the new node, or -1 if insufficient storage is
1588 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1590 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1591 if (BE (dup_idx
!= -1, 1))
1593 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1594 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1595 dfa
->nodes
[dup_idx
].duplicated
= 1;
1597 /* Store the index of the original node. */
1598 dfa
->org_indices
[dup_idx
] = org_idx
;
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t
*dfa
)
1607 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1608 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1610 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1612 int *elems
= dfa
->eclosures
[src
].elems
;
1613 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1615 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1616 if (BE (ret
== -1, 0))
1624 /* Calculate "eclosure" for all the node in DFA. */
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t
*dfa
)
1629 int node_idx
, incomplete
;
1631 assert (dfa
->nodes_len
> 0);
1634 /* For each nodes, calculate epsilon closure. */
1635 for (node_idx
= 0; ; ++node_idx
)
1638 re_node_set eclosure_elem
;
1639 if (node_idx
== dfa
->nodes_len
)
1648 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1651 /* If we have already calculated, skip it. */
1652 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1654 /* Calculate epsilon closure of 'node_idx'. */
1655 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1656 if (BE (err
!= REG_NOERROR
, 0))
1659 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1662 re_node_set_free (&eclosure_elem
);
1668 /* Calculate epsilon closure of NODE. */
1670 static reg_errcode_t
1671 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1675 re_node_set eclosure
;
1678 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1679 if (BE (err
!= REG_NOERROR
, 0))
1682 /* This indicates that we are calculating this node now.
1683 We reference this value to avoid infinite loop. */
1684 dfa
->eclosures
[node
].nelem
= -1;
1686 /* If the current node has constraints, duplicate all nodes
1687 since they must inherit the constraints. */
1688 if (dfa
->nodes
[node
].constraint
1689 && dfa
->edests
[node
].nelem
1690 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1692 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1693 dfa
->nodes
[node
].constraint
);
1694 if (BE (err
!= REG_NOERROR
, 0))
1698 /* Expand each epsilon destination nodes. */
1699 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1700 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1702 re_node_set eclosure_elem
;
1703 int edest
= dfa
->edests
[node
].elems
[i
];
1704 /* If calculating the epsilon closure of `edest' is in progress,
1705 return intermediate result. */
1706 if (dfa
->eclosures
[edest
].nelem
== -1)
1711 /* If we haven't calculated the epsilon closure of `edest' yet,
1712 calculate now. Otherwise use calculated epsilon closure. */
1713 if (dfa
->eclosures
[edest
].nelem
== 0)
1715 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1716 if (BE (err
!= REG_NOERROR
, 0))
1720 eclosure_elem
= dfa
->eclosures
[edest
];
1721 /* Merge the epsilon closure of 'edest'. */
1722 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1723 if (BE (err
!= REG_NOERROR
, 0))
1725 /* If the epsilon closure of 'edest' is incomplete,
1726 the epsilon closure of this node is also incomplete. */
1727 if (dfa
->eclosures
[edest
].nelem
== 0)
1730 re_node_set_free (&eclosure_elem
);
1734 /* An epsilon closure includes itself. */
1735 ret
= re_node_set_insert (&eclosure
, node
);
1736 if (BE (ret
< 0, 0))
1738 if (incomplete
&& !root
)
1739 dfa
->eclosures
[node
].nelem
= 0;
1741 dfa
->eclosures
[node
] = eclosure
;
1742 *new_set
= eclosure
;
1746 /* Functions for token which are used in the parser. */
1748 /* Fetch a token from INPUT.
1749 We must not use this function inside bracket expressions. */
1753 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1755 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1758 /* Peek a token from INPUT, and return the length of the token.
1759 We must not use this function inside bracket expressions. */
1763 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1767 if (re_string_eoi (input
))
1769 token
->type
= END_OF_RE
;
1773 c
= re_string_peek_byte (input
, 0);
1776 token
->word_char
= 0;
1777 #ifdef RE_ENABLE_I18N
1778 token
->mb_partial
= 0;
1779 if (input
->mb_cur_max
> 1 &&
1780 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1782 token
->type
= CHARACTER
;
1783 token
->mb_partial
= 1;
1790 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1792 token
->type
= BACK_SLASH
;
1796 c2
= re_string_peek_byte_case (input
, 1);
1798 token
->type
= CHARACTER
;
1799 #ifdef RE_ENABLE_I18N
1800 if (input
->mb_cur_max
> 1)
1802 wint_t wc
= re_string_wchar_at (input
,
1803 re_string_cur_idx (input
) + 1);
1804 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1808 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1813 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1814 token
->type
= OP_ALT
;
1816 case '1': case '2': case '3': case '4': case '5':
1817 case '6': case '7': case '8': case '9':
1818 if (!(syntax
& RE_NO_BK_REFS
))
1820 token
->type
= OP_BACK_REF
;
1821 token
->opr
.idx
= c2
- '1';
1825 if (!(syntax
& RE_NO_GNU_OPS
))
1827 token
->type
= ANCHOR
;
1828 token
->opr
.ctx_type
= WORD_FIRST
;
1832 if (!(syntax
& RE_NO_GNU_OPS
))
1834 token
->type
= ANCHOR
;
1835 token
->opr
.ctx_type
= WORD_LAST
;
1839 if (!(syntax
& RE_NO_GNU_OPS
))
1841 token
->type
= ANCHOR
;
1842 token
->opr
.ctx_type
= WORD_DELIM
;
1846 if (!(syntax
& RE_NO_GNU_OPS
))
1848 token
->type
= ANCHOR
;
1849 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1853 if (!(syntax
& RE_NO_GNU_OPS
))
1854 token
->type
= OP_WORD
;
1857 if (!(syntax
& RE_NO_GNU_OPS
))
1858 token
->type
= OP_NOTWORD
;
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1862 token
->type
= OP_SPACE
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1866 token
->type
= OP_NOTSPACE
;
1869 if (!(syntax
& RE_NO_GNU_OPS
))
1871 token
->type
= ANCHOR
;
1872 token
->opr
.ctx_type
= BUF_FIRST
;
1876 if (!(syntax
& RE_NO_GNU_OPS
))
1878 token
->type
= ANCHOR
;
1879 token
->opr
.ctx_type
= BUF_LAST
;
1883 if (!(syntax
& RE_NO_BK_PARENS
))
1884 token
->type
= OP_OPEN_SUBEXP
;
1887 if (!(syntax
& RE_NO_BK_PARENS
))
1888 token
->type
= OP_CLOSE_SUBEXP
;
1891 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1892 token
->type
= OP_DUP_PLUS
;
1895 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1896 token
->type
= OP_DUP_QUESTION
;
1899 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1900 token
->type
= OP_OPEN_DUP_NUM
;
1903 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1904 token
->type
= OP_CLOSE_DUP_NUM
;
1912 token
->type
= CHARACTER
;
1913 #ifdef RE_ENABLE_I18N
1914 if (input
->mb_cur_max
> 1)
1916 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1917 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1921 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1926 if (syntax
& RE_NEWLINE_ALT
)
1927 token
->type
= OP_ALT
;
1930 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1931 token
->type
= OP_ALT
;
1934 token
->type
= OP_DUP_ASTERISK
;
1937 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1938 token
->type
= OP_DUP_PLUS
;
1941 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1942 token
->type
= OP_DUP_QUESTION
;
1945 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1946 token
->type
= OP_OPEN_DUP_NUM
;
1949 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1950 token
->type
= OP_CLOSE_DUP_NUM
;
1953 if (syntax
& RE_NO_BK_PARENS
)
1954 token
->type
= OP_OPEN_SUBEXP
;
1957 if (syntax
& RE_NO_BK_PARENS
)
1958 token
->type
= OP_CLOSE_SUBEXP
;
1961 token
->type
= OP_OPEN_BRACKET
;
1964 token
->type
= OP_PERIOD
;
1967 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1968 re_string_cur_idx (input
) != 0)
1970 char prev
= re_string_peek_byte (input
, -1);
1971 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1974 token
->type
= ANCHOR
;
1975 token
->opr
.ctx_type
= LINE_FIRST
;
1978 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1979 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1982 re_string_skip_bytes (input
, 1);
1983 peek_token (&next
, input
, syntax
);
1984 re_string_skip_bytes (input
, -1);
1985 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1988 token
->type
= ANCHOR
;
1989 token
->opr
.ctx_type
= LINE_LAST
;
1997 /* Peek a token from INPUT, and return the length of the token.
1998 We must not use this function out of bracket expressions. */
2002 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2005 if (re_string_eoi (input
))
2007 token
->type
= END_OF_RE
;
2010 c
= re_string_peek_byte (input
, 0);
2013 #ifdef RE_ENABLE_I18N
2014 if (input
->mb_cur_max
> 1 &&
2015 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2017 token
->type
= CHARACTER
;
2020 #endif /* RE_ENABLE_I18N */
2022 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2023 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2025 /* In this case, '\' escape a character. */
2027 re_string_skip_bytes (input
, 1);
2028 c2
= re_string_peek_byte (input
, 0);
2030 token
->type
= CHARACTER
;
2033 if (c
== '[') /* '[' is a special char in a bracket exps. */
2037 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2038 c2
= re_string_peek_byte (input
, 1);
2046 token
->type
= OP_OPEN_COLL_ELEM
;
2049 token
->type
= OP_OPEN_EQUIV_CLASS
;
2052 if (syntax
& RE_CHAR_CLASSES
)
2054 token
->type
= OP_OPEN_CHAR_CLASS
;
2057 /* else fall through. */
2059 token
->type
= CHARACTER
;
2069 token
->type
= OP_CHARSET_RANGE
;
2072 token
->type
= OP_CLOSE_BRACKET
;
2075 token
->type
= OP_NON_MATCH_LIST
;
2078 token
->type
= CHARACTER
;
2083 /* Functions for parser. */
2085 /* Entry point of the parser.
2086 Parse the regular expression REGEXP and return the structure tree.
2087 If an error occurs, ERR is set by error code, and return NULL.
2088 This function build the following tree, from regular expression <reg_exp>:
2094 CAT means concatenation.
2095 EOR means end of regular expression. */
2098 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2101 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2102 bin_tree_t
*tree
, *eor
, *root
;
2103 re_token_t current_token
;
2104 dfa
->syntax
= syntax
;
2105 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2106 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2107 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2109 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2111 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2114 if (BE (eor
== NULL
|| root
== NULL
, 0))
2122 /* This function build the following tree, from regular expression
2123 <branch1>|<branch2>:
2129 ALT means alternative, which represents the operator '|'. */
2132 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2133 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2135 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2136 bin_tree_t
*tree
, *branch
= NULL
;
2137 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2138 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2141 while (token
->type
== OP_ALT
)
2143 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2144 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2145 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2147 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2148 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2151 postorder (tree
, free_tree
, NULL
);
2157 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2158 if (BE (tree
== NULL
, 0))
2167 /* This function build the following tree, from regular expression
2174 CAT means concatenation. */
2177 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2178 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2180 bin_tree_t
*tree
, *exp
;
2181 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2182 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2183 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2186 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2187 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2189 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2190 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2193 postorder (tree
, free_tree
, NULL
);
2196 if (tree
!= NULL
&& exp
!= NULL
)
2198 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2199 if (newtree
== NULL
)
2201 postorder (exp
, free_tree
, NULL
);
2202 postorder (tree
, free_tree
, NULL
);
2208 else if (tree
== NULL
)
2210 /* Otherwise exp == NULL, we don't need to create new tree. */
2215 /* This function build the following tree, from regular expression a*:
2222 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2223 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2225 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2227 switch (token
->type
)
2230 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2231 if (BE (tree
== NULL
, 0))
2236 #ifdef RE_ENABLE_I18N
2237 if (dfa
->mb_cur_max
> 1)
2239 while (!re_string_eoi (regexp
)
2240 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2242 bin_tree_t
*mbc_remain
;
2243 fetch_token (token
, regexp
, syntax
);
2244 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2245 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2246 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2255 case OP_OPEN_SUBEXP
:
2256 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2257 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2260 case OP_OPEN_BRACKET
:
2261 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2262 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2266 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2271 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2272 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2273 if (BE (tree
== NULL
, 0))
2279 dfa
->has_mb_node
= 1;
2281 case OP_OPEN_DUP_NUM
:
2282 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2288 case OP_DUP_ASTERISK
:
2290 case OP_DUP_QUESTION
:
2291 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2296 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2298 fetch_token (token
, regexp
, syntax
);
2299 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2301 /* else fall through */
2302 case OP_CLOSE_SUBEXP
:
2303 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2304 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2309 /* else fall through */
2310 case OP_CLOSE_DUP_NUM
:
2311 /* We treat it as a normal character. */
2313 /* Then we can these characters as normal characters. */
2314 token
->type
= CHARACTER
;
2315 /* mb_partial and word_char bits should be initialized already
2317 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2318 if (BE (tree
== NULL
, 0))
2325 if ((token
->opr
.ctx_type
2326 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2327 && dfa
->word_ops_used
== 0)
2328 init_word_char (dfa
);
2329 if (token
->opr
.ctx_type
== WORD_DELIM
2330 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2332 bin_tree_t
*tree_first
, *tree_last
;
2333 if (token
->opr
.ctx_type
== WORD_DELIM
)
2335 token
->opr
.ctx_type
= WORD_FIRST
;
2336 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2337 token
->opr
.ctx_type
= WORD_LAST
;
2341 token
->opr
.ctx_type
= INSIDE_WORD
;
2342 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2343 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2345 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2346 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2347 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2355 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2356 if (BE (tree
== NULL
, 0))
2362 /* We must return here, since ANCHORs can't be followed
2363 by repetition operators.
2364 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366 fetch_token (token
, regexp
, syntax
);
2369 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2370 if (BE (tree
== NULL
, 0))
2375 if (dfa
->mb_cur_max
> 1)
2376 dfa
->has_mb_node
= 1;
2380 tree
= build_charclass_op (dfa
, regexp
->trans
,
2381 (const unsigned char *) "alnum",
2382 (const unsigned char *) "_",
2383 token
->type
== OP_NOTWORD
, err
);
2384 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2389 tree
= build_charclass_op (dfa
, regexp
->trans
,
2390 (const unsigned char *) "space",
2391 (const unsigned char *) "",
2392 token
->type
== OP_NOTSPACE
, err
);
2393 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2403 /* Must not happen? */
2409 fetch_token (token
, regexp
, syntax
);
2411 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2412 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2414 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2415 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2418 postorder (tree
, free_tree
, NULL
);
2422 /* In BRE consecutive duplications are not allowed. */
2423 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2424 && (token
->type
== OP_DUP_ASTERISK
2425 || token
->type
== OP_OPEN_DUP_NUM
))
2428 postorder (tree
, free_tree
, NULL
);
2437 /* This function build the following tree, from regular expression
2445 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2446 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2448 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2451 cur_nsub
= preg
->re_nsub
++;
2453 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2455 /* The subexpression may be a null string. */
2456 if (token
->type
== OP_CLOSE_SUBEXP
)
2460 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2461 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2464 postorder (tree
, free_tree
, NULL
);
2467 if (BE (*err
!= REG_NOERROR
, 0))
2471 if (cur_nsub
<= '9' - '1')
2472 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2474 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2475 if (BE (tree
== NULL
, 0))
2480 tree
->token
.opr
.idx
= cur_nsub
;
2484 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2487 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2488 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2490 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2491 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2492 re_token_t start_token
= *token
;
2494 if (token
->type
== OP_OPEN_DUP_NUM
)
2497 start
= fetch_number (regexp
, token
, syntax
);
2500 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2501 start
= 0; /* We treat "{,m}" as "{0,m}". */
2504 *err
= REG_BADBR
; /* <re>{} is invalid. */
2508 if (BE (start
!= -2, 1))
2510 /* We treat "{n}" as "{n,n}". */
2511 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2512 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2513 ? fetch_number (regexp
, token
, syntax
) : -2));
2515 if (BE (start
== -2 || end
== -2, 0))
2517 /* Invalid sequence. */
2518 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2520 if (token
->type
== END_OF_RE
)
2528 /* If the syntax bit is set, rollback. */
2529 re_string_set_index (regexp
, start_idx
);
2530 *token
= start_token
;
2531 token
->type
= CHARACTER
;
2532 /* mb_partial and word_char bits should be already initialized by
2537 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2539 /* First number greater than second. */
2546 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2547 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2550 fetch_token (token
, regexp
, syntax
);
2552 if (BE (elem
== NULL
, 0))
2554 if (BE (start
== 0 && end
== 0, 0))
2556 postorder (elem
, free_tree
, NULL
);
2560 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2561 if (BE (start
> 0, 0))
2564 for (i
= 2; i
<= start
; ++i
)
2566 elem
= duplicate_tree (elem
, dfa
);
2567 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2568 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2569 goto parse_dup_op_espace
;
2575 /* Duplicate ELEM before it is marked optional. */
2576 elem
= duplicate_tree (elem
, dfa
);
2577 if (BE (elem
== NULL
, 0))
2578 goto parse_dup_op_espace
;
2584 if (elem
->token
.type
== SUBEXP
)
2585 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2587 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2588 if (BE (tree
== NULL
, 0))
2589 goto parse_dup_op_espace
;
2591 /* This loop is actually executed only when end != -1,
2592 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2593 already created the start+1-th copy. */
2594 for (i
= start
+ 2; i
<= end
; ++i
)
2596 elem
= duplicate_tree (elem
, dfa
);
2597 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2598 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2599 goto parse_dup_op_espace
;
2601 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2602 if (BE (tree
== NULL
, 0))
2603 goto parse_dup_op_espace
;
2607 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2611 parse_dup_op_espace
:
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617 I'm not sure, but maybe enough. */
2618 #define BRACKET_NAME_BUF_SIZE 32
2621 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622 Build the range expression which starts from START_ELEM, and ends
2623 at END_ELEM. The result are written to MBCSET and SBCSET.
2624 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625 mbcset->range_ends, is a pointer argument since we may
2628 static reg_errcode_t
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2632 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2635 bracket_elem_t
*end_elem
)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch
, end_ch
;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2641 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2645 /* We can handle no multi character collating elements without libc
2647 if (BE ((start_elem
->type
== COLL_SYM
2648 && strlen ((char *) start_elem
->opr
.name
) > 1)
2649 || (end_elem
->type
== COLL_SYM
2650 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2651 return REG_ECOLLATE
;
2653 # ifdef RE_ENABLE_I18N
2658 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2660 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2661 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2663 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2664 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2666 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2667 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2668 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2669 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2670 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2671 return REG_ECOLLATE
;
2672 cmp_buf
[0] = start_wc
;
2673 cmp_buf
[4] = end_wc
;
2674 if (__wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2677 /* Got valid collation sequence values, add them as a new entry.
2678 However, for !_LIBC we have no collation elements: if the
2679 character set is single byte, the single byte character set
2680 that we build below suffices. parse_bracket_exp passes
2681 no MBCSET if dfa->mb_cur_max == 1. */
2684 /* Check the space of the arrays. */
2685 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2687 /* There is not enough space, need realloc. */
2688 wchar_t *new_array_start
, *new_array_end
;
2691 /* +1 in case of mbcset->nranges is 0. */
2692 new_nranges
= 2 * mbcset
->nranges
+ 1;
2693 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694 are NULL if *range_alloc == 0. */
2695 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2697 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2700 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2703 mbcset
->range_starts
= new_array_start
;
2704 mbcset
->range_ends
= new_array_end
;
2705 *range_alloc
= new_nranges
;
2708 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2709 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2712 /* Build the table for single byte characters. */
2713 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2716 if (__wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2717 && __wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2718 bitset_set (sbcset
, wc
);
2721 # else /* not RE_ENABLE_I18N */
2724 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2725 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2727 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2728 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2730 if (start_ch
> end_ch
)
2732 /* Build the table for single byte characters. */
2733 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2734 if (start_ch
<= ch
&& ch
<= end_ch
)
2735 bitset_set (sbcset
, ch
);
2737 # endif /* not RE_ENABLE_I18N */
2740 #endif /* not _LIBC */
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744 Build the collating element which is represented by NAME.
2745 The result are written to MBCSET and SBCSET.
2746 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747 pointer argument since we may update it. */
2749 static reg_errcode_t
2751 # ifdef RE_ENABLE_I18N
2752 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2753 int *coll_sym_alloc
, const unsigned char *name
)
2754 # else /* not RE_ENABLE_I18N */
2755 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2756 # endif /* not RE_ENABLE_I18N */
2758 size_t name_len
= strlen ((const char *) name
);
2759 if (BE (name_len
!= 1, 0))
2760 return REG_ECOLLATE
;
2763 bitset_set (sbcset
, name
[0]);
2767 #endif /* not _LIBC */
2769 /* This function parse bracket expression like "[abc]", "[a-c]",
2773 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2774 reg_syntax_t syntax
, reg_errcode_t
*err
)
2777 const unsigned char *collseqmb
;
2778 const char *collseqwc
;
2781 const int32_t *symb_table
;
2782 const unsigned char *extra
;
2784 /* Local function for parse_bracket_exp used in _LIBC environment.
2785 Seek the collating symbol entry corresponding to NAME.
2786 Return the index of the symbol in the SYMB_TABLE,
2787 or -1 if not found. */
2790 __attribute__ ((always_inline
))
2791 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2795 for (elem
= 0; elem
< table_size
; elem
++)
2796 if (symb_table
[2 * elem
] != 0)
2798 int32_t idx
= symb_table
[2 * elem
+ 1];
2799 /* Skip the name of collating element name. */
2800 idx
+= 1 + extra
[idx
];
2801 if (/* Compare the length of the name. */
2802 name_len
== extra
[idx
]
2803 /* Compare the name. */
2804 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2805 /* Yep, this is the entry. */
2811 /* Local function for parse_bracket_exp used in _LIBC environment.
2812 Look up the collation sequence value of BR_ELEM.
2813 Return the value if succeeded, UINT_MAX otherwise. */
2815 auto inline unsigned int
2816 __attribute__ ((always_inline
))
2817 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2819 if (br_elem
->type
== SB_CHAR
)
2822 if (MB_CUR_MAX == 1)
2825 return collseqmb
[br_elem
->opr
.ch
];
2828 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2829 return __collseq_table_lookup (collseqwc
, wc
);
2832 else if (br_elem
->type
== MB_CHAR
)
2835 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2837 else if (br_elem
->type
== COLL_SYM
)
2839 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2843 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2847 /* We found the entry. */
2848 idx
= symb_table
[2 * elem
+ 1];
2849 /* Skip the name of collating element name. */
2850 idx
+= 1 + extra
[idx
];
2851 /* Skip the byte sequence of the collating element. */
2852 idx
+= 1 + extra
[idx
];
2853 /* Adjust for the alignment. */
2854 idx
= (idx
+ 3) & ~3;
2855 /* Skip the multibyte collation sequence value. */
2856 idx
+= sizeof (unsigned int);
2857 /* Skip the wide char sequence of the collating element. */
2858 idx
+= sizeof (unsigned int) *
2859 (1 + *(unsigned int *) (extra
+ idx
));
2860 /* Return the collation sequence value. */
2861 return *(unsigned int *) (extra
+ idx
);
2863 else if (sym_name_len
== 1)
2865 /* No valid character. Match it as a single byte
2867 return collseqmb
[br_elem
->opr
.name
[0]];
2870 else if (sym_name_len
== 1)
2871 return collseqmb
[br_elem
->opr
.name
[0]];
2876 /* Local function for parse_bracket_exp used in _LIBC environment.
2877 Build the range expression which starts from START_ELEM, and ends
2878 at END_ELEM. The result are written to MBCSET and SBCSET.
2879 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2880 mbcset->range_ends, is a pointer argument since we may
2883 auto inline reg_errcode_t
2884 __attribute__ ((always_inline
))
2885 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2886 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2889 uint32_t start_collseq
;
2890 uint32_t end_collseq
;
2892 /* Equivalence Classes and Character Classes can't be a range
2894 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2895 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2899 start_collseq
= lookup_collation_sequence_value (start_elem
);
2900 end_collseq
= lookup_collation_sequence_value (end_elem
);
2901 /* Check start/end collation sequence values. */
2902 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2903 return REG_ECOLLATE
;
2904 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2907 /* Got valid collation sequence values, add them as a new entry.
2908 However, if we have no collation elements, and the character set
2909 is single byte, the single byte character set that we
2910 build below suffices. */
2911 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2913 /* Check the space of the arrays. */
2914 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2916 /* There is not enough space, need realloc. */
2917 uint32_t *new_array_start
;
2918 uint32_t *new_array_end
;
2921 /* +1 in case of mbcset->nranges is 0. */
2922 new_nranges
= 2 * mbcset
->nranges
+ 1;
2923 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2925 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2928 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2931 mbcset
->range_starts
= new_array_start
;
2932 mbcset
->range_ends
= new_array_end
;
2933 *range_alloc
= new_nranges
;
2936 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2937 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2940 /* Build the table for single byte characters. */
2941 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2943 uint32_t ch_collseq
;
2945 if (MB_CUR_MAX == 1)
2948 ch_collseq
= collseqmb
[ch
];
2950 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2951 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2952 bitset_set (sbcset
, ch
);
2957 /* Local function for parse_bracket_exp used in _LIBC environment.
2958 Build the collating element which is represented by NAME.
2959 The result are written to MBCSET and SBCSET.
2960 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2961 pointer argument since we may update it. */
2963 auto inline reg_errcode_t
2964 __attribute__ ((always_inline
))
2965 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2966 int *coll_sym_alloc
, const unsigned char *name
)
2969 size_t name_len
= strlen ((const char *) name
);
2972 elem
= seek_collating_symbol_entry (name
, name_len
);
2975 /* We found the entry. */
2976 idx
= symb_table
[2 * elem
+ 1];
2977 /* Skip the name of collating element name. */
2978 idx
+= 1 + extra
[idx
];
2980 else if (name_len
== 1)
2982 /* No valid character, treat it as a normal
2984 bitset_set (sbcset
, name
[0]);
2988 return REG_ECOLLATE
;
2990 /* Got valid collation sequence, add it as a new entry. */
2991 /* Check the space of the arrays. */
2992 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2994 /* Not enough, realloc it. */
2995 /* +1 in case of mbcset->ncoll_syms is 0. */
2996 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2997 /* Use realloc since mbcset->coll_syms is NULL
2999 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3000 new_coll_sym_alloc
);
3001 if (BE (new_coll_syms
== NULL
, 0))
3003 mbcset
->coll_syms
= new_coll_syms
;
3004 *coll_sym_alloc
= new_coll_sym_alloc
;
3006 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3011 if (BE (name_len
!= 1, 0))
3012 return REG_ECOLLATE
;
3015 bitset_set (sbcset
, name
[0]);
3022 re_token_t br_token
;
3023 re_bitset_ptr_t sbcset
;
3024 #ifdef RE_ENABLE_I18N
3025 re_charset_t
*mbcset
;
3026 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3027 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3028 #endif /* not RE_ENABLE_I18N */
3030 bin_tree_t
*work_tree
;
3032 int first_round
= 1;
3034 collseqmb
= (const unsigned char *)
3035 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3036 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3042 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3043 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3044 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3045 _NL_COLLATE_SYMB_TABLEMB
);
3046 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3047 _NL_COLLATE_SYMB_EXTRAMB
);
3050 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3051 #ifdef RE_ENABLE_I18N
3052 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3053 #endif /* RE_ENABLE_I18N */
3054 #ifdef RE_ENABLE_I18N
3055 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3057 if (BE (sbcset
== NULL
, 0))
3058 #endif /* RE_ENABLE_I18N */
3061 #ifdef RE_ENABLE_I18N
3068 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3069 if (BE (token
->type
== END_OF_RE
, 0))
3072 goto parse_bracket_exp_free_return
;
3074 if (token
->type
== OP_NON_MATCH_LIST
)
3076 #ifdef RE_ENABLE_I18N
3077 mbcset
->non_match
= 1;
3078 #endif /* not RE_ENABLE_I18N */
3080 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3081 bitset_set (sbcset
, '\n');
3082 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3083 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3084 if (BE (token
->type
== END_OF_RE
, 0))
3087 goto parse_bracket_exp_free_return
;
3091 /* We treat the first ']' as a normal character. */
3092 if (token
->type
== OP_CLOSE_BRACKET
)
3093 token
->type
= CHARACTER
;
3097 bracket_elem_t start_elem
, end_elem
;
3098 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3099 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3101 int token_len2
= 0, is_range_exp
= 0;
3104 start_elem
.opr
.name
= start_name_buf
;
3105 start_elem
.type
= COLL_SYM
;
3106 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3107 syntax
, first_round
);
3108 if (BE (ret
!= REG_NOERROR
, 0))
3111 goto parse_bracket_exp_free_return
;
3115 /* Get information about the next token. We need it in any case. */
3116 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3118 /* Do not check for ranges if we know they are not allowed. */
3119 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3121 if (BE (token
->type
== END_OF_RE
, 0))
3124 goto parse_bracket_exp_free_return
;
3126 if (token
->type
== OP_CHARSET_RANGE
)
3128 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3129 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3130 if (BE (token2
.type
== END_OF_RE
, 0))
3133 goto parse_bracket_exp_free_return
;
3135 if (token2
.type
== OP_CLOSE_BRACKET
)
3137 /* We treat the last '-' as a normal character. */
3138 re_string_skip_bytes (regexp
, -token_len
);
3139 token
->type
= CHARACTER
;
3146 if (is_range_exp
== 1)
3148 end_elem
.opr
.name
= end_name_buf
;
3149 end_elem
.type
= COLL_SYM
;
3150 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3152 if (BE (ret
!= REG_NOERROR
, 0))
3155 goto parse_bracket_exp_free_return
;
3158 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3161 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3162 &start_elem
, &end_elem
);
3164 # ifdef RE_ENABLE_I18N
3165 *err
= build_range_exp (sbcset
,
3166 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3167 &range_alloc
, &start_elem
, &end_elem
);
3169 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3171 #endif /* RE_ENABLE_I18N */
3172 if (BE (*err
!= REG_NOERROR
, 0))
3173 goto parse_bracket_exp_free_return
;
3177 switch (start_elem
.type
)
3180 bitset_set (sbcset
, start_elem
.opr
.ch
);
3182 #ifdef RE_ENABLE_I18N
3184 /* Check whether the array has enough space. */
3185 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3187 wchar_t *new_mbchars
;
3188 /* Not enough, realloc it. */
3189 /* +1 in case of mbcset->nmbchars is 0. */
3190 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3191 /* Use realloc since array is NULL if *alloc == 0. */
3192 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3194 if (BE (new_mbchars
== NULL
, 0))
3195 goto parse_bracket_exp_espace
;
3196 mbcset
->mbchars
= new_mbchars
;
3198 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3200 #endif /* RE_ENABLE_I18N */
3202 *err
= build_equiv_class (sbcset
,
3203 #ifdef RE_ENABLE_I18N
3204 mbcset
, &equiv_class_alloc
,
3205 #endif /* RE_ENABLE_I18N */
3206 start_elem
.opr
.name
);
3207 if (BE (*err
!= REG_NOERROR
, 0))
3208 goto parse_bracket_exp_free_return
;
3211 *err
= build_collating_symbol (sbcset
,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset
, &coll_sym_alloc
,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem
.opr
.name
);
3216 if (BE (*err
!= REG_NOERROR
, 0))
3217 goto parse_bracket_exp_free_return
;
3220 *err
= build_charclass (regexp
->trans
, sbcset
,
3221 #ifdef RE_ENABLE_I18N
3222 mbcset
, &char_class_alloc
,
3223 #endif /* RE_ENABLE_I18N */
3224 start_elem
.opr
.name
, syntax
);
3225 if (BE (*err
!= REG_NOERROR
, 0))
3226 goto parse_bracket_exp_free_return
;
3233 if (BE (token
->type
== END_OF_RE
, 0))
3236 goto parse_bracket_exp_free_return
;
3238 if (token
->type
== OP_CLOSE_BRACKET
)
3242 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3244 /* If it is non-matching list. */
3246 bitset_not (sbcset
);
3248 #ifdef RE_ENABLE_I18N
3249 /* Ensure only single byte characters are set. */
3250 if (dfa
->mb_cur_max
> 1)
3251 bitset_mask (sbcset
, dfa
->sb_char
);
3253 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3254 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3255 || mbcset
->non_match
)))
3257 bin_tree_t
*mbc_tree
;
3259 /* Build a tree for complex bracket. */
3260 dfa
->has_mb_node
= 1;
3261 br_token
.type
= COMPLEX_BRACKET
;
3262 br_token
.opr
.mbcset
= mbcset
;
3263 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3264 if (BE (mbc_tree
== NULL
, 0))
3265 goto parse_bracket_exp_espace
;
3266 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3267 if (sbcset
[sbc_idx
])
3269 /* If there are no bits set in sbcset, there is no point
3270 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3271 if (sbc_idx
< BITSET_WORDS
)
3273 /* Build a tree for simple bracket. */
3274 br_token
.type
= SIMPLE_BRACKET
;
3275 br_token
.opr
.sbcset
= sbcset
;
3276 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3277 if (BE (work_tree
== NULL
, 0))
3278 goto parse_bracket_exp_espace
;
3280 /* Then join them by ALT node. */
3281 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3282 if (BE (work_tree
== NULL
, 0))
3283 goto parse_bracket_exp_espace
;
3288 work_tree
= mbc_tree
;
3292 #endif /* not RE_ENABLE_I18N */
3294 #ifdef RE_ENABLE_I18N
3295 free_charset (mbcset
);
3297 /* Build a tree for simple bracket. */
3298 br_token
.type
= SIMPLE_BRACKET
;
3299 br_token
.opr
.sbcset
= sbcset
;
3300 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3301 if (BE (work_tree
== NULL
, 0))
3302 goto parse_bracket_exp_espace
;
3306 parse_bracket_exp_espace
:
3308 parse_bracket_exp_free_return
:
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset
);
3312 #endif /* RE_ENABLE_I18N */
3316 /* Parse an element in the bracket expression. */
3318 static reg_errcode_t
3319 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3320 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3321 reg_syntax_t syntax
, int accept_hyphen
)
3323 #ifdef RE_ENABLE_I18N
3325 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3326 if (cur_char_size
> 1)
3328 elem
->type
= MB_CHAR
;
3329 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3330 re_string_skip_bytes (regexp
, cur_char_size
);
3333 #endif /* RE_ENABLE_I18N */
3334 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3335 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3336 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3337 return parse_bracket_symbol (elem
, regexp
, token
);
3338 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3340 /* A '-' must only appear as anything but a range indicator before
3341 the closing bracket. Everything else is an error. */
3343 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3344 if (token2
.type
!= OP_CLOSE_BRACKET
)
3345 /* The actual error value is not standardized since this whole
3346 case is undefined. But ERANGE makes good sense. */
3349 elem
->type
= SB_CHAR
;
3350 elem
->opr
.ch
= token
->opr
.c
;
3354 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3355 such as [:<character_class>:], [.<collating_element>.], and
3356 [=<equivalent_class>=]. */
3358 static reg_errcode_t
3359 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3362 unsigned char ch
, delim
= token
->opr
.c
;
3364 if (re_string_eoi(regexp
))
3368 if (i
>= BRACKET_NAME_BUF_SIZE
)
3370 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3371 ch
= re_string_fetch_byte_case (regexp
);
3373 ch
= re_string_fetch_byte (regexp
);
3374 if (re_string_eoi(regexp
))
3376 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3378 elem
->opr
.name
[i
] = ch
;
3380 re_string_skip_bytes (regexp
, 1);
3381 elem
->opr
.name
[i
] = '\0';
3382 switch (token
->type
)
3384 case OP_OPEN_COLL_ELEM
:
3385 elem
->type
= COLL_SYM
;
3387 case OP_OPEN_EQUIV_CLASS
:
3388 elem
->type
= EQUIV_CLASS
;
3390 case OP_OPEN_CHAR_CLASS
:
3391 elem
->type
= CHAR_CLASS
;
3399 /* Helper function for parse_bracket_exp.
3400 Build the equivalence class which is represented by NAME.
3401 The result are written to MBCSET and SBCSET.
3402 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3403 is a pointer argument since we may update it. */
3405 static reg_errcode_t
3406 #ifdef RE_ENABLE_I18N
3407 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3408 int *equiv_class_alloc
, const unsigned char *name
)
3409 #else /* not RE_ENABLE_I18N */
3410 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3411 #endif /* not RE_ENABLE_I18N */
3414 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3417 const int32_t *table
, *indirect
;
3418 const unsigned char *weights
, *extra
, *cp
;
3419 unsigned char char_buf
[2];
3423 /* Calculate the index for equivalence class. */
3425 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3426 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3427 _NL_COLLATE_WEIGHTMB
);
3428 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3429 _NL_COLLATE_EXTRAMB
);
3430 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3431 _NL_COLLATE_INDIRECTMB
);
3432 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3433 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3434 /* This isn't a valid character. */
3435 return REG_ECOLLATE
;
3437 /* Build single byte matcing table for this equivalence class. */
3438 len
= weights
[idx1
& 0xffffff];
3439 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3443 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3448 /* This isn't a valid character. */
3450 /* Compare only if the length matches and the collation rule
3451 index is the same. */
3452 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3456 while (cnt
<= len
&&
3457 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3458 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3462 bitset_set (sbcset
, ch
);
3465 /* Check whether the array has enough space. */
3466 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3468 /* Not enough, realloc it. */
3469 /* +1 in case of mbcset->nequiv_classes is 0. */
3470 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3471 /* Use realloc since the array is NULL if *alloc == 0. */
3472 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3474 new_equiv_class_alloc
);
3475 if (BE (new_equiv_classes
== NULL
, 0))
3477 mbcset
->equiv_classes
= new_equiv_classes
;
3478 *equiv_class_alloc
= new_equiv_class_alloc
;
3480 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3485 if (BE (strlen ((const char *) name
) != 1, 0))
3486 return REG_ECOLLATE
;
3487 bitset_set (sbcset
, *name
);
3492 /* Helper function for parse_bracket_exp.
3493 Build the character class which is represented by NAME.
3494 The result are written to MBCSET and SBCSET.
3495 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3496 is a pointer argument since we may update it. */
3498 static reg_errcode_t
3499 #ifdef RE_ENABLE_I18N
3500 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3501 re_charset_t
*mbcset
, int *char_class_alloc
,
3502 const unsigned char *class_name
, reg_syntax_t syntax
)
3503 #else /* not RE_ENABLE_I18N */
3504 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3505 const unsigned char *class_name
, reg_syntax_t syntax
)
3506 #endif /* not RE_ENABLE_I18N */
3509 const char *name
= (const char *) class_name
;
3511 /* In case of REG_ICASE "upper" and "lower" match the both of
3512 upper and lower cases. */
3513 if ((syntax
& RE_ICASE
)
3514 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3517 #ifdef RE_ENABLE_I18N
3518 /* Check the space of the arrays. */
3519 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nchar_classes is 0. */
3523 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3524 /* Use realloc since array is NULL if *alloc == 0. */
3525 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3526 new_char_class_alloc
);
3527 if (BE (new_char_classes
== NULL
, 0))
3529 mbcset
->char_classes
= new_char_classes
;
3530 *char_class_alloc
= new_char_class_alloc
;
3532 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3533 #endif /* RE_ENABLE_I18N */
3535 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3537 if (BE (trans != NULL, 0)) \
3539 for (i = 0; i < SBC_MAX; ++i) \
3540 if (ctype_func (i)) \
3541 bitset_set (sbcset, trans[i]); \
3545 for (i = 0; i < SBC_MAX; ++i) \
3546 if (ctype_func (i)) \
3547 bitset_set (sbcset, i); \
3551 if (strcmp (name
, "alnum") == 0)
3552 BUILD_CHARCLASS_LOOP (isalnum
);
3553 else if (strcmp (name
, "cntrl") == 0)
3554 BUILD_CHARCLASS_LOOP (iscntrl
);
3555 else if (strcmp (name
, "lower") == 0)
3556 BUILD_CHARCLASS_LOOP (islower
);
3557 else if (strcmp (name
, "space") == 0)
3558 BUILD_CHARCLASS_LOOP (isspace
);
3559 else if (strcmp (name
, "alpha") == 0)
3560 BUILD_CHARCLASS_LOOP (isalpha
);
3561 else if (strcmp (name
, "digit") == 0)
3562 BUILD_CHARCLASS_LOOP (isdigit
);
3563 else if (strcmp (name
, "print") == 0)
3564 BUILD_CHARCLASS_LOOP (isprint
);
3565 else if (strcmp (name
, "upper") == 0)
3566 BUILD_CHARCLASS_LOOP (isupper
);
3567 else if (strcmp (name
, "blank") == 0)
3568 BUILD_CHARCLASS_LOOP (isblank
);
3569 else if (strcmp (name
, "graph") == 0)
3570 BUILD_CHARCLASS_LOOP (isgraph
);
3571 else if (strcmp (name
, "punct") == 0)
3572 BUILD_CHARCLASS_LOOP (ispunct
);
3573 else if (strcmp (name
, "xdigit") == 0)
3574 BUILD_CHARCLASS_LOOP (isxdigit
);
3582 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3583 const unsigned char *class_name
,
3584 const unsigned char *extra
, int non_match
,
3587 re_bitset_ptr_t sbcset
;
3588 #ifdef RE_ENABLE_I18N
3589 re_charset_t
*mbcset
;
3591 #endif /* not RE_ENABLE_I18N */
3593 re_token_t br_token
;
3596 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3597 #ifdef RE_ENABLE_I18N
3598 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3599 #endif /* RE_ENABLE_I18N */
3601 #ifdef RE_ENABLE_I18N
3602 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3603 #else /* not RE_ENABLE_I18N */
3604 if (BE (sbcset
== NULL
, 0))
3605 #endif /* not RE_ENABLE_I18N */
3613 #ifdef RE_ENABLE_I18N
3614 mbcset
->non_match
= 1;
3615 #endif /* not RE_ENABLE_I18N */
3618 /* We don't care the syntax in this case. */
3619 ret
= build_charclass (trans
, sbcset
,
3620 #ifdef RE_ENABLE_I18N
3622 #endif /* RE_ENABLE_I18N */
3625 if (BE (ret
!= REG_NOERROR
, 0))
3628 #ifdef RE_ENABLE_I18N
3629 free_charset (mbcset
);
3630 #endif /* RE_ENABLE_I18N */
3634 /* \w match '_' also. */
3635 for (; *extra
; extra
++)
3636 bitset_set (sbcset
, *extra
);
3638 /* If it is non-matching list. */
3640 bitset_not (sbcset
);
3642 #ifdef RE_ENABLE_I18N
3643 /* Ensure only single byte characters are set. */
3644 if (dfa
->mb_cur_max
> 1)
3645 bitset_mask (sbcset
, dfa
->sb_char
);
3648 /* Build a tree for simple bracket. */
3649 br_token
.type
= SIMPLE_BRACKET
;
3650 br_token
.opr
.sbcset
= sbcset
;
3651 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3652 if (BE (tree
== NULL
, 0))
3653 goto build_word_op_espace
;
3655 #ifdef RE_ENABLE_I18N
3656 if (dfa
->mb_cur_max
> 1)
3658 bin_tree_t
*mbc_tree
;
3659 /* Build a tree for complex bracket. */
3660 br_token
.type
= COMPLEX_BRACKET
;
3661 br_token
.opr
.mbcset
= mbcset
;
3662 dfa
->has_mb_node
= 1;
3663 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3664 if (BE (mbc_tree
== NULL
, 0))
3665 goto build_word_op_espace
;
3666 /* Then join them by ALT node. */
3667 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3668 if (BE (mbc_tree
!= NULL
, 1))
3673 free_charset (mbcset
);
3676 #else /* not RE_ENABLE_I18N */
3678 #endif /* not RE_ENABLE_I18N */
3680 build_word_op_espace
:
3682 #ifdef RE_ENABLE_I18N
3683 free_charset (mbcset
);
3684 #endif /* RE_ENABLE_I18N */
3689 /* This is intended for the expressions like "a{1,3}".
3690 Fetch a number from `input', and return the number.
3691 Return -1, if the number field is empty like "{,1}".
3692 Return -2, If an error is occured. */
3695 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3701 fetch_token (token
, input
, syntax
);
3703 if (BE (token
->type
== END_OF_RE
, 0))
3705 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3707 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3708 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3709 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3714 #ifdef RE_ENABLE_I18N
3716 free_charset (re_charset_t
*cset
)
3718 re_free (cset
->mbchars
);
3720 re_free (cset
->coll_syms
);
3721 re_free (cset
->equiv_classes
);
3722 re_free (cset
->range_starts
);
3723 re_free (cset
->range_ends
);
3725 re_free (cset
->char_classes
);
3728 #endif /* RE_ENABLE_I18N */
3730 /* Functions for binary tree operation. */
3732 /* Create a tree node. */
3735 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3736 re_token_type_t type
)
3740 return create_token_tree (dfa
, left
, right
, &t
);
3744 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3745 const re_token_t
*token
)
3748 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3750 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3752 if (storage
== NULL
)
3754 storage
->next
= dfa
->str_tree_storage
;
3755 dfa
->str_tree_storage
= storage
;
3756 dfa
->str_tree_storage_idx
= 0;
3758 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3760 tree
->parent
= NULL
;
3762 tree
->right
= right
;
3763 tree
->token
= *token
;
3764 tree
->token
.duplicated
= 0;
3765 tree
->token
.opt_subexp
= 0;
3768 tree
->node_idx
= -1;
3771 left
->parent
= tree
;
3773 right
->parent
= tree
;
3777 /* Mark the tree SRC as an optional subexpression.
3778 To be called from preorder or postorder. */
3780 static reg_errcode_t
3781 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3783 int idx
= (int) (long) extra
;
3784 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3785 node
->token
.opt_subexp
= 1;
3790 /* Free the allocated memory inside NODE. */
3793 free_token (re_token_t
*node
)
3795 #ifdef RE_ENABLE_I18N
3796 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3797 free_charset (node
->opr
.mbcset
);
3799 #endif /* RE_ENABLE_I18N */
3800 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3801 re_free (node
->opr
.sbcset
);
3804 /* Worker function for tree walking. Free the allocated memory inside NODE
3805 and its children. */
3807 static reg_errcode_t
3808 free_tree (void *extra
, bin_tree_t
*node
)
3810 free_token (&node
->token
);
3815 /* Duplicate the node SRC, and return new node. This is a preorder
3816 visit similar to the one implemented by the generic visitor, but
3817 we need more infrastructure to maintain two parallel trees --- so,
3818 it's easier to duplicate. */
3821 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3823 const bin_tree_t
*node
;
3824 bin_tree_t
*dup_root
;
3825 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3827 for (node
= root
; ; )
3829 /* Create a new tree and link it back to the current parent. */
3830 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3833 (*p_new
)->parent
= dup_node
;
3834 (*p_new
)->token
.duplicated
= 1;
3837 /* Go to the left node, or up and to the right. */
3841 p_new
= &dup_node
->left
;
3845 const bin_tree_t
*prev
= NULL
;
3846 while (node
->right
== prev
|| node
->right
== NULL
)
3849 node
= node
->parent
;
3850 dup_node
= dup_node
->parent
;
3855 p_new
= &dup_node
->right
;