1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
23 # include <locale/weight.h>
26 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
27 size_t length
, reg_syntax_t syntax
);
28 static void re_compile_fastmap_iter (regex_t
*bufp
,
29 const re_dfastate_t
*init_state
,
31 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
33 static void free_charset (re_charset_t
*cset
);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t
*preg
);
36 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static void optimize_utf8 (re_dfa_t
*dfa
);
40 static reg_errcode_t
analyze (regex_t
*preg
);
41 static reg_errcode_t
preorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
postorder (bin_tree_t
*root
,
45 reg_errcode_t (fn (void *, bin_tree_t
*)),
47 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
49 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
51 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
52 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
53 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
54 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
55 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
56 unsigned int constraint
);
57 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
58 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
60 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
61 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
63 static int peek_token (re_token_t
*token
, re_string_t
*input
,
64 reg_syntax_t syntax
) internal_function
;
65 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
66 reg_syntax_t syntax
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
80 re_dfa_t
*dfa
, re_token_t
*token
,
81 reg_syntax_t syntax
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
83 re_token_t
*token
, reg_syntax_t syntax
,
85 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
87 re_token_t
*token
, int token_len
,
91 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
95 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
97 int *equiv_class_alloc
,
98 const unsigned char *name
);
99 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
101 re_charset_t
*mbcset
,
102 int *char_class_alloc
,
103 const unsigned char *class_name
,
104 reg_syntax_t syntax
);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
107 const unsigned char *name
);
108 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 reg_syntax_t syntax
);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
114 RE_TRANSLATE_TYPE trans
,
115 const unsigned char *class_name
,
116 const unsigned char *extra
,
117 int non_match
, reg_errcode_t
*err
);
118 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
119 bin_tree_t
*left
, bin_tree_t
*right
,
120 re_token_type_t type
);
121 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
122 bin_tree_t
*left
, bin_tree_t
*right
,
123 const re_token_t
*token
);
124 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
125 static void free_token (re_token_t
*node
);
126 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
127 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid
[] attribute_hidden
=
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx
[] attribute_hidden
=
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
219 re_compile_pattern (pattern
, length
, bufp
)
222 struct re_pattern_buffer
*bufp
;
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
231 /* Match anchors at newline. */
232 bufp
->newline_anchor
= 1;
234 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
238 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
241 weak_alias (__re_compile_pattern
, re_compile_pattern
)
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options
;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax
)
263 reg_syntax_t ret
= re_syntax_options
;
265 re_syntax_options
= syntax
;
269 weak_alias (__re_set_syntax
, re_set_syntax
)
273 re_compile_fastmap (bufp
)
274 struct re_pattern_buffer
*bufp
;
276 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
277 char *fastmap
= bufp
->fastmap
;
279 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
280 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_word
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
283 if (dfa
->init_state
!= dfa
->init_state_nl
)
284 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
285 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
286 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
287 bufp
->fastmap_accurate
= 1;
291 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
295 __attribute__ ((always_inline
))
296 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
300 fastmap
[tolower (ch
)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
310 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
312 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
313 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
315 int node
= init_state
->nodes
.elems
[node_cnt
];
316 re_token_type_t type
= dfa
->nodes
[node
].type
;
318 if (type
== CHARACTER
)
320 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
324 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
329 *p
++ = dfa
->nodes
[node
].opr
.c
;
330 while (++node
< dfa
->nodes_len
331 && dfa
->nodes
[node
].type
== CHARACTER
332 && dfa
->nodes
[node
].mb_partial
)
333 *p
++ = dfa
->nodes
[node
].opr
.c
;
334 memset (&state
, '\0', sizeof (state
));
335 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
337 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
339 re_set_fastmap (fastmap
, 0, buf
[0]);
343 else if (type
== SIMPLE_BRACKET
)
346 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
349 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
350 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
351 if (w
& ((bitset_word_t
) 1 << j
))
352 re_set_fastmap (fastmap
, icase
, ch
);
355 #ifdef RE_ENABLE_I18N
356 else if (type
== COMPLEX_BRACKET
)
358 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
362 /* See if we have to try all bytes which start multiple collation
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
369 && (cset
->ncoll_syms
|| cset
->nranges
))
371 const int32_t *table
= (const int32_t *)
372 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
373 for (i
= 0; i
< SBC_MAX
; ++i
)
375 re_set_fastmap (fastmap
, icase
, i
);
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa
->mb_cur_max
> 1
384 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
386 || cset
->nequiv_classes
394 memset (&mbs
, 0, sizeof (mbs
));
395 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
396 re_set_fastmap (fastmap
, false, (int) c
);
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i
= 0; i
< cset
->nmbchars
; ++i
)
408 memset (&state
, '\0', sizeof (state
));
409 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
410 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
411 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
413 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
415 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
420 #endif /* RE_ENABLE_I18N */
421 else if (type
== OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type
== OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type
== END_OF_RE
)
427 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
428 if (type
== END_OF_RE
)
429 bufp
->can_be_null
= 1;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg
, pattern
, cflags
)
473 regex_t
*__restrict preg
;
474 const char *__restrict pattern
;
478 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC
);
485 /* Try to allocate space for the fastmap. */
486 preg
->fastmap
= re_malloc (char, SBC_MAX
);
487 if (BE (preg
->fastmap
== NULL
, 0))
490 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags
& REG_NEWLINE
)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax
&= ~RE_DOT_NEWLINE
;
496 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
497 /* It also changes the matching behavior. */
498 preg
->newline_anchor
= 1;
501 preg
->newline_anchor
= 0;
502 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
503 preg
->translate
= NULL
;
505 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret
== REG_ERPAREN
)
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret
== REG_NOERROR
, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg
);
519 /* Some error occurred while compiling the expression. */
520 re_free (preg
->fastmap
);
521 preg
->fastmap
= NULL
;
527 weak_alias (__regcomp
, regcomp
)
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
534 regerror (errcode
, preg
, errbuf
, errbuf_size
)
536 const regex_t
*__restrict preg
;
537 char *__restrict errbuf
;
544 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
545 / sizeof (__re_error_msgid_idx
[0])), 0))
546 /* Only error codes returned by the rest of the code should be passed
547 to this routine. If we are given anything else, or if other regex
548 code generates an invalid error code, then the program has a bug.
549 Dump core so we can fix it. */
552 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
554 msg_size
= strlen (msg
) + 1; /* Includes the null. */
556 if (BE (errbuf_size
!= 0, 1))
558 if (BE (msg_size
> errbuf_size
, 0))
560 #if defined HAVE_MEMPCPY || defined _LIBC
561 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
563 memcpy (errbuf
, msg
, errbuf_size
- 1);
564 errbuf
[errbuf_size
- 1] = 0;
568 memcpy (errbuf
, msg
, msg_size
);
574 weak_alias (__regerror
, regerror
)
578 #ifdef RE_ENABLE_I18N
579 /* This static array is used for the map to single-byte characters when
580 UTF-8 is used. Otherwise we would allocate memory just to initialize
581 it the same all the time. UTF-8 is the preferred encoding so this is
582 a worthwhile optimization. */
583 static const bitset_t utf8_sb_map
=
585 /* Set the first 128 bits. */
586 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
592 free_dfa_content (re_dfa_t
*dfa
)
597 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
598 free_token (dfa
->nodes
+ i
);
599 re_free (dfa
->nexts
);
600 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
602 if (dfa
->eclosures
!= NULL
)
603 re_node_set_free (dfa
->eclosures
+ i
);
604 if (dfa
->inveclosures
!= NULL
)
605 re_node_set_free (dfa
->inveclosures
+ i
);
606 if (dfa
->edests
!= NULL
)
607 re_node_set_free (dfa
->edests
+ i
);
609 re_free (dfa
->edests
);
610 re_free (dfa
->eclosures
);
611 re_free (dfa
->inveclosures
);
612 re_free (dfa
->nodes
);
614 if (dfa
->state_table
)
615 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
617 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
618 for (j
= 0; j
< entry
->num
; ++j
)
620 re_dfastate_t
*state
= entry
->array
[j
];
623 re_free (entry
->array
);
625 re_free (dfa
->state_table
);
626 #ifdef RE_ENABLE_I18N
627 if (dfa
->sb_char
!= utf8_sb_map
)
628 re_free (dfa
->sb_char
);
630 re_free (dfa
->subexp_map
);
632 re_free (dfa
->re_str
);
639 /* Free dynamically allocated space used by PREG. */
645 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
646 if (BE (dfa
!= NULL
, 1))
647 free_dfa_content (dfa
);
651 re_free (preg
->fastmap
);
652 preg
->fastmap
= NULL
;
654 re_free (preg
->translate
);
655 preg
->translate
= NULL
;
658 weak_alias (__regfree
, regfree
)
661 /* Entry points compatible with 4.2 BSD regex library. We don't define
662 them unless specifically requested. */
664 #if defined _REGEX_RE_COMP || defined _LIBC
666 /* BSD has one and only one pattern buffer. */
667 static struct re_pattern_buffer re_comp_buf
;
671 /* Make these definitions weak in libc, so POSIX programs can redefine
672 these names if they don't use our functions, and still use
673 regcomp/regexec above without link errors. */
684 if (!re_comp_buf
.buffer
)
685 return gettext ("No previous regular expression");
689 if (re_comp_buf
.buffer
)
691 fastmap
= re_comp_buf
.fastmap
;
692 re_comp_buf
.fastmap
= NULL
;
693 __regfree (&re_comp_buf
);
694 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
695 re_comp_buf
.fastmap
= fastmap
;
698 if (re_comp_buf
.fastmap
== NULL
)
700 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
701 if (re_comp_buf
.fastmap
== NULL
)
702 return (char *) gettext (__re_error_msgid
703 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
706 /* Since 're_exec' always passes NULL for the 'regs' argument, we
707 don't need to initialize the pattern buffer fields which affect it. */
709 /* Match anchors at newlines. */
710 re_comp_buf
.newline_anchor
= 1;
712 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
717 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
718 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
722 libc_freeres_fn (free_mem
)
724 __regfree (&re_comp_buf
);
728 #endif /* _REGEX_RE_COMP */
730 /* Internal entry point.
731 Compile the regular expression PATTERN, whose length is LENGTH.
732 SYNTAX indicate regular expression's syntax. */
735 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
738 reg_errcode_t err
= REG_NOERROR
;
742 /* Initialize the pattern buffer. */
743 preg
->fastmap_accurate
= 0;
744 preg
->syntax
= syntax
;
745 preg
->not_bol
= preg
->not_eol
= 0;
748 preg
->can_be_null
= 0;
749 preg
->regs_allocated
= REGS_UNALLOCATED
;
751 /* Initialize the dfa. */
752 dfa
= (re_dfa_t
*) preg
->buffer
;
753 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
755 /* If zero allocated, but buffer is non-null, try to realloc
756 enough space. This loses if buffer's address is bogus, but
757 that is the user's responsibility. If ->buffer is NULL this
758 is a simple allocation. */
759 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
762 preg
->allocated
= sizeof (re_dfa_t
);
763 preg
->buffer
= (unsigned char *) dfa
;
765 preg
->used
= sizeof (re_dfa_t
);
767 err
= init_dfa (dfa
, length
);
768 if (BE (err
!= REG_NOERROR
, 0))
770 free_dfa_content (dfa
);
776 /* Note: length+1 will not overflow since it is checked in init_dfa. */
777 dfa
->re_str
= re_malloc (char, length
+ 1);
778 strncpy (dfa
->re_str
, pattern
, length
+ 1);
781 __libc_lock_init (dfa
->lock
);
783 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
784 syntax
& RE_ICASE
, dfa
);
785 if (BE (err
!= REG_NOERROR
, 0))
787 re_compile_internal_free_return
:
788 free_workarea_compile (preg
);
789 re_string_destruct (®exp
);
790 free_dfa_content (dfa
);
796 /* Parse the regular expression, and build a structure tree. */
798 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
799 if (BE (dfa
->str_tree
== NULL
, 0))
800 goto re_compile_internal_free_return
;
802 /* Analyze the tree and create the nfa. */
803 err
= analyze (preg
);
804 if (BE (err
!= REG_NOERROR
, 0))
805 goto re_compile_internal_free_return
;
807 #ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
809 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
813 /* Then create the initial state of the dfa. */
814 err
= create_initial_state (dfa
);
816 /* Release work areas. */
817 free_workarea_compile (preg
);
818 re_string_destruct (®exp
);
820 if (BE (err
!= REG_NOERROR
, 0))
822 free_dfa_content (dfa
);
830 /* Initialize DFA. We use the length of the regular expression PAT_LEN
831 as the initial length of some arrays. */
834 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
836 unsigned int table_size
;
841 memset (dfa
, '\0', sizeof (re_dfa_t
));
843 /* Force allocation of str_tree_storage the first time. */
844 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
846 /* Avoid overflows. */
847 if (pat_len
== SIZE_MAX
)
850 dfa
->nodes_alloc
= pat_len
+ 1;
851 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
853 /* table_size = 2 ^ ceil(log pat_len) */
854 for (table_size
= 1; ; table_size
<<= 1)
855 if (table_size
> pat_len
)
858 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
859 dfa
->state_hash_mask
= table_size
- 1;
861 dfa
->mb_cur_max
= MB_CUR_MAX
;
863 if (dfa
->mb_cur_max
== 6
864 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
866 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
869 # ifdef HAVE_LANGINFO_CODESET
870 codeset_name
= nl_langinfo (CODESET
);
872 codeset_name
= getenv ("LC_ALL");
873 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
874 codeset_name
= getenv ("LC_CTYPE");
875 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
876 codeset_name
= getenv ("LANG");
877 if (codeset_name
== NULL
)
879 else if (strchr (codeset_name
, '.') != NULL
)
880 codeset_name
= strchr (codeset_name
, '.') + 1;
883 if (strcasecmp (codeset_name
, "UTF-8") == 0
884 || strcasecmp (codeset_name
, "UTF8") == 0)
887 /* We check exhaustively in the loop below if this charset is a
888 superset of ASCII. */
889 dfa
->map_notascii
= 0;
892 #ifdef RE_ENABLE_I18N
893 if (dfa
->mb_cur_max
> 1)
896 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
901 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
902 if (BE (dfa
->sb_char
== NULL
, 0))
905 /* Set the bits corresponding to single byte chars. */
906 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
907 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
909 wint_t wch
= __btowc (ch
);
911 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
913 if (isascii (ch
) && wch
!= ch
)
914 dfa
->map_notascii
= 1;
921 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
926 /* Initialize WORD_CHAR table, which indicate which character is
927 "word". In this case "word" means that it is the word construction
928 character used by some operators like "\<", "\>", etc. */
932 init_word_char (re_dfa_t
*dfa
)
934 dfa
->word_ops_used
= 1;
937 if (BE (dfa
->map_notascii
== 0, 1))
939 if (sizeof (dfa
->word_char
[0]) == 8)
941 /* The extra temporaries here avoid "implicitly truncated"
942 warnings in the case when this is dead code, i.e. 32-bit. */
943 const uint64_t wc0
= UINT64_C (0x03ff000000000000);
944 const uint64_t wc1
= UINT64_C (0x07fffffe87fffffe);
945 dfa
->word_char
[0] = wc0
;
946 dfa
->word_char
[1] = wc1
;
949 else if (sizeof (dfa
->word_char
[0]) == 4)
951 dfa
->word_char
[0] = UINT32_C (0x00000000);
952 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
953 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
954 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
961 if (BE (dfa
->is_utf8
, 1))
963 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
968 for (; i
< BITSET_WORDS
; ++i
)
969 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
970 if (isalnum (ch
) || ch
== '_')
971 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
974 /* Free the work area which are only used while compiling. */
977 free_workarea_compile (regex_t
*preg
)
979 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
980 bin_tree_storage_t
*storage
, *next
;
981 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
983 next
= storage
->next
;
986 dfa
->str_tree_storage
= NULL
;
987 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
988 dfa
->str_tree
= NULL
;
989 re_free (dfa
->org_indices
);
990 dfa
->org_indices
= NULL
;
993 /* Create initial states for all contexts. */
996 create_initial_state (re_dfa_t
*dfa
)
1000 re_node_set init_nodes
;
1002 /* Initial states have the epsilon closure of the node which is
1003 the first node of the regular expression. */
1004 first
= dfa
->str_tree
->first
->node_idx
;
1005 dfa
->init_node
= first
;
1006 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1007 if (BE (err
!= REG_NOERROR
, 0))
1010 /* The back-references which are in initial states can epsilon transit,
1011 since in this case all of the subexpressions can be null.
1012 Then we add epsilon closures of the nodes which are the next nodes of
1013 the back-references. */
1014 if (dfa
->nbackref
> 0)
1015 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1017 int node_idx
= init_nodes
.elems
[i
];
1018 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1021 if (type
!= OP_BACK_REF
)
1023 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1025 re_token_t
*clexp_node
;
1026 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1027 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1028 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1031 if (clexp_idx
== init_nodes
.nelem
)
1034 if (type
== OP_BACK_REF
)
1036 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1037 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1039 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1042 if (err
!= REG_NOERROR
)
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (BE (dfa
->init_state
== NULL
, 0))
1054 if (dfa
->init_state
->has_constraint
)
1056 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1058 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1060 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1064 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1065 || dfa
->init_state_begbuf
== NULL
, 0))
1069 dfa
->init_state_word
= dfa
->init_state_nl
1070 = dfa
->init_state_begbuf
= dfa
->init_state
;
1072 re_node_set_free (&init_nodes
);
1076 #ifdef RE_ENABLE_I18N
1077 /* If it is possible to do searching in single byte encoding instead of UTF-8
1078 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1079 DFA nodes where needed. */
1082 optimize_utf8 (re_dfa_t
*dfa
)
1084 int node
, i
, mb_chars
= 0, has_period
= 0;
1086 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1087 switch (dfa
->nodes
[node
].type
)
1090 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1094 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1102 /* Word anchors etc. cannot be handled. It's okay to test
1103 opr.ctx_type since constraints (for all DFA nodes) are
1104 created by ORing one or more opr.ctx_type values. */
1114 case OP_DUP_ASTERISK
:
1115 case OP_OPEN_SUBEXP
:
1116 case OP_CLOSE_SUBEXP
:
1118 case COMPLEX_BRACKET
:
1120 case SIMPLE_BRACKET
:
1121 /* Just double check. The non-ASCII range starts at 0x80. */
1122 assert (0x80 % BITSET_WORD_BITS
== 0);
1123 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1124 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1131 if (mb_chars
|| has_period
)
1132 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1134 if (dfa
->nodes
[node
].type
== CHARACTER
1135 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1136 dfa
->nodes
[node
].mb_partial
= 0;
1137 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1138 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1141 /* The search can be in single byte locale. */
1142 dfa
->mb_cur_max
= 1;
1144 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1148 /* Analyze the structure tree, and calculate "first", "next", "edest",
1149 "eclosure", and "inveclosure". */
1151 static reg_errcode_t
1152 analyze (regex_t
*preg
)
1154 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1157 /* Allocate arrays. */
1158 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1159 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1160 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1161 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1162 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1163 || dfa
->eclosures
== NULL
, 0))
1166 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1167 if (dfa
->subexp_map
!= NULL
)
1170 for (i
= 0; i
< preg
->re_nsub
; i
++)
1171 dfa
->subexp_map
[i
] = i
;
1172 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1173 for (i
= 0; i
< preg
->re_nsub
; i
++)
1174 if (dfa
->subexp_map
[i
] != i
)
1176 if (i
== preg
->re_nsub
)
1178 free (dfa
->subexp_map
);
1179 dfa
->subexp_map
= NULL
;
1183 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1184 if (BE (ret
!= REG_NOERROR
, 0))
1186 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1187 if (BE (ret
!= REG_NOERROR
, 0))
1189 preorder (dfa
->str_tree
, calc_next
, dfa
);
1190 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1191 if (BE (ret
!= REG_NOERROR
, 0))
1193 ret
= calc_eclosure (dfa
);
1194 if (BE (ret
!= REG_NOERROR
, 0))
1197 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1198 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1199 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1202 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1203 if (BE (dfa
->inveclosures
== NULL
, 0))
1205 ret
= calc_inveclosure (dfa
);
1211 /* Our parse trees are very unbalanced, so we cannot use a stack to
1212 implement parse tree visits. Instead, we use parent pointers and
1213 some hairy code in these two functions. */
1214 static reg_errcode_t
1215 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1218 bin_tree_t
*node
, *prev
;
1220 for (node
= root
; ; )
1222 /* Descend down the tree, preferably to the left (or to the right
1223 if that's the only child). */
1224 while (node
->left
|| node
->right
)
1232 reg_errcode_t err
= fn (extra
, node
);
1233 if (BE (err
!= REG_NOERROR
, 0))
1235 if (node
->parent
== NULL
)
1238 node
= node
->parent
;
1240 /* Go up while we have a node that is reached from the right. */
1241 while (node
->right
== prev
|| node
->right
== NULL
);
1246 static reg_errcode_t
1247 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1252 for (node
= root
; ; )
1254 reg_errcode_t err
= fn (extra
, node
);
1255 if (BE (err
!= REG_NOERROR
, 0))
1258 /* Go to the left node, or up and to the right. */
1263 bin_tree_t
*prev
= NULL
;
1264 while (node
->right
== prev
|| node
->right
== NULL
)
1267 node
= node
->parent
;
1276 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1277 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1278 backreferences as well. Requires a preorder visit. */
1279 static reg_errcode_t
1280 optimize_subexps (void *extra
, bin_tree_t
*node
)
1282 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1284 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1286 int idx
= node
->token
.opr
.idx
;
1287 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1288 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1291 else if (node
->token
.type
== SUBEXP
1292 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1294 int other_idx
= node
->left
->token
.opr
.idx
;
1296 node
->left
= node
->left
->left
;
1298 node
->left
->parent
= node
;
1300 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1301 if (other_idx
< BITSET_WORD_BITS
)
1302 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1308 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1309 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1310 static reg_errcode_t
1311 lower_subexps (void *extra
, bin_tree_t
*node
)
1313 regex_t
*preg
= (regex_t
*) extra
;
1314 reg_errcode_t err
= REG_NOERROR
;
1316 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1318 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1320 node
->left
->parent
= node
;
1322 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1324 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1326 node
->right
->parent
= node
;
1333 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1335 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1336 bin_tree_t
*body
= node
->left
;
1337 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1340 /* We do not optimize empty subexpressions, because otherwise we may
1341 have bad CONCAT nodes with NULL children. This is obviously not
1342 very common, so we do not lose much. An example that triggers
1343 this case is the sed "script" /\(\)/x. */
1344 && node
->left
!= NULL
1345 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1346 || !(dfa
->used_bkref_map
1347 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1350 /* Convert the SUBEXP node to the concatenation of an
1351 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1352 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1353 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1354 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1355 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1356 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1362 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1363 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1367 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1368 nodes. Requires a postorder visit. */
1369 static reg_errcode_t
1370 calc_first (void *extra
, bin_tree_t
*node
)
1372 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1373 if (node
->token
.type
== CONCAT
)
1375 node
->first
= node
->left
->first
;
1376 node
->node_idx
= node
->left
->node_idx
;
1381 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1382 if (BE (node
->node_idx
== -1, 0))
1384 if (node
->token
.type
== ANCHOR
)
1385 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1390 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1391 static reg_errcode_t
1392 calc_next (void *extra
, bin_tree_t
*node
)
1394 switch (node
->token
.type
)
1396 case OP_DUP_ASTERISK
:
1397 node
->left
->next
= node
;
1400 node
->left
->next
= node
->right
->first
;
1401 node
->right
->next
= node
->next
;
1405 node
->left
->next
= node
->next
;
1407 node
->right
->next
= node
->next
;
1413 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1414 static reg_errcode_t
1415 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1417 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1418 int idx
= node
->node_idx
;
1419 reg_errcode_t err
= REG_NOERROR
;
1421 switch (node
->token
.type
)
1427 assert (node
->next
== NULL
);
1430 case OP_DUP_ASTERISK
:
1434 dfa
->has_plural_match
= 1;
1435 if (node
->left
!= NULL
)
1436 left
= node
->left
->first
->node_idx
;
1438 left
= node
->next
->node_idx
;
1439 if (node
->right
!= NULL
)
1440 right
= node
->right
->first
->node_idx
;
1442 right
= node
->next
->node_idx
;
1444 assert (right
> -1);
1445 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1450 case OP_OPEN_SUBEXP
:
1451 case OP_CLOSE_SUBEXP
:
1452 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1456 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1457 if (node
->token
.type
== OP_BACK_REF
)
1458 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1462 assert (!IS_EPSILON_NODE (node
->token
.type
));
1463 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1470 /* Duplicate the epsilon closure of the node ROOT_NODE.
1471 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1472 to their own constraint. */
1474 static reg_errcode_t
1476 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1477 int root_node
, unsigned int init_constraint
)
1479 int org_node
, clone_node
, ret
;
1480 unsigned int constraint
= init_constraint
;
1481 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1483 int org_dest
, clone_dest
;
1484 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1486 /* If the back reference epsilon-transit, its destination must
1487 also have the constraint. Then duplicate the epsilon closure
1488 of the destination of the back reference, and store it in
1489 edests of the back reference. */
1490 org_dest
= dfa
->nexts
[org_node
];
1491 re_node_set_empty (dfa
->edests
+ clone_node
);
1492 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1493 if (BE (clone_dest
== -1, 0))
1495 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1496 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1497 if (BE (ret
< 0, 0))
1500 else if (dfa
->edests
[org_node
].nelem
== 0)
1502 /* In case of the node can't epsilon-transit, don't duplicate the
1503 destination and store the original destination as the
1504 destination of the node. */
1505 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1508 else if (dfa
->edests
[org_node
].nelem
== 1)
1510 /* In case of the node can epsilon-transit, and it has only one
1512 org_dest
= dfa
->edests
[org_node
].elems
[0];
1513 re_node_set_empty (dfa
->edests
+ clone_node
);
1514 /* If the node is root_node itself, it means the epsilon closure
1515 has a loop. Then tie it to the destination of the root_node. */
1516 if (org_node
== root_node
&& clone_node
!= org_node
)
1518 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1519 if (BE (ret
< 0, 0))
1523 /* In case the node has another constraint, append it. */
1524 constraint
|= dfa
->nodes
[org_node
].constraint
;
1525 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1526 if (BE (clone_dest
== -1, 0))
1528 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1529 if (BE (ret
< 0, 0))
1532 else /* dfa->edests[org_node].nelem == 2 */
1534 /* In case of the node can epsilon-transit, and it has two
1535 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1536 org_dest
= dfa
->edests
[org_node
].elems
[0];
1537 re_node_set_empty (dfa
->edests
+ clone_node
);
1538 /* Search for a duplicated node which satisfies the constraint. */
1539 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1540 if (clone_dest
== -1)
1542 /* There is no such duplicated node, create a new one. */
1544 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1545 if (BE (clone_dest
== -1, 0))
1547 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1548 if (BE (ret
< 0, 0))
1550 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1551 root_node
, constraint
);
1552 if (BE (err
!= REG_NOERROR
, 0))
1557 /* There is a duplicated node which satisfies the constraint,
1558 use it to avoid infinite loop. */
1559 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1560 if (BE (ret
< 0, 0))
1564 org_dest
= dfa
->edests
[org_node
].elems
[1];
1565 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1566 if (BE (clone_dest
== -1, 0))
1568 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1569 if (BE (ret
< 0, 0))
1572 org_node
= org_dest
;
1573 clone_node
= clone_dest
;
1578 /* Search for a node which is duplicated from the node ORG_NODE, and
1579 satisfies the constraint CONSTRAINT. */
1582 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1583 unsigned int constraint
)
1586 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1588 if (org_node
== dfa
->org_indices
[idx
]
1589 && constraint
== dfa
->nodes
[idx
].constraint
)
1590 return idx
; /* Found. */
1592 return -1; /* Not found. */
1595 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1596 Return the index of the new node, or -1 if insufficient storage is
1600 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1602 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1603 if (BE (dup_idx
!= -1, 1))
1605 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1606 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1607 dfa
->nodes
[dup_idx
].duplicated
= 1;
1609 /* Store the index of the original node. */
1610 dfa
->org_indices
[dup_idx
] = org_idx
;
1615 static reg_errcode_t
1616 calc_inveclosure (re_dfa_t
*dfa
)
1619 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1620 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1622 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1624 int *elems
= dfa
->eclosures
[src
].elems
;
1625 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1627 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1628 if (BE (ret
== -1, 0))
1636 /* Calculate "eclosure" for all the node in DFA. */
1638 static reg_errcode_t
1639 calc_eclosure (re_dfa_t
*dfa
)
1641 int node_idx
, incomplete
;
1643 assert (dfa
->nodes_len
> 0);
1646 /* For each nodes, calculate epsilon closure. */
1647 for (node_idx
= 0; ; ++node_idx
)
1650 re_node_set eclosure_elem
;
1651 if (node_idx
== dfa
->nodes_len
)
1660 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1663 /* If we have already calculated, skip it. */
1664 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1666 /* Calculate epsilon closure of 'node_idx'. */
1667 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1668 if (BE (err
!= REG_NOERROR
, 0))
1671 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1674 re_node_set_free (&eclosure_elem
);
1680 /* Calculate epsilon closure of NODE. */
1682 static reg_errcode_t
1683 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1687 re_node_set eclosure
;
1690 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1691 if (BE (err
!= REG_NOERROR
, 0))
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa
->eclosures
[node
].nelem
= -1;
1698 /* If the current node has constraints, duplicate all nodes
1699 since they must inherit the constraints. */
1700 if (dfa
->nodes
[node
].constraint
1701 && dfa
->edests
[node
].nelem
1702 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1704 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1705 dfa
->nodes
[node
].constraint
);
1706 if (BE (err
!= REG_NOERROR
, 0))
1710 /* Expand each epsilon destination nodes. */
1711 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1712 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1714 re_node_set eclosure_elem
;
1715 int edest
= dfa
->edests
[node
].elems
[i
];
1716 /* If calculating the epsilon closure of `edest' is in progress,
1717 return intermediate result. */
1718 if (dfa
->eclosures
[edest
].nelem
== -1)
1723 /* If we haven't calculated the epsilon closure of `edest' yet,
1724 calculate now. Otherwise use calculated epsilon closure. */
1725 if (dfa
->eclosures
[edest
].nelem
== 0)
1727 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1728 if (BE (err
!= REG_NOERROR
, 0))
1732 eclosure_elem
= dfa
->eclosures
[edest
];
1733 /* Merge the epsilon closure of 'edest'. */
1734 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1735 if (BE (err
!= REG_NOERROR
, 0))
1737 /* If the epsilon closure of 'edest' is incomplete,
1738 the epsilon closure of this node is also incomplete. */
1739 if (dfa
->eclosures
[edest
].nelem
== 0)
1742 re_node_set_free (&eclosure_elem
);
1746 /* An epsilon closure includes itself. */
1747 ret
= re_node_set_insert (&eclosure
, node
);
1748 if (BE (ret
< 0, 0))
1750 if (incomplete
&& !root
)
1751 dfa
->eclosures
[node
].nelem
= 0;
1753 dfa
->eclosures
[node
] = eclosure
;
1754 *new_set
= eclosure
;
1758 /* Functions for token which are used in the parser. */
1760 /* Fetch a token from INPUT.
1761 We must not use this function inside bracket expressions. */
1765 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1767 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1770 /* Peek a token from INPUT, and return the length of the token.
1771 We must not use this function inside bracket expressions. */
1775 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1779 if (re_string_eoi (input
))
1781 token
->type
= END_OF_RE
;
1785 c
= re_string_peek_byte (input
, 0);
1788 token
->word_char
= 0;
1789 #ifdef RE_ENABLE_I18N
1790 token
->mb_partial
= 0;
1791 if (input
->mb_cur_max
> 1 &&
1792 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1794 token
->type
= CHARACTER
;
1795 token
->mb_partial
= 1;
1802 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1804 token
->type
= BACK_SLASH
;
1808 c2
= re_string_peek_byte_case (input
, 1);
1810 token
->type
= CHARACTER
;
1811 #ifdef RE_ENABLE_I18N
1812 if (input
->mb_cur_max
> 1)
1814 wint_t wc
= re_string_wchar_at (input
,
1815 re_string_cur_idx (input
) + 1);
1816 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1820 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1825 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1826 token
->type
= OP_ALT
;
1828 case '1': case '2': case '3': case '4': case '5':
1829 case '6': case '7': case '8': case '9':
1830 if (!(syntax
& RE_NO_BK_REFS
))
1832 token
->type
= OP_BACK_REF
;
1833 token
->opr
.idx
= c2
- '1';
1837 if (!(syntax
& RE_NO_GNU_OPS
))
1839 token
->type
= ANCHOR
;
1840 token
->opr
.ctx_type
= WORD_FIRST
;
1844 if (!(syntax
& RE_NO_GNU_OPS
))
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= WORD_LAST
;
1851 if (!(syntax
& RE_NO_GNU_OPS
))
1853 token
->type
= ANCHOR
;
1854 token
->opr
.ctx_type
= WORD_DELIM
;
1858 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= ANCHOR
;
1861 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1866 token
->type
= OP_WORD
;
1869 if (!(syntax
& RE_NO_GNU_OPS
))
1870 token
->type
= OP_NOTWORD
;
1873 if (!(syntax
& RE_NO_GNU_OPS
))
1874 token
->type
= OP_SPACE
;
1877 if (!(syntax
& RE_NO_GNU_OPS
))
1878 token
->type
= OP_NOTSPACE
;
1881 if (!(syntax
& RE_NO_GNU_OPS
))
1883 token
->type
= ANCHOR
;
1884 token
->opr
.ctx_type
= BUF_FIRST
;
1888 if (!(syntax
& RE_NO_GNU_OPS
))
1890 token
->type
= ANCHOR
;
1891 token
->opr
.ctx_type
= BUF_LAST
;
1895 if (!(syntax
& RE_NO_BK_PARENS
))
1896 token
->type
= OP_OPEN_SUBEXP
;
1899 if (!(syntax
& RE_NO_BK_PARENS
))
1900 token
->type
= OP_CLOSE_SUBEXP
;
1903 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1904 token
->type
= OP_DUP_PLUS
;
1907 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1908 token
->type
= OP_DUP_QUESTION
;
1911 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1912 token
->type
= OP_OPEN_DUP_NUM
;
1915 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1916 token
->type
= OP_CLOSE_DUP_NUM
;
1924 token
->type
= CHARACTER
;
1925 #ifdef RE_ENABLE_I18N
1926 if (input
->mb_cur_max
> 1)
1928 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1929 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1933 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1938 if (syntax
& RE_NEWLINE_ALT
)
1939 token
->type
= OP_ALT
;
1942 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1943 token
->type
= OP_ALT
;
1946 token
->type
= OP_DUP_ASTERISK
;
1949 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1950 token
->type
= OP_DUP_PLUS
;
1953 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1954 token
->type
= OP_DUP_QUESTION
;
1957 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1958 token
->type
= OP_OPEN_DUP_NUM
;
1961 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1962 token
->type
= OP_CLOSE_DUP_NUM
;
1965 if (syntax
& RE_NO_BK_PARENS
)
1966 token
->type
= OP_OPEN_SUBEXP
;
1969 if (syntax
& RE_NO_BK_PARENS
)
1970 token
->type
= OP_CLOSE_SUBEXP
;
1973 token
->type
= OP_OPEN_BRACKET
;
1976 token
->type
= OP_PERIOD
;
1979 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1980 re_string_cur_idx (input
) != 0)
1982 char prev
= re_string_peek_byte (input
, -1);
1983 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1986 token
->type
= ANCHOR
;
1987 token
->opr
.ctx_type
= LINE_FIRST
;
1990 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1991 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1994 re_string_skip_bytes (input
, 1);
1995 peek_token (&next
, input
, syntax
);
1996 re_string_skip_bytes (input
, -1);
1997 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2000 token
->type
= ANCHOR
;
2001 token
->opr
.ctx_type
= LINE_LAST
;
2009 /* Peek a token from INPUT, and return the length of the token.
2010 We must not use this function out of bracket expressions. */
2014 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2017 if (re_string_eoi (input
))
2019 token
->type
= END_OF_RE
;
2022 c
= re_string_peek_byte (input
, 0);
2025 #ifdef RE_ENABLE_I18N
2026 if (input
->mb_cur_max
> 1 &&
2027 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2029 token
->type
= CHARACTER
;
2032 #endif /* RE_ENABLE_I18N */
2034 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2035 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2037 /* In this case, '\' escape a character. */
2039 re_string_skip_bytes (input
, 1);
2040 c2
= re_string_peek_byte (input
, 0);
2042 token
->type
= CHARACTER
;
2045 if (c
== '[') /* '[' is a special char in a bracket exps. */
2049 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2050 c2
= re_string_peek_byte (input
, 1);
2058 token
->type
= OP_OPEN_COLL_ELEM
;
2061 token
->type
= OP_OPEN_EQUIV_CLASS
;
2064 if (syntax
& RE_CHAR_CLASSES
)
2066 token
->type
= OP_OPEN_CHAR_CLASS
;
2069 /* else fall through. */
2071 token
->type
= CHARACTER
;
2081 token
->type
= OP_CHARSET_RANGE
;
2084 token
->type
= OP_CLOSE_BRACKET
;
2087 token
->type
= OP_NON_MATCH_LIST
;
2090 token
->type
= CHARACTER
;
2095 /* Functions for parser. */
2097 /* Entry point of the parser.
2098 Parse the regular expression REGEXP and return the structure tree.
2099 If an error occurs, ERR is set by error code, and return NULL.
2100 This function build the following tree, from regular expression <reg_exp>:
2106 CAT means concatenation.
2107 EOR means end of regular expression. */
2110 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2113 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2114 bin_tree_t
*tree
, *eor
, *root
;
2115 re_token_t current_token
;
2116 dfa
->syntax
= syntax
;
2117 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2118 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2119 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2121 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2123 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2126 if (BE (eor
== NULL
|| root
== NULL
, 0))
2134 /* This function build the following tree, from regular expression
2135 <branch1>|<branch2>:
2141 ALT means alternative, which represents the operator '|'. */
2144 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2145 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2147 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2148 bin_tree_t
*tree
, *branch
= NULL
;
2149 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2150 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2153 while (token
->type
== OP_ALT
)
2155 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2156 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2157 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2159 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2160 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2163 postorder (tree
, free_tree
, NULL
);
2169 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2170 if (BE (tree
== NULL
, 0))
2179 /* This function build the following tree, from regular expression
2186 CAT means concatenation. */
2189 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2190 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2192 bin_tree_t
*tree
, *exp
;
2193 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2194 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2195 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2198 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2199 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2201 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2202 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2205 postorder (tree
, free_tree
, NULL
);
2208 if (tree
!= NULL
&& exp
!= NULL
)
2210 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2211 if (newtree
== NULL
)
2213 postorder (exp
, free_tree
, NULL
);
2214 postorder (tree
, free_tree
, NULL
);
2220 else if (tree
== NULL
)
2222 /* Otherwise exp == NULL, we don't need to create new tree. */
2227 /* This function build the following tree, from regular expression a*:
2234 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2235 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2237 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2239 switch (token
->type
)
2242 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2243 if (BE (tree
== NULL
, 0))
2248 #ifdef RE_ENABLE_I18N
2249 if (dfa
->mb_cur_max
> 1)
2251 while (!re_string_eoi (regexp
)
2252 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2254 bin_tree_t
*mbc_remain
;
2255 fetch_token (token
, regexp
, syntax
);
2256 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2257 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2258 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2267 case OP_OPEN_SUBEXP
:
2268 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2269 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2272 case OP_OPEN_BRACKET
:
2273 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2274 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2278 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2283 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2284 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2285 if (BE (tree
== NULL
, 0))
2291 dfa
->has_mb_node
= 1;
2293 case OP_OPEN_DUP_NUM
:
2294 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2300 case OP_DUP_ASTERISK
:
2302 case OP_DUP_QUESTION
:
2303 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2308 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2310 fetch_token (token
, regexp
, syntax
);
2311 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2313 /* else fall through */
2314 case OP_CLOSE_SUBEXP
:
2315 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2316 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2321 /* else fall through */
2322 case OP_CLOSE_DUP_NUM
:
2323 /* We treat it as a normal character. */
2325 /* Then we can these characters as normal characters. */
2326 token
->type
= CHARACTER
;
2327 /* mb_partial and word_char bits should be initialized already
2329 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2330 if (BE (tree
== NULL
, 0))
2337 if ((token
->opr
.ctx_type
2338 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2339 && dfa
->word_ops_used
== 0)
2340 init_word_char (dfa
);
2341 if (token
->opr
.ctx_type
== WORD_DELIM
2342 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2344 bin_tree_t
*tree_first
, *tree_last
;
2345 if (token
->opr
.ctx_type
== WORD_DELIM
)
2347 token
->opr
.ctx_type
= WORD_FIRST
;
2348 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2349 token
->opr
.ctx_type
= WORD_LAST
;
2353 token
->opr
.ctx_type
= INSIDE_WORD
;
2354 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2355 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2357 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2358 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2359 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2367 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2368 if (BE (tree
== NULL
, 0))
2374 /* We must return here, since ANCHORs can't be followed
2375 by repetition operators.
2376 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2377 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2378 fetch_token (token
, regexp
, syntax
);
2381 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2382 if (BE (tree
== NULL
, 0))
2387 if (dfa
->mb_cur_max
> 1)
2388 dfa
->has_mb_node
= 1;
2392 tree
= build_charclass_op (dfa
, regexp
->trans
,
2393 (const unsigned char *) "alnum",
2394 (const unsigned char *) "_",
2395 token
->type
== OP_NOTWORD
, err
);
2396 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2401 tree
= build_charclass_op (dfa
, regexp
->trans
,
2402 (const unsigned char *) "space",
2403 (const unsigned char *) "",
2404 token
->type
== OP_NOTSPACE
, err
);
2405 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2415 /* Must not happen? */
2421 fetch_token (token
, regexp
, syntax
);
2423 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2424 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2426 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2427 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2430 postorder (tree
, free_tree
, NULL
);
2434 /* In BRE consecutive duplications are not allowed. */
2435 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2436 && (token
->type
== OP_DUP_ASTERISK
2437 || token
->type
== OP_OPEN_DUP_NUM
))
2440 postorder (tree
, free_tree
, NULL
);
2449 /* This function build the following tree, from regular expression
2457 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2458 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2460 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2463 cur_nsub
= preg
->re_nsub
++;
2465 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2467 /* The subexpression may be a null string. */
2468 if (token
->type
== OP_CLOSE_SUBEXP
)
2472 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2473 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2476 postorder (tree
, free_tree
, NULL
);
2479 if (BE (*err
!= REG_NOERROR
, 0))
2483 if (cur_nsub
<= '9' - '1')
2484 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2486 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2487 if (BE (tree
== NULL
, 0))
2492 tree
->token
.opr
.idx
= cur_nsub
;
2496 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2499 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2500 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2502 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2503 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2504 re_token_t start_token
= *token
;
2506 if (token
->type
== OP_OPEN_DUP_NUM
)
2509 start
= fetch_number (regexp
, token
, syntax
);
2512 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2513 start
= 0; /* We treat "{,m}" as "{0,m}". */
2516 *err
= REG_BADBR
; /* <re>{} is invalid. */
2520 if (BE (start
!= -2, 1))
2522 /* We treat "{n}" as "{n,n}". */
2523 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2524 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2525 ? fetch_number (regexp
, token
, syntax
) : -2));
2527 if (BE (start
== -2 || end
== -2, 0))
2529 /* Invalid sequence. */
2530 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2532 if (token
->type
== END_OF_RE
)
2540 /* If the syntax bit is set, rollback. */
2541 re_string_set_index (regexp
, start_idx
);
2542 *token
= start_token
;
2543 token
->type
= CHARACTER
;
2544 /* mb_partial and word_char bits should be already initialized by
2549 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2551 /* First number greater than second. */
2558 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2559 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2562 fetch_token (token
, regexp
, syntax
);
2564 if (BE (elem
== NULL
, 0))
2566 if (BE (start
== 0 && end
== 0, 0))
2568 postorder (elem
, free_tree
, NULL
);
2572 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2573 if (BE (start
> 0, 0))
2576 for (i
= 2; i
<= start
; ++i
)
2578 elem
= duplicate_tree (elem
, dfa
);
2579 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2580 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2581 goto parse_dup_op_espace
;
2587 /* Duplicate ELEM before it is marked optional. */
2588 elem
= duplicate_tree (elem
, dfa
);
2589 if (BE (elem
== NULL
, 0))
2590 goto parse_dup_op_espace
;
2596 if (elem
->token
.type
== SUBEXP
)
2597 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2599 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2600 if (BE (tree
== NULL
, 0))
2601 goto parse_dup_op_espace
;
2603 /* This loop is actually executed only when end != -1,
2604 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2605 already created the start+1-th copy. */
2606 for (i
= start
+ 2; i
<= end
; ++i
)
2608 elem
= duplicate_tree (elem
, dfa
);
2609 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2610 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2611 goto parse_dup_op_espace
;
2613 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2614 if (BE (tree
== NULL
, 0))
2615 goto parse_dup_op_espace
;
2619 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2623 parse_dup_op_espace
:
2628 /* Size of the names for collating symbol/equivalence_class/character_class.
2629 I'm not sure, but maybe enough. */
2630 #define BRACKET_NAME_BUF_SIZE 32
2633 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2634 Build the range expression which starts from START_ELEM, and ends
2635 at END_ELEM. The result are written to MBCSET and SBCSET.
2636 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2637 mbcset->range_ends, is a pointer argument since we may
2640 static reg_errcode_t
2642 # ifdef RE_ENABLE_I18N
2643 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2644 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2645 # else /* not RE_ENABLE_I18N */
2646 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2647 bracket_elem_t
*end_elem
)
2648 # endif /* not RE_ENABLE_I18N */
2650 unsigned int start_ch
, end_ch
;
2651 /* Equivalence Classes and Character Classes can't be a range start/end. */
2652 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2653 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2657 /* We can handle no multi character collating elements without libc
2659 if (BE ((start_elem
->type
== COLL_SYM
2660 && strlen ((char *) start_elem
->opr
.name
) > 1)
2661 || (end_elem
->type
== COLL_SYM
2662 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2663 return REG_ECOLLATE
;
2665 # ifdef RE_ENABLE_I18N
2670 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2672 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2673 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2675 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2676 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2678 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2679 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2680 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2681 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2682 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2683 return REG_ECOLLATE
;
2684 cmp_buf
[0] = start_wc
;
2685 cmp_buf
[4] = end_wc
;
2686 if (__wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start
, *new_array_end
;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges
= 2 * mbcset
->nranges
+ 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2709 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2712 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2715 mbcset
->range_starts
= new_array_start
;
2716 mbcset
->range_ends
= new_array_end
;
2717 *range_alloc
= new_nranges
;
2720 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2721 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2724 /* Build the table for single byte characters. */
2725 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2728 if (__wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2729 && __wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2730 bitset_set (sbcset
, wc
);
2733 # else /* not RE_ENABLE_I18N */
2736 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2737 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2739 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2740 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2742 if (start_ch
> end_ch
)
2744 /* Build the table for single byte characters. */
2745 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2746 if (start_ch
<= ch
&& ch
<= end_ch
)
2747 bitset_set (sbcset
, ch
);
2749 # endif /* not RE_ENABLE_I18N */
2752 #endif /* not _LIBC */
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2761 static reg_errcode_t
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2765 int *coll_sym_alloc
, const unsigned char *name
)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2768 # endif /* not RE_ENABLE_I18N */
2770 size_t name_len
= strlen ((const char *) name
);
2771 if (BE (name_len
!= 1, 0))
2772 return REG_ECOLLATE
;
2775 bitset_set (sbcset
, name
[0]);
2779 #endif /* not _LIBC */
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2785 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2786 reg_syntax_t syntax
, reg_errcode_t
*err
)
2789 const unsigned char *collseqmb
;
2790 const char *collseqwc
;
2793 const int32_t *symb_table
;
2794 const unsigned char *extra
;
2796 /* Local function for parse_bracket_exp used in _LIBC environment.
2797 Seek the collating symbol entry corresponding to NAME.
2798 Return the index of the symbol in the SYMB_TABLE,
2799 or -1 if not found. */
2802 __attribute__ ((always_inline
))
2803 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2807 for (elem
= 0; elem
< table_size
; elem
++)
2808 if (symb_table
[2 * elem
] != 0)
2810 int32_t idx
= symb_table
[2 * elem
+ 1];
2811 /* Skip the name of collating element name. */
2812 idx
+= 1 + extra
[idx
];
2813 if (/* Compare the length of the name. */
2814 name_len
== extra
[idx
]
2815 /* Compare the name. */
2816 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2817 /* Yep, this is the entry. */
2823 /* Local function for parse_bracket_exp used in _LIBC environment.
2824 Look up the collation sequence value of BR_ELEM.
2825 Return the value if succeeded, UINT_MAX otherwise. */
2827 auto inline unsigned int
2828 __attribute__ ((always_inline
))
2829 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2831 if (br_elem
->type
== SB_CHAR
)
2834 if (MB_CUR_MAX == 1)
2837 return collseqmb
[br_elem
->opr
.ch
];
2840 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2841 return __collseq_table_lookup (collseqwc
, wc
);
2844 else if (br_elem
->type
== MB_CHAR
)
2847 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2849 else if (br_elem
->type
== COLL_SYM
)
2851 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2855 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2859 /* We found the entry. */
2860 idx
= symb_table
[2 * elem
+ 1];
2861 /* Skip the name of collating element name. */
2862 idx
+= 1 + extra
[idx
];
2863 /* Skip the byte sequence of the collating element. */
2864 idx
+= 1 + extra
[idx
];
2865 /* Adjust for the alignment. */
2866 idx
= (idx
+ 3) & ~3;
2867 /* Skip the multibyte collation sequence value. */
2868 idx
+= sizeof (unsigned int);
2869 /* Skip the wide char sequence of the collating element. */
2870 idx
+= sizeof (unsigned int) *
2871 (1 + *(unsigned int *) (extra
+ idx
));
2872 /* Return the collation sequence value. */
2873 return *(unsigned int *) (extra
+ idx
);
2875 else if (sym_name_len
== 1)
2877 /* No valid character. Match it as a single byte
2879 return collseqmb
[br_elem
->opr
.name
[0]];
2882 else if (sym_name_len
== 1)
2883 return collseqmb
[br_elem
->opr
.name
[0]];
2888 /* Local function for parse_bracket_exp used in _LIBC environment.
2889 Build the range expression which starts from START_ELEM, and ends
2890 at END_ELEM. The result are written to MBCSET and SBCSET.
2891 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892 mbcset->range_ends, is a pointer argument since we may
2895 auto inline reg_errcode_t
2896 __attribute__ ((always_inline
))
2897 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2898 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2901 uint32_t start_collseq
;
2902 uint32_t end_collseq
;
2904 /* Equivalence Classes and Character Classes can't be a range
2906 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2907 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2911 start_collseq
= lookup_collation_sequence_value (start_elem
);
2912 end_collseq
= lookup_collation_sequence_value (end_elem
);
2913 /* Check start/end collation sequence values. */
2914 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2915 return REG_ECOLLATE
;
2916 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2919 /* Got valid collation sequence values, add them as a new entry.
2920 However, if we have no collation elements, and the character set
2921 is single byte, the single byte character set that we
2922 build below suffices. */
2923 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2925 /* Check the space of the arrays. */
2926 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2928 /* There is not enough space, need realloc. */
2929 uint32_t *new_array_start
;
2930 uint32_t *new_array_end
;
2933 /* +1 in case of mbcset->nranges is 0. */
2934 new_nranges
= 2 * mbcset
->nranges
+ 1;
2935 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2937 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2940 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2943 mbcset
->range_starts
= new_array_start
;
2944 mbcset
->range_ends
= new_array_end
;
2945 *range_alloc
= new_nranges
;
2948 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2949 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2952 /* Build the table for single byte characters. */
2953 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2955 uint32_t ch_collseq
;
2957 if (MB_CUR_MAX == 1)
2960 ch_collseq
= collseqmb
[ch
];
2962 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2963 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2964 bitset_set (sbcset
, ch
);
2969 /* Local function for parse_bracket_exp used in _LIBC environment.
2970 Build the collating element which is represented by NAME.
2971 The result are written to MBCSET and SBCSET.
2972 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2973 pointer argument since we may update it. */
2975 auto inline reg_errcode_t
2976 __attribute__ ((always_inline
))
2977 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2978 int *coll_sym_alloc
, const unsigned char *name
)
2981 size_t name_len
= strlen ((const char *) name
);
2984 elem
= seek_collating_symbol_entry (name
, name_len
);
2987 /* We found the entry. */
2988 idx
= symb_table
[2 * elem
+ 1];
2989 /* Skip the name of collating element name. */
2990 idx
+= 1 + extra
[idx
];
2992 else if (name_len
== 1)
2994 /* No valid character, treat it as a normal
2996 bitset_set (sbcset
, name
[0]);
3000 return REG_ECOLLATE
;
3002 /* Got valid collation sequence, add it as a new entry. */
3003 /* Check the space of the arrays. */
3004 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3006 /* Not enough, realloc it. */
3007 /* +1 in case of mbcset->ncoll_syms is 0. */
3008 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3009 /* Use realloc since mbcset->coll_syms is NULL
3011 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3012 new_coll_sym_alloc
);
3013 if (BE (new_coll_syms
== NULL
, 0))
3015 mbcset
->coll_syms
= new_coll_syms
;
3016 *coll_sym_alloc
= new_coll_sym_alloc
;
3018 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3023 if (BE (name_len
!= 1, 0))
3024 return REG_ECOLLATE
;
3027 bitset_set (sbcset
, name
[0]);
3034 re_token_t br_token
;
3035 re_bitset_ptr_t sbcset
;
3036 #ifdef RE_ENABLE_I18N
3037 re_charset_t
*mbcset
;
3038 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3039 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3040 #endif /* not RE_ENABLE_I18N */
3042 bin_tree_t
*work_tree
;
3044 int first_round
= 1;
3046 collseqmb
= (const unsigned char *)
3047 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3048 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3054 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3055 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3056 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3057 _NL_COLLATE_SYMB_TABLEMB
);
3058 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3059 _NL_COLLATE_SYMB_EXTRAMB
);
3062 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3063 #ifdef RE_ENABLE_I18N
3064 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3065 #endif /* RE_ENABLE_I18N */
3066 #ifdef RE_ENABLE_I18N
3067 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3069 if (BE (sbcset
== NULL
, 0))
3070 #endif /* RE_ENABLE_I18N */
3073 #ifdef RE_ENABLE_I18N
3080 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3081 if (BE (token
->type
== END_OF_RE
, 0))
3084 goto parse_bracket_exp_free_return
;
3086 if (token
->type
== OP_NON_MATCH_LIST
)
3088 #ifdef RE_ENABLE_I18N
3089 mbcset
->non_match
= 1;
3090 #endif /* not RE_ENABLE_I18N */
3092 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3093 bitset_set (sbcset
, '\n');
3094 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3095 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3096 if (BE (token
->type
== END_OF_RE
, 0))
3099 goto parse_bracket_exp_free_return
;
3103 /* We treat the first ']' as a normal character. */
3104 if (token
->type
== OP_CLOSE_BRACKET
)
3105 token
->type
= CHARACTER
;
3109 bracket_elem_t start_elem
, end_elem
;
3110 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3111 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3113 int token_len2
= 0, is_range_exp
= 0;
3116 start_elem
.opr
.name
= start_name_buf
;
3117 start_elem
.type
= COLL_SYM
;
3118 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3119 syntax
, first_round
);
3120 if (BE (ret
!= REG_NOERROR
, 0))
3123 goto parse_bracket_exp_free_return
;
3127 /* Get information about the next token. We need it in any case. */
3128 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3130 /* Do not check for ranges if we know they are not allowed. */
3131 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3133 if (BE (token
->type
== END_OF_RE
, 0))
3136 goto parse_bracket_exp_free_return
;
3138 if (token
->type
== OP_CHARSET_RANGE
)
3140 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3141 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3142 if (BE (token2
.type
== END_OF_RE
, 0))
3145 goto parse_bracket_exp_free_return
;
3147 if (token2
.type
== OP_CLOSE_BRACKET
)
3149 /* We treat the last '-' as a normal character. */
3150 re_string_skip_bytes (regexp
, -token_len
);
3151 token
->type
= CHARACTER
;
3158 if (is_range_exp
== 1)
3160 end_elem
.opr
.name
= end_name_buf
;
3161 end_elem
.type
= COLL_SYM
;
3162 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3164 if (BE (ret
!= REG_NOERROR
, 0))
3167 goto parse_bracket_exp_free_return
;
3170 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3173 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3174 &start_elem
, &end_elem
);
3176 # ifdef RE_ENABLE_I18N
3177 *err
= build_range_exp (sbcset
,
3178 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3179 &range_alloc
, &start_elem
, &end_elem
);
3181 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3183 #endif /* RE_ENABLE_I18N */
3184 if (BE (*err
!= REG_NOERROR
, 0))
3185 goto parse_bracket_exp_free_return
;
3189 switch (start_elem
.type
)
3192 bitset_set (sbcset
, start_elem
.opr
.ch
);
3194 #ifdef RE_ENABLE_I18N
3196 /* Check whether the array has enough space. */
3197 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3199 wchar_t *new_mbchars
;
3200 /* Not enough, realloc it. */
3201 /* +1 in case of mbcset->nmbchars is 0. */
3202 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3203 /* Use realloc since array is NULL if *alloc == 0. */
3204 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3206 if (BE (new_mbchars
== NULL
, 0))
3207 goto parse_bracket_exp_espace
;
3208 mbcset
->mbchars
= new_mbchars
;
3210 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3212 #endif /* RE_ENABLE_I18N */
3214 *err
= build_equiv_class (sbcset
,
3215 #ifdef RE_ENABLE_I18N
3216 mbcset
, &equiv_class_alloc
,
3217 #endif /* RE_ENABLE_I18N */
3218 start_elem
.opr
.name
);
3219 if (BE (*err
!= REG_NOERROR
, 0))
3220 goto parse_bracket_exp_free_return
;
3223 *err
= build_collating_symbol (sbcset
,
3224 #ifdef RE_ENABLE_I18N
3225 mbcset
, &coll_sym_alloc
,
3226 #endif /* RE_ENABLE_I18N */
3227 start_elem
.opr
.name
);
3228 if (BE (*err
!= REG_NOERROR
, 0))
3229 goto parse_bracket_exp_free_return
;
3232 *err
= build_charclass (regexp
->trans
, sbcset
,
3233 #ifdef RE_ENABLE_I18N
3234 mbcset
, &char_class_alloc
,
3235 #endif /* RE_ENABLE_I18N */
3236 start_elem
.opr
.name
, syntax
);
3237 if (BE (*err
!= REG_NOERROR
, 0))
3238 goto parse_bracket_exp_free_return
;
3245 if (BE (token
->type
== END_OF_RE
, 0))
3248 goto parse_bracket_exp_free_return
;
3250 if (token
->type
== OP_CLOSE_BRACKET
)
3254 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3256 /* If it is non-matching list. */
3258 bitset_not (sbcset
);
3260 #ifdef RE_ENABLE_I18N
3261 /* Ensure only single byte characters are set. */
3262 if (dfa
->mb_cur_max
> 1)
3263 bitset_mask (sbcset
, dfa
->sb_char
);
3265 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3266 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3267 || mbcset
->non_match
)))
3269 bin_tree_t
*mbc_tree
;
3271 /* Build a tree for complex bracket. */
3272 dfa
->has_mb_node
= 1;
3273 br_token
.type
= COMPLEX_BRACKET
;
3274 br_token
.opr
.mbcset
= mbcset
;
3275 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3276 if (BE (mbc_tree
== NULL
, 0))
3277 goto parse_bracket_exp_espace
;
3278 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3279 if (sbcset
[sbc_idx
])
3281 /* If there are no bits set in sbcset, there is no point
3282 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3283 if (sbc_idx
< BITSET_WORDS
)
3285 /* Build a tree for simple bracket. */
3286 br_token
.type
= SIMPLE_BRACKET
;
3287 br_token
.opr
.sbcset
= sbcset
;
3288 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3289 if (BE (work_tree
== NULL
, 0))
3290 goto parse_bracket_exp_espace
;
3292 /* Then join them by ALT node. */
3293 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3294 if (BE (work_tree
== NULL
, 0))
3295 goto parse_bracket_exp_espace
;
3300 work_tree
= mbc_tree
;
3304 #endif /* not RE_ENABLE_I18N */
3306 #ifdef RE_ENABLE_I18N
3307 free_charset (mbcset
);
3309 /* Build a tree for simple bracket. */
3310 br_token
.type
= SIMPLE_BRACKET
;
3311 br_token
.opr
.sbcset
= sbcset
;
3312 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3313 if (BE (work_tree
== NULL
, 0))
3314 goto parse_bracket_exp_espace
;
3318 parse_bracket_exp_espace
:
3320 parse_bracket_exp_free_return
:
3322 #ifdef RE_ENABLE_I18N
3323 free_charset (mbcset
);
3324 #endif /* RE_ENABLE_I18N */
3328 /* Parse an element in the bracket expression. */
3330 static reg_errcode_t
3331 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3332 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3333 reg_syntax_t syntax
, int accept_hyphen
)
3335 #ifdef RE_ENABLE_I18N
3337 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3338 if (cur_char_size
> 1)
3340 elem
->type
= MB_CHAR
;
3341 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3342 re_string_skip_bytes (regexp
, cur_char_size
);
3345 #endif /* RE_ENABLE_I18N */
3346 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3347 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3348 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3349 return parse_bracket_symbol (elem
, regexp
, token
);
3350 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3352 /* A '-' must only appear as anything but a range indicator before
3353 the closing bracket. Everything else is an error. */
3355 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3356 if (token2
.type
!= OP_CLOSE_BRACKET
)
3357 /* The actual error value is not standardized since this whole
3358 case is undefined. But ERANGE makes good sense. */
3361 elem
->type
= SB_CHAR
;
3362 elem
->opr
.ch
= token
->opr
.c
;
3366 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3367 such as [:<character_class>:], [.<collating_element>.], and
3368 [=<equivalent_class>=]. */
3370 static reg_errcode_t
3371 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3374 unsigned char ch
, delim
= token
->opr
.c
;
3376 if (re_string_eoi(regexp
))
3380 if (i
>= BRACKET_NAME_BUF_SIZE
)
3382 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3383 ch
= re_string_fetch_byte_case (regexp
);
3385 ch
= re_string_fetch_byte (regexp
);
3386 if (re_string_eoi(regexp
))
3388 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3390 elem
->opr
.name
[i
] = ch
;
3392 re_string_skip_bytes (regexp
, 1);
3393 elem
->opr
.name
[i
] = '\0';
3394 switch (token
->type
)
3396 case OP_OPEN_COLL_ELEM
:
3397 elem
->type
= COLL_SYM
;
3399 case OP_OPEN_EQUIV_CLASS
:
3400 elem
->type
= EQUIV_CLASS
;
3402 case OP_OPEN_CHAR_CLASS
:
3403 elem
->type
= CHAR_CLASS
;
3411 /* Helper function for parse_bracket_exp.
3412 Build the equivalence class which is represented by NAME.
3413 The result are written to MBCSET and SBCSET.
3414 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3415 is a pointer argument since we may update it. */
3417 static reg_errcode_t
3418 #ifdef RE_ENABLE_I18N
3419 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3420 int *equiv_class_alloc
, const unsigned char *name
)
3421 #else /* not RE_ENABLE_I18N */
3422 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3423 #endif /* not RE_ENABLE_I18N */
3426 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3429 const int32_t *table
, *indirect
;
3430 const unsigned char *weights
, *extra
, *cp
;
3431 unsigned char char_buf
[2];
3435 /* Calculate the index for equivalence class. */
3437 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3438 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3439 _NL_COLLATE_WEIGHTMB
);
3440 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3441 _NL_COLLATE_EXTRAMB
);
3442 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3443 _NL_COLLATE_INDIRECTMB
);
3444 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3445 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3446 /* This isn't a valid character. */
3447 return REG_ECOLLATE
;
3449 /* Build single byte matcing table for this equivalence class. */
3450 len
= weights
[idx1
& 0xffffff];
3451 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3455 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3460 /* This isn't a valid character. */
3462 /* Compare only if the length matches and the collation rule
3463 index is the same. */
3464 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3468 while (cnt
<= len
&&
3469 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3470 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3474 bitset_set (sbcset
, ch
);
3477 /* Check whether the array has enough space. */
3478 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3480 /* Not enough, realloc it. */
3481 /* +1 in case of mbcset->nequiv_classes is 0. */
3482 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3483 /* Use realloc since the array is NULL if *alloc == 0. */
3484 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3486 new_equiv_class_alloc
);
3487 if (BE (new_equiv_classes
== NULL
, 0))
3489 mbcset
->equiv_classes
= new_equiv_classes
;
3490 *equiv_class_alloc
= new_equiv_class_alloc
;
3492 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3497 if (BE (strlen ((const char *) name
) != 1, 0))
3498 return REG_ECOLLATE
;
3499 bitset_set (sbcset
, *name
);
3504 /* Helper function for parse_bracket_exp.
3505 Build the character class which is represented by NAME.
3506 The result are written to MBCSET and SBCSET.
3507 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3508 is a pointer argument since we may update it. */
3510 static reg_errcode_t
3511 #ifdef RE_ENABLE_I18N
3512 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3513 re_charset_t
*mbcset
, int *char_class_alloc
,
3514 const unsigned char *class_name
, reg_syntax_t syntax
)
3515 #else /* not RE_ENABLE_I18N */
3516 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3517 const unsigned char *class_name
, reg_syntax_t syntax
)
3518 #endif /* not RE_ENABLE_I18N */
3521 const char *name
= (const char *) class_name
;
3523 /* In case of REG_ICASE "upper" and "lower" match the both of
3524 upper and lower cases. */
3525 if ((syntax
& RE_ICASE
)
3526 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3529 #ifdef RE_ENABLE_I18N
3530 /* Check the space of the arrays. */
3531 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3533 /* Not enough, realloc it. */
3534 /* +1 in case of mbcset->nchar_classes is 0. */
3535 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3536 /* Use realloc since array is NULL if *alloc == 0. */
3537 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3538 new_char_class_alloc
);
3539 if (BE (new_char_classes
== NULL
, 0))
3541 mbcset
->char_classes
= new_char_classes
;
3542 *char_class_alloc
= new_char_class_alloc
;
3544 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3545 #endif /* RE_ENABLE_I18N */
3547 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3549 if (BE (trans != NULL, 0)) \
3551 for (i = 0; i < SBC_MAX; ++i) \
3552 if (ctype_func (i)) \
3553 bitset_set (sbcset, trans[i]); \
3557 for (i = 0; i < SBC_MAX; ++i) \
3558 if (ctype_func (i)) \
3559 bitset_set (sbcset, i); \
3563 if (strcmp (name
, "alnum") == 0)
3564 BUILD_CHARCLASS_LOOP (isalnum
);
3565 else if (strcmp (name
, "cntrl") == 0)
3566 BUILD_CHARCLASS_LOOP (iscntrl
);
3567 else if (strcmp (name
, "lower") == 0)
3568 BUILD_CHARCLASS_LOOP (islower
);
3569 else if (strcmp (name
, "space") == 0)
3570 BUILD_CHARCLASS_LOOP (isspace
);
3571 else if (strcmp (name
, "alpha") == 0)
3572 BUILD_CHARCLASS_LOOP (isalpha
);
3573 else if (strcmp (name
, "digit") == 0)
3574 BUILD_CHARCLASS_LOOP (isdigit
);
3575 else if (strcmp (name
, "print") == 0)
3576 BUILD_CHARCLASS_LOOP (isprint
);
3577 else if (strcmp (name
, "upper") == 0)
3578 BUILD_CHARCLASS_LOOP (isupper
);
3579 else if (strcmp (name
, "blank") == 0)
3580 BUILD_CHARCLASS_LOOP (isblank
);
3581 else if (strcmp (name
, "graph") == 0)
3582 BUILD_CHARCLASS_LOOP (isgraph
);
3583 else if (strcmp (name
, "punct") == 0)
3584 BUILD_CHARCLASS_LOOP (ispunct
);
3585 else if (strcmp (name
, "xdigit") == 0)
3586 BUILD_CHARCLASS_LOOP (isxdigit
);
3594 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3595 const unsigned char *class_name
,
3596 const unsigned char *extra
, int non_match
,
3599 re_bitset_ptr_t sbcset
;
3600 #ifdef RE_ENABLE_I18N
3601 re_charset_t
*mbcset
;
3603 #endif /* not RE_ENABLE_I18N */
3605 re_token_t br_token
;
3608 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3609 #ifdef RE_ENABLE_I18N
3610 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3611 #endif /* RE_ENABLE_I18N */
3613 #ifdef RE_ENABLE_I18N
3614 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3615 #else /* not RE_ENABLE_I18N */
3616 if (BE (sbcset
== NULL
, 0))
3617 #endif /* not RE_ENABLE_I18N */
3625 #ifdef RE_ENABLE_I18N
3626 mbcset
->non_match
= 1;
3627 #endif /* not RE_ENABLE_I18N */
3630 /* We don't care the syntax in this case. */
3631 ret
= build_charclass (trans
, sbcset
,
3632 #ifdef RE_ENABLE_I18N
3634 #endif /* RE_ENABLE_I18N */
3637 if (BE (ret
!= REG_NOERROR
, 0))
3640 #ifdef RE_ENABLE_I18N
3641 free_charset (mbcset
);
3642 #endif /* RE_ENABLE_I18N */
3646 /* \w match '_' also. */
3647 for (; *extra
; extra
++)
3648 bitset_set (sbcset
, *extra
);
3650 /* If it is non-matching list. */
3652 bitset_not (sbcset
);
3654 #ifdef RE_ENABLE_I18N
3655 /* Ensure only single byte characters are set. */
3656 if (dfa
->mb_cur_max
> 1)
3657 bitset_mask (sbcset
, dfa
->sb_char
);
3660 /* Build a tree for simple bracket. */
3661 br_token
.type
= SIMPLE_BRACKET
;
3662 br_token
.opr
.sbcset
= sbcset
;
3663 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3664 if (BE (tree
== NULL
, 0))
3665 goto build_word_op_espace
;
3667 #ifdef RE_ENABLE_I18N
3668 if (dfa
->mb_cur_max
> 1)
3670 bin_tree_t
*mbc_tree
;
3671 /* Build a tree for complex bracket. */
3672 br_token
.type
= COMPLEX_BRACKET
;
3673 br_token
.opr
.mbcset
= mbcset
;
3674 dfa
->has_mb_node
= 1;
3675 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3676 if (BE (mbc_tree
== NULL
, 0))
3677 goto build_word_op_espace
;
3678 /* Then join them by ALT node. */
3679 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3680 if (BE (mbc_tree
!= NULL
, 1))
3685 free_charset (mbcset
);
3688 #else /* not RE_ENABLE_I18N */
3690 #endif /* not RE_ENABLE_I18N */
3692 build_word_op_espace
:
3694 #ifdef RE_ENABLE_I18N
3695 free_charset (mbcset
);
3696 #endif /* RE_ENABLE_I18N */
3701 /* This is intended for the expressions like "a{1,3}".
3702 Fetch a number from `input', and return the number.
3703 Return -1, if the number field is empty like "{,1}".
3704 Return -2, If an error is occured. */
3707 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3713 fetch_token (token
, input
, syntax
);
3715 if (BE (token
->type
== END_OF_RE
, 0))
3717 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3719 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3720 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3721 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3726 #ifdef RE_ENABLE_I18N
3728 free_charset (re_charset_t
*cset
)
3730 re_free (cset
->mbchars
);
3732 re_free (cset
->coll_syms
);
3733 re_free (cset
->equiv_classes
);
3734 re_free (cset
->range_starts
);
3735 re_free (cset
->range_ends
);
3737 re_free (cset
->char_classes
);
3740 #endif /* RE_ENABLE_I18N */
3742 /* Functions for binary tree operation. */
3744 /* Create a tree node. */
3747 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3748 re_token_type_t type
)
3752 return create_token_tree (dfa
, left
, right
, &t
);
3756 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3757 const re_token_t
*token
)
3760 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3762 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3764 if (storage
== NULL
)
3766 storage
->next
= dfa
->str_tree_storage
;
3767 dfa
->str_tree_storage
= storage
;
3768 dfa
->str_tree_storage_idx
= 0;
3770 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3772 tree
->parent
= NULL
;
3774 tree
->right
= right
;
3775 tree
->token
= *token
;
3776 tree
->token
.duplicated
= 0;
3777 tree
->token
.opt_subexp
= 0;
3780 tree
->node_idx
= -1;
3783 left
->parent
= tree
;
3785 right
->parent
= tree
;
3789 /* Mark the tree SRC as an optional subexpression.
3790 To be called from preorder or postorder. */
3792 static reg_errcode_t
3793 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3795 int idx
= (int) (long) extra
;
3796 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3797 node
->token
.opt_subexp
= 1;
3802 /* Free the allocated memory inside NODE. */
3805 free_token (re_token_t
*node
)
3807 #ifdef RE_ENABLE_I18N
3808 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3809 free_charset (node
->opr
.mbcset
);
3811 #endif /* RE_ENABLE_I18N */
3812 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3813 re_free (node
->opr
.sbcset
);
3816 /* Worker function for tree walking. Free the allocated memory inside NODE
3817 and its children. */
3819 static reg_errcode_t
3820 free_tree (void *extra
, bin_tree_t
*node
)
3822 free_token (&node
->token
);
3827 /* Duplicate the node SRC, and return new node. This is a preorder
3828 visit similar to the one implemented by the generic visitor, but
3829 we need more infrastructure to maintain two parallel trees --- so,
3830 it's easier to duplicate. */
3833 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3835 const bin_tree_t
*node
;
3836 bin_tree_t
*dup_root
;
3837 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3839 for (node
= root
; ; )
3841 /* Create a new tree and link it back to the current parent. */
3842 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3845 (*p_new
)->parent
= dup_node
;
3846 (*p_new
)->token
.duplicated
= 1;
3849 /* Go to the left node, or up and to the right. */
3853 p_new
= &dup_node
->left
;
3857 const bin_tree_t
*prev
= NULL
;
3858 while (node
->right
== prev
|| node
->right
== NULL
)
3861 node
= node
->parent
;
3862 dup_node
= dup_node
->parent
;
3867 p_new
= &dup_node
->right
;