1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
21 size_t length
, reg_syntax_t syntax
);
22 static void re_compile_fastmap_iter (regex_t
*bufp
,
23 const re_dfastate_t
*init_state
,
25 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void free_charset (re_charset_t
*cset
);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t
*preg
);
30 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
32 static void optimize_utf8 (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze (regex_t
*preg
);
35 static reg_errcode_t
preorder (bin_tree_t
*root
,
36 reg_errcode_t (fn (void *, bin_tree_t
*)),
38 static reg_errcode_t
postorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
42 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
43 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
45 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
48 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
49 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
50 unsigned int constraint
);
51 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
52 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
54 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
55 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
57 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 reg_syntax_t syntax
) internal_function
;
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 int nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 int nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
89 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
91 int *equiv_class_alloc
,
92 const unsigned char *name
);
93 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
96 int *char_class_alloc
,
97 const unsigned char *class_name
,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
101 const unsigned char *name
);
102 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
104 const unsigned char *class_name
,
105 reg_syntax_t syntax
);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
108 RE_TRANSLATE_TYPE trans
,
109 const unsigned char *class_name
,
110 const unsigned char *extra
,
111 int non_match
, reg_errcode_t
*err
);
112 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
113 bin_tree_t
*left
, bin_tree_t
*right
,
114 re_token_type_t type
);
115 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
116 bin_tree_t
*left
, bin_tree_t
*right
,
117 const re_token_t
*token
);
118 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
119 static void free_token (re_token_t
*node
);
120 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
121 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid
[] attribute_hidden
=
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx
[] attribute_hidden
=
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (pattern
, length
, bufp
)
216 struct re_pattern_buffer
*bufp
;
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
225 /* Match anchors at newline. */
226 bufp
->newline_anchor
= 1;
228 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
232 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
235 weak_alias (__re_compile_pattern
, re_compile_pattern
)
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options
;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
254 re_set_syntax (syntax
)
257 reg_syntax_t ret
= re_syntax_options
;
259 re_syntax_options
= syntax
;
263 weak_alias (__re_set_syntax
, re_set_syntax
)
267 re_compile_fastmap (bufp
)
268 struct re_pattern_buffer
*bufp
;
270 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
271 char *fastmap
= bufp
->fastmap
;
273 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
274 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_word
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_nl
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
281 bufp
->fastmap_accurate
= 1;
285 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
289 __attribute ((always_inline
))
290 re_set_fastmap (char *fastmap
, int icase
, int ch
)
294 fastmap
[tolower (ch
)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
304 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
306 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
307 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
309 int node
= init_state
->nodes
.elems
[node_cnt
];
310 re_token_type_t type
= dfa
->nodes
[node
].type
;
312 if (type
== CHARACTER
)
314 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
318 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
323 *p
++ = dfa
->nodes
[node
].opr
.c
;
324 while (++node
< dfa
->nodes_len
325 && dfa
->nodes
[node
].type
== CHARACTER
326 && dfa
->nodes
[node
].mb_partial
)
327 *p
++ = dfa
->nodes
[node
].opr
.c
;
328 memset (&state
, '\0', sizeof (state
));
329 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
331 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
333 re_set_fastmap (fastmap
, 0, buf
[0]);
337 else if (type
== SIMPLE_BRACKET
)
340 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
343 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
344 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
345 if (w
& ((bitset_word_t
) 1 << j
))
346 re_set_fastmap (fastmap
, icase
, ch
);
349 #ifdef RE_ENABLE_I18N
350 else if (type
== COMPLEX_BRACKET
)
352 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
356 /* See if we have to try all bytes which start multiple collation
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
363 && (cset
->ncoll_syms
|| cset
->nranges
))
365 const int32_t *table
= (const int32_t *)
366 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
367 for (i
= 0; i
< SBC_MAX
; ++i
)
369 re_set_fastmap (fastmap
, icase
, i
);
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa
->mb_cur_max
> 1
378 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
380 || cset
->nequiv_classes
388 memset (&mbs
, 0, sizeof (mbs
));
389 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
390 re_set_fastmap (fastmap
, false, (int) c
);
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i
= 0; i
< cset
->nmbchars
; ++i
)
402 memset (&state
, '\0', sizeof (state
));
403 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
404 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
405 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
407 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
409 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
414 #endif /* RE_ENABLE_I18N */
415 else if (type
== OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type
== OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type
== END_OF_RE
)
421 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
422 if (type
== END_OF_RE
)
423 bufp
->can_be_null
= 1;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (preg
, pattern
, cflags
)
467 regex_t
*__restrict preg
;
468 const char *__restrict pattern
;
472 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
473 : RE_SYNTAX_POSIX_BASIC
);
479 /* Try to allocate space for the fastmap. */
480 preg
->fastmap
= re_malloc (char, SBC_MAX
);
481 if (BE (preg
->fastmap
== NULL
, 0))
484 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags
& REG_NEWLINE
)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax
&= ~RE_DOT_NEWLINE
;
490 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
491 /* It also changes the matching behavior. */
492 preg
->newline_anchor
= 1;
495 preg
->newline_anchor
= 0;
496 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
497 preg
->translate
= NULL
;
499 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret
== REG_ERPAREN
)
506 /* We have already checked preg->fastmap != NULL. */
507 if (BE (ret
== REG_NOERROR
, 1))
508 /* Compute the fastmap now, since regexec cannot modify the pattern
509 buffer. This function never fails in this implementation. */
510 (void) re_compile_fastmap (preg
);
513 /* Some error occurred while compiling the expression. */
514 re_free (preg
->fastmap
);
515 preg
->fastmap
= NULL
;
521 weak_alias (__regcomp
, regcomp
)
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
528 regerror (errcode
, preg
, errbuf
, errbuf_size
)
530 const regex_t
*__restrict preg
;
531 char *__restrict errbuf
;
538 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
539 / sizeof (__re_error_msgid_idx
[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
546 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
548 msg_size
= strlen (msg
) + 1; /* Includes the null. */
550 if (BE (errbuf_size
!= 0, 1))
552 if (BE (msg_size
> errbuf_size
, 0))
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
557 memcpy (errbuf
, msg
, errbuf_size
- 1);
558 errbuf
[errbuf_size
- 1] = 0;
562 memcpy (errbuf
, msg
, msg_size
);
568 weak_alias (__regerror
, regerror
)
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
577 static const bitset_t utf8_sb_map
=
579 /* Set the first 128 bits. */
580 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
586 free_dfa_content (re_dfa_t
*dfa
)
591 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
592 free_token (dfa
->nodes
+ i
);
593 re_free (dfa
->nexts
);
594 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
596 if (dfa
->eclosures
!= NULL
)
597 re_node_set_free (dfa
->eclosures
+ i
);
598 if (dfa
->inveclosures
!= NULL
)
599 re_node_set_free (dfa
->inveclosures
+ i
);
600 if (dfa
->edests
!= NULL
)
601 re_node_set_free (dfa
->edests
+ i
);
603 re_free (dfa
->edests
);
604 re_free (dfa
->eclosures
);
605 re_free (dfa
->inveclosures
);
606 re_free (dfa
->nodes
);
608 if (dfa
->state_table
)
609 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
611 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
612 for (j
= 0; j
< entry
->num
; ++j
)
614 re_dfastate_t
*state
= entry
->array
[j
];
617 re_free (entry
->array
);
619 re_free (dfa
->state_table
);
620 #ifdef RE_ENABLE_I18N
621 if (dfa
->sb_char
!= utf8_sb_map
)
622 re_free (dfa
->sb_char
);
624 re_free (dfa
->subexp_map
);
626 re_free (dfa
->re_str
);
633 /* Free dynamically allocated space used by PREG. */
639 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
640 if (BE (dfa
!= NULL
, 1))
641 free_dfa_content (dfa
);
645 re_free (preg
->fastmap
);
646 preg
->fastmap
= NULL
;
648 re_free (preg
->translate
);
649 preg
->translate
= NULL
;
652 weak_alias (__regfree
, regfree
)
655 /* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
658 #if defined _REGEX_RE_COMP || defined _LIBC
660 /* BSD has one and only one pattern buffer. */
661 static struct re_pattern_buffer re_comp_buf
;
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
678 if (!re_comp_buf
.buffer
)
679 return gettext ("No previous regular expression");
683 if (re_comp_buf
.buffer
)
685 fastmap
= re_comp_buf
.fastmap
;
686 re_comp_buf
.fastmap
= NULL
;
687 __regfree (&re_comp_buf
);
688 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
689 re_comp_buf
.fastmap
= fastmap
;
692 if (re_comp_buf
.fastmap
== NULL
)
694 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
695 if (re_comp_buf
.fastmap
== NULL
)
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
703 /* Match anchors at newlines. */
704 re_comp_buf
.newline_anchor
= 1;
706 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
712 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
716 libc_freeres_fn (free_mem
)
718 __regfree (&re_comp_buf
);
722 #endif /* _REGEX_RE_COMP */
724 /* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
729 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
732 reg_errcode_t err
= REG_NOERROR
;
736 /* Initialize the pattern buffer. */
737 preg
->fastmap_accurate
= 0;
738 preg
->syntax
= syntax
;
739 preg
->not_bol
= preg
->not_eol
= 0;
742 preg
->can_be_null
= 0;
743 preg
->regs_allocated
= REGS_UNALLOCATED
;
745 /* Initialize the dfa. */
746 dfa
= (re_dfa_t
*) preg
->buffer
;
747 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
756 preg
->allocated
= sizeof (re_dfa_t
);
757 preg
->buffer
= (unsigned char *) dfa
;
759 preg
->used
= sizeof (re_dfa_t
);
761 err
= init_dfa (dfa
, length
);
762 if (BE (err
!= REG_NOERROR
, 0))
764 free_dfa_content (dfa
);
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa
->re_str
= re_malloc (char, length
+ 1);
772 strncpy (dfa
->re_str
, pattern
, length
+ 1);
775 __libc_lock_init (dfa
->lock
);
777 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
778 syntax
& RE_ICASE
, dfa
);
779 if (BE (err
!= REG_NOERROR
, 0))
781 re_compile_internal_free_return
:
782 free_workarea_compile (preg
);
783 re_string_destruct (®exp
);
784 free_dfa_content (dfa
);
790 /* Parse the regular expression, and build a structure tree. */
792 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
793 if (BE (dfa
->str_tree
== NULL
, 0))
794 goto re_compile_internal_free_return
;
796 /* Analyze the tree and create the nfa. */
797 err
= analyze (preg
);
798 if (BE (err
!= REG_NOERROR
, 0))
799 goto re_compile_internal_free_return
;
801 #ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
803 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
807 /* Then create the initial state of the dfa. */
808 err
= create_initial_state (dfa
);
810 /* Release work areas. */
811 free_workarea_compile (preg
);
812 re_string_destruct (®exp
);
814 if (BE (err
!= REG_NOERROR
, 0))
816 free_dfa_content (dfa
);
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
828 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
830 unsigned int table_size
;
835 memset (dfa
, '\0', sizeof (re_dfa_t
));
837 /* Force allocation of str_tree_storage the first time. */
838 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
840 /* Avoid overflows. */
841 if (pat_len
== SIZE_MAX
)
844 dfa
->nodes_alloc
= pat_len
+ 1;
845 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
847 /* table_size = 2 ^ ceil(log pat_len) */
848 for (table_size
= 1; ; table_size
<<= 1)
849 if (table_size
> pat_len
)
852 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
853 dfa
->state_hash_mask
= table_size
- 1;
855 dfa
->mb_cur_max
= MB_CUR_MAX
;
857 if (dfa
->mb_cur_max
== 6
858 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
860 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
863 # ifdef HAVE_LANGINFO_CODESET
864 codeset_name
= nl_langinfo (CODESET
);
866 codeset_name
= getenv ("LC_ALL");
867 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
868 codeset_name
= getenv ("LC_CTYPE");
869 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
870 codeset_name
= getenv ("LANG");
871 if (codeset_name
== NULL
)
873 else if (strchr (codeset_name
, '.') != NULL
)
874 codeset_name
= strchr (codeset_name
, '.') + 1;
877 if (strcasecmp (codeset_name
, "UTF-8") == 0
878 || strcasecmp (codeset_name
, "UTF8") == 0)
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa
->map_notascii
= 0;
886 #ifdef RE_ENABLE_I18N
887 if (dfa
->mb_cur_max
> 1)
890 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
895 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
896 if (BE (dfa
->sb_char
== NULL
, 0))
899 /* Set the bits corresponding to single byte chars. */
900 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
901 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
903 wint_t wch
= __btowc (ch
);
905 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
907 if (isascii (ch
) && wch
!= ch
)
908 dfa
->map_notascii
= 1;
915 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
920 /* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
926 init_word_char (re_dfa_t
*dfa
)
928 dfa
->word_ops_used
= 1;
931 if (BE (dfa
->map_notascii
== 0, 1))
933 if (sizeof (dfa
->word_char
[0]) == 8)
935 dfa
->word_char
[0] = UINT64_C (0x03ff000000000000);
936 dfa
->word_char
[1] = UINT64_C (0x07fffffe87fffffe);
939 else if (sizeof (dfa
->word_char
[0]) == 4)
941 dfa
->word_char
[0] = UINT32_C (0x00000000);
942 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
943 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
944 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
951 if (BE (dfa
->is_utf8
, 1))
953 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
958 for (; i
< BITSET_WORDS
; ++i
)
959 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
960 if (isalnum (ch
) || ch
== '_')
961 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
964 /* Free the work area which are only used while compiling. */
967 free_workarea_compile (regex_t
*preg
)
969 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
970 bin_tree_storage_t
*storage
, *next
;
971 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
973 next
= storage
->next
;
976 dfa
->str_tree_storage
= NULL
;
977 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
978 dfa
->str_tree
= NULL
;
979 re_free (dfa
->org_indices
);
980 dfa
->org_indices
= NULL
;
983 /* Create initial states for all contexts. */
986 create_initial_state (re_dfa_t
*dfa
)
990 re_node_set init_nodes
;
992 /* Initial states have the epsilon closure of the node which is
993 the first node of the regular expression. */
994 first
= dfa
->str_tree
->first
->node_idx
;
995 dfa
->init_node
= first
;
996 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
997 if (BE (err
!= REG_NOERROR
, 0))
1000 /* The back-references which are in initial states can epsilon transit,
1001 since in this case all of the subexpressions can be null.
1002 Then we add epsilon closures of the nodes which are the next nodes of
1003 the back-references. */
1004 if (dfa
->nbackref
> 0)
1005 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1007 int node_idx
= init_nodes
.elems
[i
];
1008 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1011 if (type
!= OP_BACK_REF
)
1013 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1015 re_token_t
*clexp_node
;
1016 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1017 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1018 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1021 if (clexp_idx
== init_nodes
.nelem
)
1024 if (type
== OP_BACK_REF
)
1026 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1027 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1029 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1032 if (err
!= REG_NOERROR
)
1039 /* It must be the first time to invoke acquire_state. */
1040 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1041 /* We don't check ERR here, since the initial state must not be NULL. */
1042 if (BE (dfa
->init_state
== NULL
, 0))
1044 if (dfa
->init_state
->has_constraint
)
1046 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1048 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1050 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1054 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1055 || dfa
->init_state_begbuf
== NULL
, 0))
1059 dfa
->init_state_word
= dfa
->init_state_nl
1060 = dfa
->init_state_begbuf
= dfa
->init_state
;
1062 re_node_set_free (&init_nodes
);
1066 #ifdef RE_ENABLE_I18N
1067 /* If it is possible to do searching in single byte encoding instead of UTF-8
1068 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1069 DFA nodes where needed. */
1072 optimize_utf8 (re_dfa_t
*dfa
)
1074 int node
, i
, mb_chars
= 0, has_period
= 0;
1076 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1077 switch (dfa
->nodes
[node
].type
)
1080 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1084 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1092 /* Word anchors etc. cannot be handled. It's okay to test
1093 opr.ctx_type since constraints (for all DFA nodes) are
1094 created by ORing one or more opr.ctx_type values. */
1104 case OP_DUP_ASTERISK
:
1105 case OP_OPEN_SUBEXP
:
1106 case OP_CLOSE_SUBEXP
:
1108 case COMPLEX_BRACKET
:
1110 case SIMPLE_BRACKET
:
1111 /* Just double check. The non-ASCII range starts at 0x80. */
1112 assert (0x80 % BITSET_WORD_BITS
== 0);
1113 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1114 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1121 if (mb_chars
|| has_period
)
1122 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1124 if (dfa
->nodes
[node
].type
== CHARACTER
1125 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1126 dfa
->nodes
[node
].mb_partial
= 0;
1127 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1128 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1131 /* The search can be in single byte locale. */
1132 dfa
->mb_cur_max
= 1;
1134 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1138 /* Analyze the structure tree, and calculate "first", "next", "edest",
1139 "eclosure", and "inveclosure". */
1141 static reg_errcode_t
1142 analyze (regex_t
*preg
)
1144 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1147 /* Allocate arrays. */
1148 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1149 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1150 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1151 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1152 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1153 || dfa
->eclosures
== NULL
, 0))
1156 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1157 if (dfa
->subexp_map
!= NULL
)
1160 for (i
= 0; i
< preg
->re_nsub
; i
++)
1161 dfa
->subexp_map
[i
] = i
;
1162 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1163 for (i
= 0; i
< preg
->re_nsub
; i
++)
1164 if (dfa
->subexp_map
[i
] != i
)
1166 if (i
== preg
->re_nsub
)
1168 free (dfa
->subexp_map
);
1169 dfa
->subexp_map
= NULL
;
1173 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1174 if (BE (ret
!= REG_NOERROR
, 0))
1176 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1177 if (BE (ret
!= REG_NOERROR
, 0))
1179 preorder (dfa
->str_tree
, calc_next
, dfa
);
1180 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1181 if (BE (ret
!= REG_NOERROR
, 0))
1183 ret
= calc_eclosure (dfa
);
1184 if (BE (ret
!= REG_NOERROR
, 0))
1187 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1188 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1189 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1192 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1193 if (BE (dfa
->inveclosures
== NULL
, 0))
1195 ret
= calc_inveclosure (dfa
);
1201 /* Our parse trees are very unbalanced, so we cannot use a stack to
1202 implement parse tree visits. Instead, we use parent pointers and
1203 some hairy code in these two functions. */
1204 static reg_errcode_t
1205 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1208 bin_tree_t
*node
, *prev
;
1210 for (node
= root
; ; )
1212 /* Descend down the tree, preferably to the left (or to the right
1213 if that's the only child). */
1214 while (node
->left
|| node
->right
)
1222 reg_errcode_t err
= fn (extra
, node
);
1223 if (BE (err
!= REG_NOERROR
, 0))
1225 if (node
->parent
== NULL
)
1228 node
= node
->parent
;
1230 /* Go up while we have a node that is reached from the right. */
1231 while (node
->right
== prev
|| node
->right
== NULL
);
1236 static reg_errcode_t
1237 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1242 for (node
= root
; ; )
1244 reg_errcode_t err
= fn (extra
, node
);
1245 if (BE (err
!= REG_NOERROR
, 0))
1248 /* Go to the left node, or up and to the right. */
1253 bin_tree_t
*prev
= NULL
;
1254 while (node
->right
== prev
|| node
->right
== NULL
)
1257 node
= node
->parent
;
1266 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1267 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1268 backreferences as well. Requires a preorder visit. */
1269 static reg_errcode_t
1270 optimize_subexps (void *extra
, bin_tree_t
*node
)
1272 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1274 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1276 int idx
= node
->token
.opr
.idx
;
1277 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1278 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1281 else if (node
->token
.type
== SUBEXP
1282 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1284 int other_idx
= node
->left
->token
.opr
.idx
;
1286 node
->left
= node
->left
->left
;
1288 node
->left
->parent
= node
;
1290 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1291 if (other_idx
< BITSET_WORD_BITS
)
1292 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1298 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1300 static reg_errcode_t
1301 lower_subexps (void *extra
, bin_tree_t
*node
)
1303 regex_t
*preg
= (regex_t
*) extra
;
1304 reg_errcode_t err
= REG_NOERROR
;
1306 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1308 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1310 node
->left
->parent
= node
;
1312 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1314 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1316 node
->right
->parent
= node
;
1323 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1325 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1326 bin_tree_t
*body
= node
->left
;
1327 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1330 /* We do not optimize empty subexpressions, because otherwise we may
1331 have bad CONCAT nodes with NULL children. This is obviously not
1332 very common, so we do not lose much. An example that triggers
1333 this case is the sed "script" /\(\)/x. */
1334 && node
->left
!= NULL
1335 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1336 || !(dfa
->used_bkref_map
1337 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1340 /* Convert the SUBEXP node to the concatenation of an
1341 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1342 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1343 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1344 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1345 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1346 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1352 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1353 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1357 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1358 nodes. Requires a postorder visit. */
1359 static reg_errcode_t
1360 calc_first (void *extra
, bin_tree_t
*node
)
1362 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1363 if (node
->token
.type
== CONCAT
)
1365 node
->first
= node
->left
->first
;
1366 node
->node_idx
= node
->left
->node_idx
;
1371 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1372 if (BE (node
->node_idx
== -1, 0))
1374 if (node
->token
.type
== ANCHOR
)
1375 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1380 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1381 static reg_errcode_t
1382 calc_next (void *extra
, bin_tree_t
*node
)
1384 switch (node
->token
.type
)
1386 case OP_DUP_ASTERISK
:
1387 node
->left
->next
= node
;
1390 node
->left
->next
= node
->right
->first
;
1391 node
->right
->next
= node
->next
;
1395 node
->left
->next
= node
->next
;
1397 node
->right
->next
= node
->next
;
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404 static reg_errcode_t
1405 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1407 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1408 int idx
= node
->node_idx
;
1409 reg_errcode_t err
= REG_NOERROR
;
1411 switch (node
->token
.type
)
1417 assert (node
->next
== NULL
);
1420 case OP_DUP_ASTERISK
:
1424 dfa
->has_plural_match
= 1;
1425 if (node
->left
!= NULL
)
1426 left
= node
->left
->first
->node_idx
;
1428 left
= node
->next
->node_idx
;
1429 if (node
->right
!= NULL
)
1430 right
= node
->right
->first
->node_idx
;
1432 right
= node
->next
->node_idx
;
1434 assert (right
> -1);
1435 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1440 case OP_OPEN_SUBEXP
:
1441 case OP_CLOSE_SUBEXP
:
1442 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1446 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1447 if (node
->token
.type
== OP_BACK_REF
)
1448 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1452 assert (!IS_EPSILON_NODE (node
->token
.type
));
1453 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462 to their own constraint. */
1464 static reg_errcode_t
1466 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1467 int root_node
, unsigned int init_constraint
)
1469 int org_node
, clone_node
, ret
;
1470 unsigned int constraint
= init_constraint
;
1471 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1473 int org_dest
, clone_dest
;
1474 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1476 /* If the back reference epsilon-transit, its destination must
1477 also have the constraint. Then duplicate the epsilon closure
1478 of the destination of the back reference, and store it in
1479 edests of the back reference. */
1480 org_dest
= dfa
->nexts
[org_node
];
1481 re_node_set_empty (dfa
->edests
+ clone_node
);
1482 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1483 if (BE (clone_dest
== -1, 0))
1485 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1486 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1487 if (BE (ret
< 0, 0))
1490 else if (dfa
->edests
[org_node
].nelem
== 0)
1492 /* In case of the node can't epsilon-transit, don't duplicate the
1493 destination and store the original destination as the
1494 destination of the node. */
1495 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1498 else if (dfa
->edests
[org_node
].nelem
== 1)
1500 /* In case of the node can epsilon-transit, and it has only one
1502 org_dest
= dfa
->edests
[org_node
].elems
[0];
1503 re_node_set_empty (dfa
->edests
+ clone_node
);
1504 /* If the node is root_node itself, it means the epsilon clsoure
1505 has a loop. Then tie it to the destination of the root_node. */
1506 if (org_node
== root_node
&& clone_node
!= org_node
)
1508 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1509 if (BE (ret
< 0, 0))
1513 /* In case of the node has another constraint, add it. */
1514 constraint
|= dfa
->nodes
[org_node
].constraint
;
1515 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1516 if (BE (clone_dest
== -1, 0))
1518 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1519 if (BE (ret
< 0, 0))
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1526 org_dest
= dfa
->edests
[org_node
].elems
[0];
1527 re_node_set_empty (dfa
->edests
+ clone_node
);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1530 if (clone_dest
== -1)
1532 /* There is no such duplicated node, create a new one. */
1534 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1535 if (BE (clone_dest
== -1, 0))
1537 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1538 if (BE (ret
< 0, 0))
1540 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1541 root_node
, constraint
);
1542 if (BE (err
!= REG_NOERROR
, 0))
1547 /* There is a duplicated node which satisfies the constraint,
1548 use it to avoid infinite loop. */
1549 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1550 if (BE (ret
< 0, 0))
1554 org_dest
= dfa
->edests
[org_node
].elems
[1];
1555 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1556 if (BE (clone_dest
== -1, 0))
1558 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1559 if (BE (ret
< 0, 0))
1562 org_node
= org_dest
;
1563 clone_node
= clone_dest
;
1568 /* Search for a node which is duplicated from the node ORG_NODE, and
1569 satisfies the constraint CONSTRAINT. */
1572 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1573 unsigned int constraint
)
1576 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1578 if (org_node
== dfa
->org_indices
[idx
]
1579 && constraint
== dfa
->nodes
[idx
].constraint
)
1580 return idx
; /* Found. */
1582 return -1; /* Not found. */
1585 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1586 Return the index of the new node, or -1 if insufficient storage is
1590 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1592 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1593 if (BE (dup_idx
!= -1, 1))
1595 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1596 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1597 dfa
->nodes
[dup_idx
].duplicated
= 1;
1599 /* Store the index of the original node. */
1600 dfa
->org_indices
[dup_idx
] = org_idx
;
1605 static reg_errcode_t
1606 calc_inveclosure (re_dfa_t
*dfa
)
1609 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1610 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1612 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1614 int *elems
= dfa
->eclosures
[src
].elems
;
1615 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1617 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1618 if (BE (ret
== -1, 0))
1626 /* Calculate "eclosure" for all the node in DFA. */
1628 static reg_errcode_t
1629 calc_eclosure (re_dfa_t
*dfa
)
1631 int node_idx
, incomplete
;
1633 assert (dfa
->nodes_len
> 0);
1636 /* For each nodes, calculate epsilon closure. */
1637 for (node_idx
= 0; ; ++node_idx
)
1640 re_node_set eclosure_elem
;
1641 if (node_idx
== dfa
->nodes_len
)
1650 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1653 /* If we have already calculated, skip it. */
1654 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1656 /* Calculate epsilon closure of `node_idx'. */
1657 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1658 if (BE (err
!= REG_NOERROR
, 0))
1661 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1664 re_node_set_free (&eclosure_elem
);
1670 /* Calculate epsilon closure of NODE. */
1672 static reg_errcode_t
1673 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1677 re_node_set eclosure
;
1680 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1681 if (BE (err
!= REG_NOERROR
, 0))
1684 /* This indicates that we are calculating this node now.
1685 We reference this value to avoid infinite loop. */
1686 dfa
->eclosures
[node
].nelem
= -1;
1688 /* If the current node has constraints, duplicate all nodes
1689 since they must inherit the constraints. */
1690 if (dfa
->nodes
[node
].constraint
1691 && dfa
->edests
[node
].nelem
1692 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1694 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1695 dfa
->nodes
[node
].constraint
);
1696 if (BE (err
!= REG_NOERROR
, 0))
1700 /* Expand each epsilon destination nodes. */
1701 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1702 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1704 re_node_set eclosure_elem
;
1705 int edest
= dfa
->edests
[node
].elems
[i
];
1706 /* If calculating the epsilon closure of `edest' is in progress,
1707 return intermediate result. */
1708 if (dfa
->eclosures
[edest
].nelem
== -1)
1713 /* If we haven't calculated the epsilon closure of `edest' yet,
1714 calculate now. Otherwise use calculated epsilon closure. */
1715 if (dfa
->eclosures
[edest
].nelem
== 0)
1717 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1718 if (BE (err
!= REG_NOERROR
, 0))
1722 eclosure_elem
= dfa
->eclosures
[edest
];
1723 /* Merge the epsilon closure of `edest'. */
1724 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1725 if (BE (err
!= REG_NOERROR
, 0))
1727 /* If the epsilon closure of `edest' is incomplete,
1728 the epsilon closure of this node is also incomplete. */
1729 if (dfa
->eclosures
[edest
].nelem
== 0)
1732 re_node_set_free (&eclosure_elem
);
1736 /* An epsilon closure includes itself. */
1737 ret
= re_node_set_insert (&eclosure
, node
);
1738 if (BE (ret
< 0, 0))
1740 if (incomplete
&& !root
)
1741 dfa
->eclosures
[node
].nelem
= 0;
1743 dfa
->eclosures
[node
] = eclosure
;
1744 *new_set
= eclosure
;
1748 /* Functions for token which are used in the parser. */
1750 /* Fetch a token from INPUT.
1751 We must not use this function inside bracket expressions. */
1755 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1757 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1760 /* Peek a token from INPUT, and return the length of the token.
1761 We must not use this function inside bracket expressions. */
1765 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1769 if (re_string_eoi (input
))
1771 token
->type
= END_OF_RE
;
1775 c
= re_string_peek_byte (input
, 0);
1778 token
->word_char
= 0;
1779 #ifdef RE_ENABLE_I18N
1780 token
->mb_partial
= 0;
1781 if (input
->mb_cur_max
> 1 &&
1782 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1784 token
->type
= CHARACTER
;
1785 token
->mb_partial
= 1;
1792 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1794 token
->type
= BACK_SLASH
;
1798 c2
= re_string_peek_byte_case (input
, 1);
1800 token
->type
= CHARACTER
;
1801 #ifdef RE_ENABLE_I18N
1802 if (input
->mb_cur_max
> 1)
1804 wint_t wc
= re_string_wchar_at (input
,
1805 re_string_cur_idx (input
) + 1);
1806 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1810 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1815 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1816 token
->type
= OP_ALT
;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax
& RE_NO_BK_REFS
))
1822 token
->type
= OP_BACK_REF
;
1823 token
->opr
.idx
= c2
- '1';
1827 if (!(syntax
& RE_NO_GNU_OPS
))
1829 token
->type
= ANCHOR
;
1830 token
->opr
.ctx_type
= WORD_FIRST
;
1834 if (!(syntax
& RE_NO_GNU_OPS
))
1836 token
->type
= ANCHOR
;
1837 token
->opr
.ctx_type
= WORD_LAST
;
1841 if (!(syntax
& RE_NO_GNU_OPS
))
1843 token
->type
= ANCHOR
;
1844 token
->opr
.ctx_type
= WORD_DELIM
;
1848 if (!(syntax
& RE_NO_GNU_OPS
))
1850 token
->type
= ANCHOR
;
1851 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1855 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= OP_WORD
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= OP_NOTWORD
;
1863 if (!(syntax
& RE_NO_GNU_OPS
))
1864 token
->type
= OP_SPACE
;
1867 if (!(syntax
& RE_NO_GNU_OPS
))
1868 token
->type
= OP_NOTSPACE
;
1871 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= ANCHOR
;
1874 token
->opr
.ctx_type
= BUF_FIRST
;
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1880 token
->type
= ANCHOR
;
1881 token
->opr
.ctx_type
= BUF_LAST
;
1885 if (!(syntax
& RE_NO_BK_PARENS
))
1886 token
->type
= OP_OPEN_SUBEXP
;
1889 if (!(syntax
& RE_NO_BK_PARENS
))
1890 token
->type
= OP_CLOSE_SUBEXP
;
1893 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1894 token
->type
= OP_DUP_PLUS
;
1897 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1898 token
->type
= OP_DUP_QUESTION
;
1901 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1902 token
->type
= OP_OPEN_DUP_NUM
;
1905 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1906 token
->type
= OP_CLOSE_DUP_NUM
;
1914 token
->type
= CHARACTER
;
1915 #ifdef RE_ENABLE_I18N
1916 if (input
->mb_cur_max
> 1)
1918 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1919 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1923 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1928 if (syntax
& RE_NEWLINE_ALT
)
1929 token
->type
= OP_ALT
;
1932 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1933 token
->type
= OP_ALT
;
1936 token
->type
= OP_DUP_ASTERISK
;
1939 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1940 token
->type
= OP_DUP_PLUS
;
1943 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1944 token
->type
= OP_DUP_QUESTION
;
1947 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1948 token
->type
= OP_OPEN_DUP_NUM
;
1951 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1952 token
->type
= OP_CLOSE_DUP_NUM
;
1955 if (syntax
& RE_NO_BK_PARENS
)
1956 token
->type
= OP_OPEN_SUBEXP
;
1959 if (syntax
& RE_NO_BK_PARENS
)
1960 token
->type
= OP_CLOSE_SUBEXP
;
1963 token
->type
= OP_OPEN_BRACKET
;
1966 token
->type
= OP_PERIOD
;
1969 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1970 re_string_cur_idx (input
) != 0)
1972 char prev
= re_string_peek_byte (input
, -1);
1973 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1976 token
->type
= ANCHOR
;
1977 token
->opr
.ctx_type
= LINE_FIRST
;
1980 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1981 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1984 re_string_skip_bytes (input
, 1);
1985 peek_token (&next
, input
, syntax
);
1986 re_string_skip_bytes (input
, -1);
1987 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1990 token
->type
= ANCHOR
;
1991 token
->opr
.ctx_type
= LINE_LAST
;
1999 /* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function out of bracket expressions. */
2004 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2007 if (re_string_eoi (input
))
2009 token
->type
= END_OF_RE
;
2012 c
= re_string_peek_byte (input
, 0);
2015 #ifdef RE_ENABLE_I18N
2016 if (input
->mb_cur_max
> 1 &&
2017 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2019 token
->type
= CHARACTER
;
2022 #endif /* RE_ENABLE_I18N */
2024 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2025 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2027 /* In this case, '\' escape a character. */
2029 re_string_skip_bytes (input
, 1);
2030 c2
= re_string_peek_byte (input
, 0);
2032 token
->type
= CHARACTER
;
2035 if (c
== '[') /* '[' is a special char in a bracket exps. */
2039 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2040 c2
= re_string_peek_byte (input
, 1);
2048 token
->type
= OP_OPEN_COLL_ELEM
;
2051 token
->type
= OP_OPEN_EQUIV_CLASS
;
2054 if (syntax
& RE_CHAR_CLASSES
)
2056 token
->type
= OP_OPEN_CHAR_CLASS
;
2059 /* else fall through. */
2061 token
->type
= CHARACTER
;
2071 token
->type
= OP_CHARSET_RANGE
;
2074 token
->type
= OP_CLOSE_BRACKET
;
2077 token
->type
= OP_NON_MATCH_LIST
;
2080 token
->type
= CHARACTER
;
2085 /* Functions for parser. */
2087 /* Entry point of the parser.
2088 Parse the regular expression REGEXP and return the structure tree.
2089 If an error is occured, ERR is set by error code, and return NULL.
2090 This function build the following tree, from regular expression <reg_exp>:
2096 CAT means concatenation.
2097 EOR means end of regular expression. */
2100 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2103 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2104 bin_tree_t
*tree
, *eor
, *root
;
2105 re_token_t current_token
;
2106 dfa
->syntax
= syntax
;
2107 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2108 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2109 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2111 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2113 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2116 if (BE (eor
== NULL
|| root
== NULL
, 0))
2124 /* This function build the following tree, from regular expression
2125 <branch1>|<branch2>:
2131 ALT means alternative, which represents the operator `|'. */
2134 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2135 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2137 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2138 bin_tree_t
*tree
, *branch
= NULL
;
2139 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2140 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2143 while (token
->type
== OP_ALT
)
2145 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2146 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2147 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2149 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2150 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2155 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2156 if (BE (tree
== NULL
, 0))
2165 /* This function build the following tree, from regular expression
2172 CAT means concatenation. */
2175 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2176 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2178 bin_tree_t
*tree
, *exp
;
2179 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2180 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2181 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2184 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2185 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2187 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2188 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2191 postorder (tree
, free_tree
, NULL
);
2194 if (tree
!= NULL
&& exp
!= NULL
)
2196 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2197 if (newtree
== NULL
)
2199 postorder (exp
, free_tree
, NULL
);
2200 postorder (tree
, free_tree
, NULL
);
2206 else if (tree
== NULL
)
2208 /* Otherwise exp == NULL, we don't need to create new tree. */
2213 /* This function build the following tree, from regular expression a*:
2220 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2221 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2223 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2225 switch (token
->type
)
2228 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2229 if (BE (tree
== NULL
, 0))
2234 #ifdef RE_ENABLE_I18N
2235 if (dfa
->mb_cur_max
> 1)
2237 while (!re_string_eoi (regexp
)
2238 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2240 bin_tree_t
*mbc_remain
;
2241 fetch_token (token
, regexp
, syntax
);
2242 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2243 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2244 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2253 case OP_OPEN_SUBEXP
:
2254 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2255 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2258 case OP_OPEN_BRACKET
:
2259 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2260 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2264 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2269 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2270 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2271 if (BE (tree
== NULL
, 0))
2277 dfa
->has_mb_node
= 1;
2279 case OP_OPEN_DUP_NUM
:
2280 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2286 case OP_DUP_ASTERISK
:
2288 case OP_DUP_QUESTION
:
2289 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2294 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2296 fetch_token (token
, regexp
, syntax
);
2297 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2299 /* else fall through */
2300 case OP_CLOSE_SUBEXP
:
2301 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2302 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2307 /* else fall through */
2308 case OP_CLOSE_DUP_NUM
:
2309 /* We treat it as a normal character. */
2311 /* Then we can these characters as normal characters. */
2312 token
->type
= CHARACTER
;
2313 /* mb_partial and word_char bits should be initialized already
2315 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2316 if (BE (tree
== NULL
, 0))
2323 if ((token
->opr
.ctx_type
2324 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2325 && dfa
->word_ops_used
== 0)
2326 init_word_char (dfa
);
2327 if (token
->opr
.ctx_type
== WORD_DELIM
2328 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2330 bin_tree_t
*tree_first
, *tree_last
;
2331 if (token
->opr
.ctx_type
== WORD_DELIM
)
2333 token
->opr
.ctx_type
= WORD_FIRST
;
2334 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2335 token
->opr
.ctx_type
= WORD_LAST
;
2339 token
->opr
.ctx_type
= INSIDE_WORD
;
2340 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2341 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2343 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2344 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2345 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2353 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2354 if (BE (tree
== NULL
, 0))
2360 /* We must return here, since ANCHORs can't be followed
2361 by repetition operators.
2362 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2363 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2364 fetch_token (token
, regexp
, syntax
);
2367 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2368 if (BE (tree
== NULL
, 0))
2373 if (dfa
->mb_cur_max
> 1)
2374 dfa
->has_mb_node
= 1;
2378 tree
= build_charclass_op (dfa
, regexp
->trans
,
2379 (const unsigned char *) "alnum",
2380 (const unsigned char *) "_",
2381 token
->type
== OP_NOTWORD
, err
);
2382 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2387 tree
= build_charclass_op (dfa
, regexp
->trans
,
2388 (const unsigned char *) "space",
2389 (const unsigned char *) "",
2390 token
->type
== OP_NOTSPACE
, err
);
2391 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2401 /* Must not happen? */
2407 fetch_token (token
, regexp
, syntax
);
2409 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2410 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2412 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2413 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2415 /* In BRE consecutive duplications are not allowed. */
2416 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2417 && (token
->type
== OP_DUP_ASTERISK
2418 || token
->type
== OP_OPEN_DUP_NUM
))
2428 /* This function build the following tree, from regular expression
2436 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2437 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2439 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2442 cur_nsub
= preg
->re_nsub
++;
2444 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2446 /* The subexpression may be a null string. */
2447 if (token
->type
== OP_CLOSE_SUBEXP
)
2451 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2452 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2455 postorder (tree
, free_tree
, NULL
);
2458 if (BE (*err
!= REG_NOERROR
, 0))
2462 if (cur_nsub
<= '9' - '1')
2463 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2465 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2466 if (BE (tree
== NULL
, 0))
2471 tree
->token
.opr
.idx
= cur_nsub
;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2479 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2481 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2482 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2483 re_token_t start_token
= *token
;
2485 if (token
->type
== OP_OPEN_DUP_NUM
)
2488 start
= fetch_number (regexp
, token
, syntax
);
2491 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2492 start
= 0; /* We treat "{,m}" as "{0,m}". */
2495 *err
= REG_BADBR
; /* <re>{} is invalid. */
2499 if (BE (start
!= -2, 1))
2501 /* We treat "{n}" as "{n,n}". */
2502 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2503 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2504 ? fetch_number (regexp
, token
, syntax
) : -2));
2506 if (BE (start
== -2 || end
== -2, 0))
2508 /* Invalid sequence. */
2509 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2511 if (token
->type
== END_OF_RE
)
2519 /* If the syntax bit is set, rollback. */
2520 re_string_set_index (regexp
, start_idx
);
2521 *token
= start_token
;
2522 token
->type
= CHARACTER
;
2523 /* mb_partial and word_char bits should be already initialized by
2528 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2530 /* First number greater than second. */
2537 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2538 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2541 fetch_token (token
, regexp
, syntax
);
2543 if (BE (elem
== NULL
, 0))
2545 if (BE (start
== 0 && end
== 0, 0))
2547 postorder (elem
, free_tree
, NULL
);
2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2552 if (BE (start
> 0, 0))
2555 for (i
= 2; i
<= start
; ++i
)
2557 elem
= duplicate_tree (elem
, dfa
);
2558 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2559 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2560 goto parse_dup_op_espace
;
2566 /* Duplicate ELEM before it is marked optional. */
2567 elem
= duplicate_tree (elem
, dfa
);
2573 if (elem
->token
.type
== SUBEXP
)
2574 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2576 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2577 if (BE (tree
== NULL
, 0))
2578 goto parse_dup_op_espace
;
2580 /* This loop is actually executed only when end != -1,
2581 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2582 already created the start+1-th copy. */
2583 for (i
= start
+ 2; i
<= end
; ++i
)
2585 elem
= duplicate_tree (elem
, dfa
);
2586 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2587 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2588 goto parse_dup_op_espace
;
2590 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2591 if (BE (tree
== NULL
, 0))
2592 goto parse_dup_op_espace
;
2596 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2600 parse_dup_op_espace
:
2605 /* Size of the names for collating symbol/equivalence_class/character_class.
2606 I'm not sure, but maybe enough. */
2607 #define BRACKET_NAME_BUF_SIZE 32
2610 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2611 Build the range expression which starts from START_ELEM, and ends
2612 at END_ELEM. The result are written to MBCSET and SBCSET.
2613 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2614 mbcset->range_ends, is a pointer argument sinse we may
2617 static reg_errcode_t
2619 # ifdef RE_ENABLE_I18N
2620 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2621 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2622 # else /* not RE_ENABLE_I18N */
2623 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2624 bracket_elem_t
*end_elem
)
2625 # endif /* not RE_ENABLE_I18N */
2627 unsigned int start_ch
, end_ch
;
2628 /* Equivalence Classes and Character Classes can't be a range start/end. */
2629 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2630 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2634 /* We can handle no multi character collating elements without libc
2636 if (BE ((start_elem
->type
== COLL_SYM
2637 && strlen ((char *) start_elem
->opr
.name
) > 1)
2638 || (end_elem
->type
== COLL_SYM
2639 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2640 return REG_ECOLLATE
;
2642 # ifdef RE_ENABLE_I18N
2647 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2649 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2650 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2652 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2653 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2655 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2656 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2657 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2658 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2659 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2660 return REG_ECOLLATE
;
2661 cmp_buf
[0] = start_wc
;
2662 cmp_buf
[4] = end_wc
;
2663 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2666 /* Got valid collation sequence values, add them as a new entry.
2667 However, for !_LIBC we have no collation elements: if the
2668 character set is single byte, the single byte character set
2669 that we build below suffices. parse_bracket_exp passes
2670 no MBCSET if dfa->mb_cur_max == 1. */
2673 /* Check the space of the arrays. */
2674 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2676 /* There is not enough space, need realloc. */
2677 wchar_t *new_array_start
, *new_array_end
;
2680 /* +1 in case of mbcset->nranges is 0. */
2681 new_nranges
= 2 * mbcset
->nranges
+ 1;
2682 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2683 are NULL if *range_alloc == 0. */
2684 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2686 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2689 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2692 mbcset
->range_starts
= new_array_start
;
2693 mbcset
->range_ends
= new_array_end
;
2694 *range_alloc
= new_nranges
;
2697 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2698 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2701 /* Build the table for single byte characters. */
2702 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2705 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2706 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2707 bitset_set (sbcset
, wc
);
2710 # else /* not RE_ENABLE_I18N */
2713 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2714 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2716 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2717 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2719 if (start_ch
> end_ch
)
2721 /* Build the table for single byte characters. */
2722 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2723 if (start_ch
<= ch
&& ch
<= end_ch
)
2724 bitset_set (sbcset
, ch
);
2726 # endif /* not RE_ENABLE_I18N */
2729 #endif /* not _LIBC */
2732 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2733 Build the collating element which is represented by NAME.
2734 The result are written to MBCSET and SBCSET.
2735 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2736 pointer argument since we may update it. */
2738 static reg_errcode_t
2740 # ifdef RE_ENABLE_I18N
2741 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2742 int *coll_sym_alloc
, const unsigned char *name
)
2743 # else /* not RE_ENABLE_I18N */
2744 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2745 # endif /* not RE_ENABLE_I18N */
2747 size_t name_len
= strlen ((const char *) name
);
2748 if (BE (name_len
!= 1, 0))
2749 return REG_ECOLLATE
;
2752 bitset_set (sbcset
, name
[0]);
2756 #endif /* not _LIBC */
2758 /* This function parse bracket expression like "[abc]", "[a-c]",
2762 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2763 reg_syntax_t syntax
, reg_errcode_t
*err
)
2766 const unsigned char *collseqmb
;
2767 const char *collseqwc
;
2770 const int32_t *symb_table
;
2771 const unsigned char *extra
;
2773 /* Local function for parse_bracket_exp used in _LIBC environement.
2774 Seek the collating symbol entry correspondings to NAME.
2775 Return the index of the symbol in the SYMB_TABLE. */
2778 __attribute ((always_inline
))
2779 seek_collating_symbol_entry (name
, name_len
)
2780 const unsigned char *name
;
2783 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2784 int32_t elem
= hash
% table_size
;
2785 if (symb_table
[2 * elem
] != 0)
2787 int32_t second
= hash
% (table_size
- 2) + 1;
2791 /* First compare the hashing value. */
2792 if (symb_table
[2 * elem
] == hash
2793 /* Compare the length of the name. */
2794 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2795 /* Compare the name. */
2796 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2799 /* Yep, this is the entry. */
2806 while (symb_table
[2 * elem
] != 0);
2811 /* Local function for parse_bracket_exp used in _LIBC environment.
2812 Look up the collation sequence value of BR_ELEM.
2813 Return the value if succeeded, UINT_MAX otherwise. */
2815 auto inline unsigned int
2816 __attribute ((always_inline
))
2817 lookup_collation_sequence_value (br_elem
)
2818 bracket_elem_t
*br_elem
;
2820 if (br_elem
->type
== SB_CHAR
)
2823 if (MB_CUR_MAX == 1)
2826 return collseqmb
[br_elem
->opr
.ch
];
2829 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2830 return __collseq_table_lookup (collseqwc
, wc
);
2833 else if (br_elem
->type
== MB_CHAR
)
2836 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2838 else if (br_elem
->type
== COLL_SYM
)
2840 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2844 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2846 if (symb_table
[2 * elem
] != 0)
2848 /* We found the entry. */
2849 idx
= symb_table
[2 * elem
+ 1];
2850 /* Skip the name of collating element name. */
2851 idx
+= 1 + extra
[idx
];
2852 /* Skip the byte sequence of the collating element. */
2853 idx
+= 1 + extra
[idx
];
2854 /* Adjust for the alignment. */
2855 idx
= (idx
+ 3) & ~3;
2856 /* Skip the multibyte collation sequence value. */
2857 idx
+= sizeof (unsigned int);
2858 /* Skip the wide char sequence of the collating element. */
2859 idx
+= sizeof (unsigned int) *
2860 (1 + *(unsigned int *) (extra
+ idx
));
2861 /* Return the collation sequence value. */
2862 return *(unsigned int *) (extra
+ idx
);
2864 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2866 /* No valid character. Match it as a single byte
2868 return collseqmb
[br_elem
->opr
.name
[0]];
2871 else if (sym_name_len
== 1)
2872 return collseqmb
[br_elem
->opr
.name
[0]];
2877 /* Local function for parse_bracket_exp used in _LIBC environement.
2878 Build the range expression which starts from START_ELEM, and ends
2879 at END_ELEM. The result are written to MBCSET and SBCSET.
2880 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2881 mbcset->range_ends, is a pointer argument sinse we may
2884 auto inline reg_errcode_t
2885 __attribute ((always_inline
))
2886 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2887 re_charset_t
*mbcset
;
2890 bracket_elem_t
*start_elem
, *end_elem
;
2893 uint32_t start_collseq
;
2894 uint32_t end_collseq
;
2896 /* Equivalence Classes and Character Classes can't be a range
2898 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2899 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2903 start_collseq
= lookup_collation_sequence_value (start_elem
);
2904 end_collseq
= lookup_collation_sequence_value (end_elem
);
2905 /* Check start/end collation sequence values. */
2906 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2907 return REG_ECOLLATE
;
2908 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2911 /* Got valid collation sequence values, add them as a new entry.
2912 However, if we have no collation elements, and the character set
2913 is single byte, the single byte character set that we
2914 build below suffices. */
2915 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2917 /* Check the space of the arrays. */
2918 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2920 /* There is not enough space, need realloc. */
2921 uint32_t *new_array_start
;
2922 uint32_t *new_array_end
;
2925 /* +1 in case of mbcset->nranges is 0. */
2926 new_nranges
= 2 * mbcset
->nranges
+ 1;
2927 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2929 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2932 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2935 mbcset
->range_starts
= new_array_start
;
2936 mbcset
->range_ends
= new_array_end
;
2937 *range_alloc
= new_nranges
;
2940 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2941 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2944 /* Build the table for single byte characters. */
2945 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2947 uint32_t ch_collseq
;
2949 if (MB_CUR_MAX == 1)
2952 ch_collseq
= collseqmb
[ch
];
2954 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2955 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2956 bitset_set (sbcset
, ch
);
2961 /* Local function for parse_bracket_exp used in _LIBC environement.
2962 Build the collating element which is represented by NAME.
2963 The result are written to MBCSET and SBCSET.
2964 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2965 pointer argument sinse we may update it. */
2967 auto inline reg_errcode_t
2968 __attribute ((always_inline
))
2969 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2970 re_charset_t
*mbcset
;
2971 int *coll_sym_alloc
;
2973 const unsigned char *name
;
2976 size_t name_len
= strlen ((const char *) name
);
2979 elem
= seek_collating_symbol_entry (name
, name_len
);
2980 if (symb_table
[2 * elem
] != 0)
2982 /* We found the entry. */
2983 idx
= symb_table
[2 * elem
+ 1];
2984 /* Skip the name of collating element name. */
2985 idx
+= 1 + extra
[idx
];
2987 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2989 /* No valid character, treat it as a normal
2991 bitset_set (sbcset
, name
[0]);
2995 return REG_ECOLLATE
;
2997 /* Got valid collation sequence, add it as a new entry. */
2998 /* Check the space of the arrays. */
2999 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3001 /* Not enough, realloc it. */
3002 /* +1 in case of mbcset->ncoll_syms is 0. */
3003 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3004 /* Use realloc since mbcset->coll_syms is NULL
3006 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3007 new_coll_sym_alloc
);
3008 if (BE (new_coll_syms
== NULL
, 0))
3010 mbcset
->coll_syms
= new_coll_syms
;
3011 *coll_sym_alloc
= new_coll_sym_alloc
;
3013 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3018 if (BE (name_len
!= 1, 0))
3019 return REG_ECOLLATE
;
3022 bitset_set (sbcset
, name
[0]);
3029 re_token_t br_token
;
3030 re_bitset_ptr_t sbcset
;
3031 #ifdef RE_ENABLE_I18N
3032 re_charset_t
*mbcset
;
3033 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3034 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3035 #endif /* not RE_ENABLE_I18N */
3037 bin_tree_t
*work_tree
;
3039 int first_round
= 1;
3041 collseqmb
= (const unsigned char *)
3042 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3043 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3049 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3050 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3051 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3052 _NL_COLLATE_SYMB_TABLEMB
);
3053 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3054 _NL_COLLATE_SYMB_EXTRAMB
);
3057 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3058 #ifdef RE_ENABLE_I18N
3059 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3060 #endif /* RE_ENABLE_I18N */
3061 #ifdef RE_ENABLE_I18N
3062 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3064 if (BE (sbcset
== NULL
, 0))
3065 #endif /* RE_ENABLE_I18N */
3068 #ifdef RE_ENABLE_I18N
3075 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3076 if (BE (token
->type
== END_OF_RE
, 0))
3079 goto parse_bracket_exp_free_return
;
3081 if (token
->type
== OP_NON_MATCH_LIST
)
3083 #ifdef RE_ENABLE_I18N
3084 mbcset
->non_match
= 1;
3085 #endif /* not RE_ENABLE_I18N */
3087 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3088 bitset_set (sbcset
, '\n');
3089 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3090 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3091 if (BE (token
->type
== END_OF_RE
, 0))
3094 goto parse_bracket_exp_free_return
;
3098 /* We treat the first ']' as a normal character. */
3099 if (token
->type
== OP_CLOSE_BRACKET
)
3100 token
->type
= CHARACTER
;
3104 bracket_elem_t start_elem
, end_elem
;
3105 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3106 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3108 int token_len2
= 0, is_range_exp
= 0;
3111 start_elem
.opr
.name
= start_name_buf
;
3112 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3113 syntax
, first_round
);
3114 if (BE (ret
!= REG_NOERROR
, 0))
3117 goto parse_bracket_exp_free_return
;
3121 /* Get information about the next token. We need it in any case. */
3122 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3124 /* Do not check for ranges if we know they are not allowed. */
3125 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3127 if (BE (token
->type
== END_OF_RE
, 0))
3130 goto parse_bracket_exp_free_return
;
3132 if (token
->type
== OP_CHARSET_RANGE
)
3134 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3135 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3136 if (BE (token2
.type
== END_OF_RE
, 0))
3139 goto parse_bracket_exp_free_return
;
3141 if (token2
.type
== OP_CLOSE_BRACKET
)
3143 /* We treat the last '-' as a normal character. */
3144 re_string_skip_bytes (regexp
, -token_len
);
3145 token
->type
= CHARACTER
;
3152 if (is_range_exp
== 1)
3154 end_elem
.opr
.name
= end_name_buf
;
3155 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3157 if (BE (ret
!= REG_NOERROR
, 0))
3160 goto parse_bracket_exp_free_return
;
3163 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3166 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3167 &start_elem
, &end_elem
);
3169 # ifdef RE_ENABLE_I18N
3170 *err
= build_range_exp (sbcset
,
3171 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3172 &range_alloc
, &start_elem
, &end_elem
);
3174 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3176 #endif /* RE_ENABLE_I18N */
3177 if (BE (*err
!= REG_NOERROR
, 0))
3178 goto parse_bracket_exp_free_return
;
3182 switch (start_elem
.type
)
3185 bitset_set (sbcset
, start_elem
.opr
.ch
);
3187 #ifdef RE_ENABLE_I18N
3189 /* Check whether the array has enough space. */
3190 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3192 wchar_t *new_mbchars
;
3193 /* Not enough, realloc it. */
3194 /* +1 in case of mbcset->nmbchars is 0. */
3195 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3196 /* Use realloc since array is NULL if *alloc == 0. */
3197 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3199 if (BE (new_mbchars
== NULL
, 0))
3200 goto parse_bracket_exp_espace
;
3201 mbcset
->mbchars
= new_mbchars
;
3203 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3205 #endif /* RE_ENABLE_I18N */
3207 *err
= build_equiv_class (sbcset
,
3208 #ifdef RE_ENABLE_I18N
3209 mbcset
, &equiv_class_alloc
,
3210 #endif /* RE_ENABLE_I18N */
3211 start_elem
.opr
.name
);
3212 if (BE (*err
!= REG_NOERROR
, 0))
3213 goto parse_bracket_exp_free_return
;
3216 *err
= build_collating_symbol (sbcset
,
3217 #ifdef RE_ENABLE_I18N
3218 mbcset
, &coll_sym_alloc
,
3219 #endif /* RE_ENABLE_I18N */
3220 start_elem
.opr
.name
);
3221 if (BE (*err
!= REG_NOERROR
, 0))
3222 goto parse_bracket_exp_free_return
;
3225 *err
= build_charclass (regexp
->trans
, sbcset
,
3226 #ifdef RE_ENABLE_I18N
3227 mbcset
, &char_class_alloc
,
3228 #endif /* RE_ENABLE_I18N */
3229 start_elem
.opr
.name
, syntax
);
3230 if (BE (*err
!= REG_NOERROR
, 0))
3231 goto parse_bracket_exp_free_return
;
3238 if (BE (token
->type
== END_OF_RE
, 0))
3241 goto parse_bracket_exp_free_return
;
3243 if (token
->type
== OP_CLOSE_BRACKET
)
3247 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3249 /* If it is non-matching list. */
3251 bitset_not (sbcset
);
3253 #ifdef RE_ENABLE_I18N
3254 /* Ensure only single byte characters are set. */
3255 if (dfa
->mb_cur_max
> 1)
3256 bitset_mask (sbcset
, dfa
->sb_char
);
3258 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3259 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3260 || mbcset
->non_match
)))
3262 bin_tree_t
*mbc_tree
;
3264 /* Build a tree for complex bracket. */
3265 dfa
->has_mb_node
= 1;
3266 br_token
.type
= COMPLEX_BRACKET
;
3267 br_token
.opr
.mbcset
= mbcset
;
3268 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3269 if (BE (mbc_tree
== NULL
, 0))
3270 goto parse_bracket_exp_espace
;
3271 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3272 if (sbcset
[sbc_idx
])
3274 /* If there are no bits set in sbcset, there is no point
3275 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3276 if (sbc_idx
< BITSET_WORDS
)
3278 /* Build a tree for simple bracket. */
3279 br_token
.type
= SIMPLE_BRACKET
;
3280 br_token
.opr
.sbcset
= sbcset
;
3281 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3282 if (BE (work_tree
== NULL
, 0))
3283 goto parse_bracket_exp_espace
;
3285 /* Then join them by ALT node. */
3286 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3287 if (BE (work_tree
== NULL
, 0))
3288 goto parse_bracket_exp_espace
;
3293 work_tree
= mbc_tree
;
3297 #endif /* not RE_ENABLE_I18N */
3299 #ifdef RE_ENABLE_I18N
3300 free_charset (mbcset
);
3302 /* Build a tree for simple bracket. */
3303 br_token
.type
= SIMPLE_BRACKET
;
3304 br_token
.opr
.sbcset
= sbcset
;
3305 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3306 if (BE (work_tree
== NULL
, 0))
3307 goto parse_bracket_exp_espace
;
3311 parse_bracket_exp_espace
:
3313 parse_bracket_exp_free_return
:
3315 #ifdef RE_ENABLE_I18N
3316 free_charset (mbcset
);
3317 #endif /* RE_ENABLE_I18N */
3321 /* Parse an element in the bracket expression. */
3323 static reg_errcode_t
3324 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3325 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3326 reg_syntax_t syntax
, int accept_hyphen
)
3328 #ifdef RE_ENABLE_I18N
3330 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3331 if (cur_char_size
> 1)
3333 elem
->type
= MB_CHAR
;
3334 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3335 re_string_skip_bytes (regexp
, cur_char_size
);
3338 #endif /* RE_ENABLE_I18N */
3339 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3340 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3341 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3342 return parse_bracket_symbol (elem
, regexp
, token
);
3343 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3345 /* A '-' must only appear as anything but a range indicator before
3346 the closing bracket. Everything else is an error. */
3348 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3349 if (token2
.type
!= OP_CLOSE_BRACKET
)
3350 /* The actual error value is not standardized since this whole
3351 case is undefined. But ERANGE makes good sense. */
3354 elem
->type
= SB_CHAR
;
3355 elem
->opr
.ch
= token
->opr
.c
;
3359 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3360 such as [:<character_class>:], [.<collating_element>.], and
3361 [=<equivalent_class>=]. */
3363 static reg_errcode_t
3364 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3367 unsigned char ch
, delim
= token
->opr
.c
;
3369 if (re_string_eoi(regexp
))
3373 if (i
>= BRACKET_NAME_BUF_SIZE
)
3375 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3376 ch
= re_string_fetch_byte_case (regexp
);
3378 ch
= re_string_fetch_byte (regexp
);
3379 if (re_string_eoi(regexp
))
3381 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3383 elem
->opr
.name
[i
] = ch
;
3385 re_string_skip_bytes (regexp
, 1);
3386 elem
->opr
.name
[i
] = '\0';
3387 switch (token
->type
)
3389 case OP_OPEN_COLL_ELEM
:
3390 elem
->type
= COLL_SYM
;
3392 case OP_OPEN_EQUIV_CLASS
:
3393 elem
->type
= EQUIV_CLASS
;
3395 case OP_OPEN_CHAR_CLASS
:
3396 elem
->type
= CHAR_CLASS
;
3404 /* Helper function for parse_bracket_exp.
3405 Build the equivalence class which is represented by NAME.
3406 The result are written to MBCSET and SBCSET.
3407 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3408 is a pointer argument sinse we may update it. */
3410 static reg_errcode_t
3411 #ifdef RE_ENABLE_I18N
3412 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3413 int *equiv_class_alloc
, const unsigned char *name
)
3414 #else /* not RE_ENABLE_I18N */
3415 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3416 #endif /* not RE_ENABLE_I18N */
3419 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3422 const int32_t *table
, *indirect
;
3423 const unsigned char *weights
, *extra
, *cp
;
3424 unsigned char char_buf
[2];
3428 /* This #include defines a local function! */
3429 # include <locale/weight.h>
3430 /* Calculate the index for equivalence class. */
3432 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3433 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3434 _NL_COLLATE_WEIGHTMB
);
3435 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3436 _NL_COLLATE_EXTRAMB
);
3437 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3438 _NL_COLLATE_INDIRECTMB
);
3439 idx1
= findidx (&cp
, -1);
3440 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3441 /* This isn't a valid character. */
3442 return REG_ECOLLATE
;
3444 /* Build single byte matcing table for this equivalence class. */
3445 len
= weights
[idx1
& 0xffffff];
3446 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3450 idx2
= findidx (&cp
, 1);
3455 /* This isn't a valid character. */
3457 /* Compare only if the length matches and the collation rule
3458 index is the same. */
3459 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3463 while (cnt
<= len
&&
3464 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3465 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3469 bitset_set (sbcset
, ch
);
3472 /* Check whether the array has enough space. */
3473 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3475 /* Not enough, realloc it. */
3476 /* +1 in case of mbcset->nequiv_classes is 0. */
3477 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3478 /* Use realloc since the array is NULL if *alloc == 0. */
3479 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3481 new_equiv_class_alloc
);
3482 if (BE (new_equiv_classes
== NULL
, 0))
3484 mbcset
->equiv_classes
= new_equiv_classes
;
3485 *equiv_class_alloc
= new_equiv_class_alloc
;
3487 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3492 if (BE (strlen ((const char *) name
) != 1, 0))
3493 return REG_ECOLLATE
;
3494 bitset_set (sbcset
, *name
);
3499 /* Helper function for parse_bracket_exp.
3500 Build the character class which is represented by NAME.
3501 The result are written to MBCSET and SBCSET.
3502 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3503 is a pointer argument sinse we may update it. */
3505 static reg_errcode_t
3506 #ifdef RE_ENABLE_I18N
3507 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3508 re_charset_t
*mbcset
, int *char_class_alloc
,
3509 const unsigned char *class_name
, reg_syntax_t syntax
)
3510 #else /* not RE_ENABLE_I18N */
3511 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3512 const unsigned char *class_name
, reg_syntax_t syntax
)
3513 #endif /* not RE_ENABLE_I18N */
3516 const char *name
= (const char *) class_name
;
3518 /* In case of REG_ICASE "upper" and "lower" match the both of
3519 upper and lower cases. */
3520 if ((syntax
& RE_ICASE
)
3521 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3524 #ifdef RE_ENABLE_I18N
3525 /* Check the space of the arrays. */
3526 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nchar_classes is 0. */
3530 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3531 /* Use realloc since array is NULL if *alloc == 0. */
3532 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3533 new_char_class_alloc
);
3534 if (BE (new_char_classes
== NULL
, 0))
3536 mbcset
->char_classes
= new_char_classes
;
3537 *char_class_alloc
= new_char_class_alloc
;
3539 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3540 #endif /* RE_ENABLE_I18N */
3542 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3544 if (BE (trans != NULL, 0)) \
3546 for (i = 0; i < SBC_MAX; ++i) \
3547 if (ctype_func (i)) \
3548 bitset_set (sbcset, trans[i]); \
3552 for (i = 0; i < SBC_MAX; ++i) \
3553 if (ctype_func (i)) \
3554 bitset_set (sbcset, i); \
3558 if (strcmp (name
, "alnum") == 0)
3559 BUILD_CHARCLASS_LOOP (isalnum
);
3560 else if (strcmp (name
, "cntrl") == 0)
3561 BUILD_CHARCLASS_LOOP (iscntrl
);
3562 else if (strcmp (name
, "lower") == 0)
3563 BUILD_CHARCLASS_LOOP (islower
);
3564 else if (strcmp (name
, "space") == 0)
3565 BUILD_CHARCLASS_LOOP (isspace
);
3566 else if (strcmp (name
, "alpha") == 0)
3567 BUILD_CHARCLASS_LOOP (isalpha
);
3568 else if (strcmp (name
, "digit") == 0)
3569 BUILD_CHARCLASS_LOOP (isdigit
);
3570 else if (strcmp (name
, "print") == 0)
3571 BUILD_CHARCLASS_LOOP (isprint
);
3572 else if (strcmp (name
, "upper") == 0)
3573 BUILD_CHARCLASS_LOOP (isupper
);
3574 else if (strcmp (name
, "blank") == 0)
3575 BUILD_CHARCLASS_LOOP (isblank
);
3576 else if (strcmp (name
, "graph") == 0)
3577 BUILD_CHARCLASS_LOOP (isgraph
);
3578 else if (strcmp (name
, "punct") == 0)
3579 BUILD_CHARCLASS_LOOP (ispunct
);
3580 else if (strcmp (name
, "xdigit") == 0)
3581 BUILD_CHARCLASS_LOOP (isxdigit
);
3589 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3590 const unsigned char *class_name
,
3591 const unsigned char *extra
, int non_match
,
3594 re_bitset_ptr_t sbcset
;
3595 #ifdef RE_ENABLE_I18N
3596 re_charset_t
*mbcset
;
3598 #endif /* not RE_ENABLE_I18N */
3600 re_token_t br_token
;
3603 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3604 #ifdef RE_ENABLE_I18N
3605 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3606 #endif /* RE_ENABLE_I18N */
3608 #ifdef RE_ENABLE_I18N
3609 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3610 #else /* not RE_ENABLE_I18N */
3611 if (BE (sbcset
== NULL
, 0))
3612 #endif /* not RE_ENABLE_I18N */
3620 #ifdef RE_ENABLE_I18N
3621 mbcset
->non_match
= 1;
3622 #endif /* not RE_ENABLE_I18N */
3625 /* We don't care the syntax in this case. */
3626 ret
= build_charclass (trans
, sbcset
,
3627 #ifdef RE_ENABLE_I18N
3629 #endif /* RE_ENABLE_I18N */
3632 if (BE (ret
!= REG_NOERROR
, 0))
3635 #ifdef RE_ENABLE_I18N
3636 free_charset (mbcset
);
3637 #endif /* RE_ENABLE_I18N */
3641 /* \w match '_' also. */
3642 for (; *extra
; extra
++)
3643 bitset_set (sbcset
, *extra
);
3645 /* If it is non-matching list. */
3647 bitset_not (sbcset
);
3649 #ifdef RE_ENABLE_I18N
3650 /* Ensure only single byte characters are set. */
3651 if (dfa
->mb_cur_max
> 1)
3652 bitset_mask (sbcset
, dfa
->sb_char
);
3655 /* Build a tree for simple bracket. */
3656 br_token
.type
= SIMPLE_BRACKET
;
3657 br_token
.opr
.sbcset
= sbcset
;
3658 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3659 if (BE (tree
== NULL
, 0))
3660 goto build_word_op_espace
;
3662 #ifdef RE_ENABLE_I18N
3663 if (dfa
->mb_cur_max
> 1)
3665 bin_tree_t
*mbc_tree
;
3666 /* Build a tree for complex bracket. */
3667 br_token
.type
= COMPLEX_BRACKET
;
3668 br_token
.opr
.mbcset
= mbcset
;
3669 dfa
->has_mb_node
= 1;
3670 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3671 if (BE (mbc_tree
== NULL
, 0))
3672 goto build_word_op_espace
;
3673 /* Then join them by ALT node. */
3674 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3675 if (BE (mbc_tree
!= NULL
, 1))
3680 free_charset (mbcset
);
3683 #else /* not RE_ENABLE_I18N */
3685 #endif /* not RE_ENABLE_I18N */
3687 build_word_op_espace
:
3689 #ifdef RE_ENABLE_I18N
3690 free_charset (mbcset
);
3691 #endif /* RE_ENABLE_I18N */
3696 /* This is intended for the expressions like "a{1,3}".
3697 Fetch a number from `input', and return the number.
3698 Return -1, if the number field is empty like "{,1}".
3699 Return -2, If an error is occured. */
3702 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3708 fetch_token (token
, input
, syntax
);
3710 if (BE (token
->type
== END_OF_RE
, 0))
3712 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3714 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3715 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3716 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3721 #ifdef RE_ENABLE_I18N
3723 free_charset (re_charset_t
*cset
)
3725 re_free (cset
->mbchars
);
3727 re_free (cset
->coll_syms
);
3728 re_free (cset
->equiv_classes
);
3729 re_free (cset
->range_starts
);
3730 re_free (cset
->range_ends
);
3732 re_free (cset
->char_classes
);
3735 #endif /* RE_ENABLE_I18N */
3737 /* Functions for binary tree operation. */
3739 /* Create a tree node. */
3742 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3743 re_token_type_t type
)
3747 return create_token_tree (dfa
, left
, right
, &t
);
3751 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3752 const re_token_t
*token
)
3755 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3757 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3759 if (storage
== NULL
)
3761 storage
->next
= dfa
->str_tree_storage
;
3762 dfa
->str_tree_storage
= storage
;
3763 dfa
->str_tree_storage_idx
= 0;
3765 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3767 tree
->parent
= NULL
;
3769 tree
->right
= right
;
3770 tree
->token
= *token
;
3771 tree
->token
.duplicated
= 0;
3772 tree
->token
.opt_subexp
= 0;
3775 tree
->node_idx
= -1;
3778 left
->parent
= tree
;
3780 right
->parent
= tree
;
3784 /* Mark the tree SRC as an optional subexpression.
3785 To be called from preorder or postorder. */
3787 static reg_errcode_t
3788 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3790 int idx
= (int) (long) extra
;
3791 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3792 node
->token
.opt_subexp
= 1;
3797 /* Free the allocated memory inside NODE. */
3800 free_token (re_token_t
*node
)
3802 #ifdef RE_ENABLE_I18N
3803 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3804 free_charset (node
->opr
.mbcset
);
3806 #endif /* RE_ENABLE_I18N */
3807 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3808 re_free (node
->opr
.sbcset
);
3811 /* Worker function for tree walking. Free the allocated memory inside NODE
3812 and its children. */
3814 static reg_errcode_t
3815 free_tree (void *extra
, bin_tree_t
*node
)
3817 free_token (&node
->token
);
3822 /* Duplicate the node SRC, and return new node. This is a preorder
3823 visit similar to the one implemented by the generic visitor, but
3824 we need more infrastructure to maintain two parallel trees --- so,
3825 it's easier to duplicate. */
3828 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3830 const bin_tree_t
*node
;
3831 bin_tree_t
*dup_root
;
3832 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3834 for (node
= root
; ; )
3836 /* Create a new tree and link it back to the current parent. */
3837 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3840 (*p_new
)->parent
= dup_node
;
3841 (*p_new
)->token
.duplicated
= 1;
3844 /* Go to the left node, or up and to the right. */
3848 p_new
= &dup_node
->left
;
3852 const bin_tree_t
*prev
= NULL
;
3853 while (node
->right
== prev
|| node
->right
== NULL
)
3856 node
= node
->parent
;
3857 dup_node
= dup_node
->parent
;
3862 p_new
= &dup_node
->right
;