1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
21 size_t length
, reg_syntax_t syntax
);
22 static void re_compile_fastmap_iter (regex_t
*bufp
,
23 const re_dfastate_t
*init_state
,
25 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void free_charset (re_charset_t
*cset
);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t
*preg
);
30 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
32 static void optimize_utf8 (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze (regex_t
*preg
);
35 static reg_errcode_t
preorder (bin_tree_t
*root
,
36 reg_errcode_t (fn (void *, bin_tree_t
*)),
38 static reg_errcode_t
postorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
42 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
43 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
45 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
48 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
49 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
50 unsigned int constraint
);
51 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
52 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
54 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
55 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
57 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 reg_syntax_t syntax
) internal_function
;
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 int nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 int nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
89 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
91 int *equiv_class_alloc
,
92 const unsigned char *name
);
93 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
96 int *char_class_alloc
,
97 const unsigned char *class_name
,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
101 const unsigned char *name
);
102 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
104 const unsigned char *class_name
,
105 reg_syntax_t syntax
);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
108 RE_TRANSLATE_TYPE trans
,
109 const unsigned char *class_name
,
110 const unsigned char *extra
,
111 int non_match
, reg_errcode_t
*err
);
112 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
113 bin_tree_t
*left
, bin_tree_t
*right
,
114 re_token_type_t type
);
115 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
116 bin_tree_t
*left
, bin_tree_t
*right
,
117 const re_token_t
*token
);
118 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
119 static void free_token (re_token_t
*node
);
120 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
121 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid
[] attribute_hidden
=
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx
[] attribute_hidden
=
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (pattern
, length
, bufp
)
216 struct re_pattern_buffer
*bufp
;
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
225 /* Match anchors at newline. */
226 bufp
->newline_anchor
= 1;
228 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
232 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
235 weak_alias (__re_compile_pattern
, re_compile_pattern
)
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options
;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
254 re_set_syntax (syntax
)
257 reg_syntax_t ret
= re_syntax_options
;
259 re_syntax_options
= syntax
;
263 weak_alias (__re_set_syntax
, re_set_syntax
)
267 re_compile_fastmap (bufp
)
268 struct re_pattern_buffer
*bufp
;
270 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
271 char *fastmap
= bufp
->fastmap
;
273 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
274 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_word
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_nl
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
281 bufp
->fastmap_accurate
= 1;
285 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
289 __attribute ((always_inline
))
290 re_set_fastmap (char *fastmap
, int icase
, int ch
)
294 fastmap
[tolower (ch
)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
304 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
306 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
307 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
309 int node
= init_state
->nodes
.elems
[node_cnt
];
310 re_token_type_t type
= dfa
->nodes
[node
].type
;
312 if (type
== CHARACTER
)
314 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
318 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
323 *p
++ = dfa
->nodes
[node
].opr
.c
;
324 while (++node
< dfa
->nodes_len
325 && dfa
->nodes
[node
].type
== CHARACTER
326 && dfa
->nodes
[node
].mb_partial
)
327 *p
++ = dfa
->nodes
[node
].opr
.c
;
328 memset (&state
, '\0', sizeof (state
));
329 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
331 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
333 re_set_fastmap (fastmap
, 0, buf
[0]);
337 else if (type
== SIMPLE_BRACKET
)
340 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
343 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
344 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
345 if (w
& ((bitset_word_t
) 1 << j
))
346 re_set_fastmap (fastmap
, icase
, ch
);
349 #ifdef RE_ENABLE_I18N
350 else if (type
== COMPLEX_BRACKET
)
352 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
356 /* See if we have to try all bytes which start multiple collation
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
363 && (cset
->ncoll_syms
|| cset
->nranges
))
365 const int32_t *table
= (const int32_t *)
366 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
367 for (i
= 0; i
< SBC_MAX
; ++i
)
369 re_set_fastmap (fastmap
, icase
, i
);
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa
->mb_cur_max
> 1
378 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
380 || cset
->nequiv_classes
388 memset (&mbs
, 0, sizeof (mbs
));
389 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
390 re_set_fastmap (fastmap
, false, (int) c
);
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i
= 0; i
< cset
->nmbchars
; ++i
)
402 memset (&state
, '\0', sizeof (state
));
403 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
404 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
405 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
407 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
409 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
414 #endif /* RE_ENABLE_I18N */
415 else if (type
== OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type
== OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type
== END_OF_RE
)
421 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
422 if (type
== END_OF_RE
)
423 bufp
->can_be_null
= 1;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (preg
, pattern
, cflags
)
467 regex_t
*__restrict preg
;
468 const char *__restrict pattern
;
472 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
473 : RE_SYNTAX_POSIX_BASIC
);
479 /* Try to allocate space for the fastmap. */
480 preg
->fastmap
= re_malloc (char, SBC_MAX
);
481 if (BE (preg
->fastmap
== NULL
, 0))
484 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags
& REG_NEWLINE
)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax
&= ~RE_DOT_NEWLINE
;
490 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
491 /* It also changes the matching behavior. */
492 preg
->newline_anchor
= 1;
495 preg
->newline_anchor
= 0;
496 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
497 preg
->translate
= NULL
;
499 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret
== REG_ERPAREN
)
506 /* We have already checked preg->fastmap != NULL. */
507 if (BE (ret
== REG_NOERROR
, 1))
508 /* Compute the fastmap now, since regexec cannot modify the pattern
509 buffer. This function never fails in this implementation. */
510 (void) re_compile_fastmap (preg
);
513 /* Some error occurred while compiling the expression. */
514 re_free (preg
->fastmap
);
515 preg
->fastmap
= NULL
;
521 weak_alias (__regcomp
, regcomp
)
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
528 regerror (errcode
, preg
, errbuf
, errbuf_size
)
530 const regex_t
*__restrict preg
;
531 char *__restrict errbuf
;
538 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
539 / sizeof (__re_error_msgid_idx
[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
546 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
548 msg_size
= strlen (msg
) + 1; /* Includes the null. */
550 if (BE (errbuf_size
!= 0, 1))
552 if (BE (msg_size
> errbuf_size
, 0))
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
557 memcpy (errbuf
, msg
, errbuf_size
- 1);
558 errbuf
[errbuf_size
- 1] = 0;
562 memcpy (errbuf
, msg
, msg_size
);
568 weak_alias (__regerror
, regerror
)
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
577 static const bitset_t utf8_sb_map
=
579 /* Set the first 128 bits. */
580 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
586 free_dfa_content (re_dfa_t
*dfa
)
591 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
592 free_token (dfa
->nodes
+ i
);
593 re_free (dfa
->nexts
);
594 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
596 if (dfa
->eclosures
!= NULL
)
597 re_node_set_free (dfa
->eclosures
+ i
);
598 if (dfa
->inveclosures
!= NULL
)
599 re_node_set_free (dfa
->inveclosures
+ i
);
600 if (dfa
->edests
!= NULL
)
601 re_node_set_free (dfa
->edests
+ i
);
603 re_free (dfa
->edests
);
604 re_free (dfa
->eclosures
);
605 re_free (dfa
->inveclosures
);
606 re_free (dfa
->nodes
);
608 if (dfa
->state_table
)
609 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
611 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
612 for (j
= 0; j
< entry
->num
; ++j
)
614 re_dfastate_t
*state
= entry
->array
[j
];
617 re_free (entry
->array
);
619 re_free (dfa
->state_table
);
620 #ifdef RE_ENABLE_I18N
621 if (dfa
->sb_char
!= utf8_sb_map
)
622 re_free (dfa
->sb_char
);
624 re_free (dfa
->subexp_map
);
626 re_free (dfa
->re_str
);
633 /* Free dynamically allocated space used by PREG. */
639 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
640 if (BE (dfa
!= NULL
, 1))
641 free_dfa_content (dfa
);
645 re_free (preg
->fastmap
);
646 preg
->fastmap
= NULL
;
648 re_free (preg
->translate
);
649 preg
->translate
= NULL
;
652 weak_alias (__regfree
, regfree
)
655 /* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
658 #if defined _REGEX_RE_COMP || defined _LIBC
660 /* BSD has one and only one pattern buffer. */
661 static struct re_pattern_buffer re_comp_buf
;
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
678 if (!re_comp_buf
.buffer
)
679 return gettext ("No previous regular expression");
683 if (re_comp_buf
.buffer
)
685 fastmap
= re_comp_buf
.fastmap
;
686 re_comp_buf
.fastmap
= NULL
;
687 __regfree (&re_comp_buf
);
688 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
689 re_comp_buf
.fastmap
= fastmap
;
692 if (re_comp_buf
.fastmap
== NULL
)
694 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
695 if (re_comp_buf
.fastmap
== NULL
)
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
703 /* Match anchors at newlines. */
704 re_comp_buf
.newline_anchor
= 1;
706 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
712 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
716 libc_freeres_fn (free_mem
)
718 __regfree (&re_comp_buf
);
722 #endif /* _REGEX_RE_COMP */
724 /* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
729 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
732 reg_errcode_t err
= REG_NOERROR
;
736 /* Initialize the pattern buffer. */
737 preg
->fastmap_accurate
= 0;
738 preg
->syntax
= syntax
;
739 preg
->not_bol
= preg
->not_eol
= 0;
742 preg
->can_be_null
= 0;
743 preg
->regs_allocated
= REGS_UNALLOCATED
;
745 /* Initialize the dfa. */
746 dfa
= (re_dfa_t
*) preg
->buffer
;
747 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
756 preg
->allocated
= sizeof (re_dfa_t
);
757 preg
->buffer
= (unsigned char *) dfa
;
759 preg
->used
= sizeof (re_dfa_t
);
761 err
= init_dfa (dfa
, length
);
762 if (BE (err
!= REG_NOERROR
, 0))
764 free_dfa_content (dfa
);
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa
->re_str
= re_malloc (char, length
+ 1);
772 strncpy (dfa
->re_str
, pattern
, length
+ 1);
775 __libc_lock_init (dfa
->lock
);
777 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
778 syntax
& RE_ICASE
, dfa
);
779 if (BE (err
!= REG_NOERROR
, 0))
781 re_compile_internal_free_return
:
782 free_workarea_compile (preg
);
783 re_string_destruct (®exp
);
784 free_dfa_content (dfa
);
790 /* Parse the regular expression, and build a structure tree. */
792 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
793 if (BE (dfa
->str_tree
== NULL
, 0))
794 goto re_compile_internal_free_return
;
796 /* Analyze the tree and create the nfa. */
797 err
= analyze (preg
);
798 if (BE (err
!= REG_NOERROR
, 0))
799 goto re_compile_internal_free_return
;
801 #ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
803 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
807 /* Then create the initial state of the dfa. */
808 err
= create_initial_state (dfa
);
810 /* Release work areas. */
811 free_workarea_compile (preg
);
812 re_string_destruct (®exp
);
814 if (BE (err
!= REG_NOERROR
, 0))
816 free_dfa_content (dfa
);
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
828 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
830 unsigned int table_size
;
835 memset (dfa
, '\0', sizeof (re_dfa_t
));
837 /* Force allocation of str_tree_storage the first time. */
838 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
840 /* Avoid overflows. */
841 if (pat_len
== SIZE_MAX
)
844 dfa
->nodes_alloc
= pat_len
+ 1;
845 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
847 /* table_size = 2 ^ ceil(log pat_len) */
848 for (table_size
= 1; ; table_size
<<= 1)
849 if (table_size
> pat_len
)
852 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
853 dfa
->state_hash_mask
= table_size
- 1;
855 dfa
->mb_cur_max
= MB_CUR_MAX
;
857 if (dfa
->mb_cur_max
== 6
858 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
860 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
863 # ifdef HAVE_LANGINFO_CODESET
864 codeset_name
= nl_langinfo (CODESET
);
866 codeset_name
= getenv ("LC_ALL");
867 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
868 codeset_name
= getenv ("LC_CTYPE");
869 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
870 codeset_name
= getenv ("LANG");
871 if (codeset_name
== NULL
)
873 else if (strchr (codeset_name
, '.') != NULL
)
874 codeset_name
= strchr (codeset_name
, '.') + 1;
877 if (strcasecmp (codeset_name
, "UTF-8") == 0
878 || strcasecmp (codeset_name
, "UTF8") == 0)
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa
->map_notascii
= 0;
886 #ifdef RE_ENABLE_I18N
887 if (dfa
->mb_cur_max
> 1)
890 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
895 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
896 if (BE (dfa
->sb_char
== NULL
, 0))
899 /* Set the bits corresponding to single byte chars. */
900 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
901 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
903 wint_t wch
= __btowc (ch
);
905 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
907 if (isascii (ch
) && wch
!= ch
)
908 dfa
->map_notascii
= 1;
915 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
920 /* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
926 init_word_char (re_dfa_t
*dfa
)
928 dfa
->word_ops_used
= 1;
931 if (BE (dfa
->map_notascii
== 0, 1))
933 if (sizeof (dfa
->word_char
[0]) == 8)
935 /* The extra temporaries here avoid "implicitly truncated"
936 warnings in the case when this is dead code, i.e. 32-bit. */
937 const uint64_t wc0
= UINT64_C (0x03ff000000000000);
938 const uint64_t wc1
= UINT64_C (0x07fffffe87fffffe);
939 dfa
->word_char
[0] = wc0
;
940 dfa
->word_char
[1] = wc1
;
943 else if (sizeof (dfa
->word_char
[0]) == 4)
945 dfa
->word_char
[0] = UINT32_C (0x00000000);
946 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
947 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
948 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
955 if (BE (dfa
->is_utf8
, 1))
957 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
962 for (; i
< BITSET_WORDS
; ++i
)
963 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
964 if (isalnum (ch
) || ch
== '_')
965 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
968 /* Free the work area which are only used while compiling. */
971 free_workarea_compile (regex_t
*preg
)
973 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
974 bin_tree_storage_t
*storage
, *next
;
975 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
977 next
= storage
->next
;
980 dfa
->str_tree_storage
= NULL
;
981 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
982 dfa
->str_tree
= NULL
;
983 re_free (dfa
->org_indices
);
984 dfa
->org_indices
= NULL
;
987 /* Create initial states for all contexts. */
990 create_initial_state (re_dfa_t
*dfa
)
994 re_node_set init_nodes
;
996 /* Initial states have the epsilon closure of the node which is
997 the first node of the regular expression. */
998 first
= dfa
->str_tree
->first
->node_idx
;
999 dfa
->init_node
= first
;
1000 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1001 if (BE (err
!= REG_NOERROR
, 0))
1004 /* The back-references which are in initial states can epsilon transit,
1005 since in this case all of the subexpressions can be null.
1006 Then we add epsilon closures of the nodes which are the next nodes of
1007 the back-references. */
1008 if (dfa
->nbackref
> 0)
1009 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1011 int node_idx
= init_nodes
.elems
[i
];
1012 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1015 if (type
!= OP_BACK_REF
)
1017 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1019 re_token_t
*clexp_node
;
1020 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1021 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1022 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1025 if (clexp_idx
== init_nodes
.nelem
)
1028 if (type
== OP_BACK_REF
)
1030 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1031 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1033 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1036 if (err
!= REG_NOERROR
)
1043 /* It must be the first time to invoke acquire_state. */
1044 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1045 /* We don't check ERR here, since the initial state must not be NULL. */
1046 if (BE (dfa
->init_state
== NULL
, 0))
1048 if (dfa
->init_state
->has_constraint
)
1050 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1052 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1054 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1058 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1059 || dfa
->init_state_begbuf
== NULL
, 0))
1063 dfa
->init_state_word
= dfa
->init_state_nl
1064 = dfa
->init_state_begbuf
= dfa
->init_state
;
1066 re_node_set_free (&init_nodes
);
1070 #ifdef RE_ENABLE_I18N
1071 /* If it is possible to do searching in single byte encoding instead of UTF-8
1072 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073 DFA nodes where needed. */
1076 optimize_utf8 (re_dfa_t
*dfa
)
1078 int node
, i
, mb_chars
= 0, has_period
= 0;
1080 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1081 switch (dfa
->nodes
[node
].type
)
1084 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1088 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1096 /* Word anchors etc. cannot be handled. It's okay to test
1097 opr.ctx_type since constraints (for all DFA nodes) are
1098 created by ORing one or more opr.ctx_type values. */
1108 case OP_DUP_ASTERISK
:
1109 case OP_OPEN_SUBEXP
:
1110 case OP_CLOSE_SUBEXP
:
1112 case COMPLEX_BRACKET
:
1114 case SIMPLE_BRACKET
:
1115 /* Just double check. The non-ASCII range starts at 0x80. */
1116 assert (0x80 % BITSET_WORD_BITS
== 0);
1117 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1118 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1125 if (mb_chars
|| has_period
)
1126 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1128 if (dfa
->nodes
[node
].type
== CHARACTER
1129 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1130 dfa
->nodes
[node
].mb_partial
= 0;
1131 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1132 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1135 /* The search can be in single byte locale. */
1136 dfa
->mb_cur_max
= 1;
1138 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143 "eclosure", and "inveclosure". */
1145 static reg_errcode_t
1146 analyze (regex_t
*preg
)
1148 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1151 /* Allocate arrays. */
1152 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1153 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1154 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1155 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1156 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1157 || dfa
->eclosures
== NULL
, 0))
1160 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1161 if (dfa
->subexp_map
!= NULL
)
1164 for (i
= 0; i
< preg
->re_nsub
; i
++)
1165 dfa
->subexp_map
[i
] = i
;
1166 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1167 for (i
= 0; i
< preg
->re_nsub
; i
++)
1168 if (dfa
->subexp_map
[i
] != i
)
1170 if (i
== preg
->re_nsub
)
1172 free (dfa
->subexp_map
);
1173 dfa
->subexp_map
= NULL
;
1177 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1178 if (BE (ret
!= REG_NOERROR
, 0))
1180 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1181 if (BE (ret
!= REG_NOERROR
, 0))
1183 preorder (dfa
->str_tree
, calc_next
, dfa
);
1184 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1185 if (BE (ret
!= REG_NOERROR
, 0))
1187 ret
= calc_eclosure (dfa
);
1188 if (BE (ret
!= REG_NOERROR
, 0))
1191 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1196 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1197 if (BE (dfa
->inveclosures
== NULL
, 0))
1199 ret
= calc_inveclosure (dfa
);
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206 implement parse tree visits. Instead, we use parent pointers and
1207 some hairy code in these two functions. */
1208 static reg_errcode_t
1209 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1212 bin_tree_t
*node
, *prev
;
1214 for (node
= root
; ; )
1216 /* Descend down the tree, preferably to the left (or to the right
1217 if that's the only child). */
1218 while (node
->left
|| node
->right
)
1226 reg_errcode_t err
= fn (extra
, node
);
1227 if (BE (err
!= REG_NOERROR
, 0))
1229 if (node
->parent
== NULL
)
1232 node
= node
->parent
;
1234 /* Go up while we have a node that is reached from the right. */
1235 while (node
->right
== prev
|| node
->right
== NULL
);
1240 static reg_errcode_t
1241 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1246 for (node
= root
; ; )
1248 reg_errcode_t err
= fn (extra
, node
);
1249 if (BE (err
!= REG_NOERROR
, 0))
1252 /* Go to the left node, or up and to the right. */
1257 bin_tree_t
*prev
= NULL
;
1258 while (node
->right
== prev
|| node
->right
== NULL
)
1261 node
= node
->parent
;
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272 backreferences as well. Requires a preorder visit. */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra
, bin_tree_t
*node
)
1276 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1278 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1280 int idx
= node
->token
.opr
.idx
;
1281 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1282 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1285 else if (node
->token
.type
== SUBEXP
1286 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1288 int other_idx
= node
->left
->token
.opr
.idx
;
1290 node
->left
= node
->left
->left
;
1292 node
->left
->parent
= node
;
1294 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1295 if (other_idx
< BITSET_WORD_BITS
)
1296 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304 static reg_errcode_t
1305 lower_subexps (void *extra
, bin_tree_t
*node
)
1307 regex_t
*preg
= (regex_t
*) extra
;
1308 reg_errcode_t err
= REG_NOERROR
;
1310 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1312 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1314 node
->left
->parent
= node
;
1316 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1318 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1320 node
->right
->parent
= node
;
1327 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1329 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1330 bin_tree_t
*body
= node
->left
;
1331 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1334 /* We do not optimize empty subexpressions, because otherwise we may
1335 have bad CONCAT nodes with NULL children. This is obviously not
1336 very common, so we do not lose much. An example that triggers
1337 this case is the sed "script" /\(\)/x. */
1338 && node
->left
!= NULL
1339 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1340 || !(dfa
->used_bkref_map
1341 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1347 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1348 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1349 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1350 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1356 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1357 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363 static reg_errcode_t
1364 calc_first (void *extra
, bin_tree_t
*node
)
1366 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1367 if (node
->token
.type
== CONCAT
)
1369 node
->first
= node
->left
->first
;
1370 node
->node_idx
= node
->left
->node_idx
;
1375 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1376 if (BE (node
->node_idx
== -1, 0))
1378 if (node
->token
.type
== ANCHOR
)
1379 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1384 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1385 static reg_errcode_t
1386 calc_next (void *extra
, bin_tree_t
*node
)
1388 switch (node
->token
.type
)
1390 case OP_DUP_ASTERISK
:
1391 node
->left
->next
= node
;
1394 node
->left
->next
= node
->right
->first
;
1395 node
->right
->next
= node
->next
;
1399 node
->left
->next
= node
->next
;
1401 node
->right
->next
= node
->next
;
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1411 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1412 int idx
= node
->node_idx
;
1413 reg_errcode_t err
= REG_NOERROR
;
1415 switch (node
->token
.type
)
1421 assert (node
->next
== NULL
);
1424 case OP_DUP_ASTERISK
:
1428 dfa
->has_plural_match
= 1;
1429 if (node
->left
!= NULL
)
1430 left
= node
->left
->first
->node_idx
;
1432 left
= node
->next
->node_idx
;
1433 if (node
->right
!= NULL
)
1434 right
= node
->right
->first
->node_idx
;
1436 right
= node
->next
->node_idx
;
1438 assert (right
> -1);
1439 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1444 case OP_OPEN_SUBEXP
:
1445 case OP_CLOSE_SUBEXP
:
1446 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1450 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1451 if (node
->token
.type
== OP_BACK_REF
)
1452 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1456 assert (!IS_EPSILON_NODE (node
->token
.type
));
1457 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466 to their own constraint. */
1468 static reg_errcode_t
1470 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1471 int root_node
, unsigned int init_constraint
)
1473 int org_node
, clone_node
, ret
;
1474 unsigned int constraint
= init_constraint
;
1475 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1477 int org_dest
, clone_dest
;
1478 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest
= dfa
->nexts
[org_node
];
1485 re_node_set_empty (dfa
->edests
+ clone_node
);
1486 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1487 if (BE (clone_dest
== -1, 0))
1489 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1490 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1491 if (BE (ret
< 0, 0))
1494 else if (dfa
->edests
[org_node
].nelem
== 0)
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1502 else if (dfa
->edests
[org_node
].nelem
== 1)
1504 /* In case of the node can epsilon-transit, and it has only one
1506 org_dest
= dfa
->edests
[org_node
].elems
[0];
1507 re_node_set_empty (dfa
->edests
+ clone_node
);
1508 /* If the node is root_node itself, it means the epsilon clsoure
1509 has a loop. Then tie it to the destination of the root_node. */
1510 if (org_node
== root_node
&& clone_node
!= org_node
)
1512 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1513 if (BE (ret
< 0, 0))
1517 /* In case of the node has another constraint, add it. */
1518 constraint
|= dfa
->nodes
[org_node
].constraint
;
1519 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1520 if (BE (clone_dest
== -1, 0))
1522 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1523 if (BE (ret
< 0, 0))
1526 else /* dfa->edests[org_node].nelem == 2 */
1528 /* In case of the node can epsilon-transit, and it has two
1529 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1530 org_dest
= dfa
->edests
[org_node
].elems
[0];
1531 re_node_set_empty (dfa
->edests
+ clone_node
);
1532 /* Search for a duplicated node which satisfies the constraint. */
1533 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1534 if (clone_dest
== -1)
1536 /* There is no such duplicated node, create a new one. */
1538 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1539 if (BE (clone_dest
== -1, 0))
1541 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1542 if (BE (ret
< 0, 0))
1544 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1545 root_node
, constraint
);
1546 if (BE (err
!= REG_NOERROR
, 0))
1551 /* There is a duplicated node which satisfies the constraint,
1552 use it to avoid infinite loop. */
1553 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1554 if (BE (ret
< 0, 0))
1558 org_dest
= dfa
->edests
[org_node
].elems
[1];
1559 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1560 if (BE (clone_dest
== -1, 0))
1562 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1563 if (BE (ret
< 0, 0))
1566 org_node
= org_dest
;
1567 clone_node
= clone_dest
;
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1576 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1577 unsigned int constraint
)
1580 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1582 if (org_node
== dfa
->org_indices
[idx
]
1583 && constraint
== dfa
->nodes
[idx
].constraint
)
1584 return idx
; /* Found. */
1586 return -1; /* Not found. */
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590 Return the index of the new node, or -1 if insufficient storage is
1594 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1596 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1597 if (BE (dup_idx
!= -1, 1))
1599 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1600 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1601 dfa
->nodes
[dup_idx
].duplicated
= 1;
1603 /* Store the index of the original node. */
1604 dfa
->org_indices
[dup_idx
] = org_idx
;
1609 static reg_errcode_t
1610 calc_inveclosure (re_dfa_t
*dfa
)
1613 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1614 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1616 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1618 int *elems
= dfa
->eclosures
[src
].elems
;
1619 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1621 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1622 if (BE (ret
== -1, 0))
1630 /* Calculate "eclosure" for all the node in DFA. */
1632 static reg_errcode_t
1633 calc_eclosure (re_dfa_t
*dfa
)
1635 int node_idx
, incomplete
;
1637 assert (dfa
->nodes_len
> 0);
1640 /* For each nodes, calculate epsilon closure. */
1641 for (node_idx
= 0; ; ++node_idx
)
1644 re_node_set eclosure_elem
;
1645 if (node_idx
== dfa
->nodes_len
)
1654 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1657 /* If we have already calculated, skip it. */
1658 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1660 /* Calculate epsilon closure of `node_idx'. */
1661 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1662 if (BE (err
!= REG_NOERROR
, 0))
1665 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1668 re_node_set_free (&eclosure_elem
);
1674 /* Calculate epsilon closure of NODE. */
1676 static reg_errcode_t
1677 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1681 re_node_set eclosure
;
1684 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1685 if (BE (err
!= REG_NOERROR
, 0))
1688 /* This indicates that we are calculating this node now.
1689 We reference this value to avoid infinite loop. */
1690 dfa
->eclosures
[node
].nelem
= -1;
1692 /* If the current node has constraints, duplicate all nodes
1693 since they must inherit the constraints. */
1694 if (dfa
->nodes
[node
].constraint
1695 && dfa
->edests
[node
].nelem
1696 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1698 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1699 dfa
->nodes
[node
].constraint
);
1700 if (BE (err
!= REG_NOERROR
, 0))
1704 /* Expand each epsilon destination nodes. */
1705 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1706 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1708 re_node_set eclosure_elem
;
1709 int edest
= dfa
->edests
[node
].elems
[i
];
1710 /* If calculating the epsilon closure of `edest' is in progress,
1711 return intermediate result. */
1712 if (dfa
->eclosures
[edest
].nelem
== -1)
1717 /* If we haven't calculated the epsilon closure of `edest' yet,
1718 calculate now. Otherwise use calculated epsilon closure. */
1719 if (dfa
->eclosures
[edest
].nelem
== 0)
1721 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1722 if (BE (err
!= REG_NOERROR
, 0))
1726 eclosure_elem
= dfa
->eclosures
[edest
];
1727 /* Merge the epsilon closure of `edest'. */
1728 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1729 if (BE (err
!= REG_NOERROR
, 0))
1731 /* If the epsilon closure of `edest' is incomplete,
1732 the epsilon closure of this node is also incomplete. */
1733 if (dfa
->eclosures
[edest
].nelem
== 0)
1736 re_node_set_free (&eclosure_elem
);
1740 /* An epsilon closure includes itself. */
1741 ret
= re_node_set_insert (&eclosure
, node
);
1742 if (BE (ret
< 0, 0))
1744 if (incomplete
&& !root
)
1745 dfa
->eclosures
[node
].nelem
= 0;
1747 dfa
->eclosures
[node
] = eclosure
;
1748 *new_set
= eclosure
;
1752 /* Functions for token which are used in the parser. */
1754 /* Fetch a token from INPUT.
1755 We must not use this function inside bracket expressions. */
1759 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1761 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1764 /* Peek a token from INPUT, and return the length of the token.
1765 We must not use this function inside bracket expressions. */
1769 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1773 if (re_string_eoi (input
))
1775 token
->type
= END_OF_RE
;
1779 c
= re_string_peek_byte (input
, 0);
1782 token
->word_char
= 0;
1783 #ifdef RE_ENABLE_I18N
1784 token
->mb_partial
= 0;
1785 if (input
->mb_cur_max
> 1 &&
1786 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1788 token
->type
= CHARACTER
;
1789 token
->mb_partial
= 1;
1796 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1798 token
->type
= BACK_SLASH
;
1802 c2
= re_string_peek_byte_case (input
, 1);
1804 token
->type
= CHARACTER
;
1805 #ifdef RE_ENABLE_I18N
1806 if (input
->mb_cur_max
> 1)
1808 wint_t wc
= re_string_wchar_at (input
,
1809 re_string_cur_idx (input
) + 1);
1810 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1814 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1819 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1820 token
->type
= OP_ALT
;
1822 case '1': case '2': case '3': case '4': case '5':
1823 case '6': case '7': case '8': case '9':
1824 if (!(syntax
& RE_NO_BK_REFS
))
1826 token
->type
= OP_BACK_REF
;
1827 token
->opr
.idx
= c2
- '1';
1831 if (!(syntax
& RE_NO_GNU_OPS
))
1833 token
->type
= ANCHOR
;
1834 token
->opr
.ctx_type
= WORD_FIRST
;
1838 if (!(syntax
& RE_NO_GNU_OPS
))
1840 token
->type
= ANCHOR
;
1841 token
->opr
.ctx_type
= WORD_LAST
;
1845 if (!(syntax
& RE_NO_GNU_OPS
))
1847 token
->type
= ANCHOR
;
1848 token
->opr
.ctx_type
= WORD_DELIM
;
1852 if (!(syntax
& RE_NO_GNU_OPS
))
1854 token
->type
= ANCHOR
;
1855 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= OP_WORD
;
1863 if (!(syntax
& RE_NO_GNU_OPS
))
1864 token
->type
= OP_NOTWORD
;
1867 if (!(syntax
& RE_NO_GNU_OPS
))
1868 token
->type
= OP_SPACE
;
1871 if (!(syntax
& RE_NO_GNU_OPS
))
1872 token
->type
= OP_NOTSPACE
;
1875 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= ANCHOR
;
1878 token
->opr
.ctx_type
= BUF_FIRST
;
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= ANCHOR
;
1885 token
->opr
.ctx_type
= BUF_LAST
;
1889 if (!(syntax
& RE_NO_BK_PARENS
))
1890 token
->type
= OP_OPEN_SUBEXP
;
1893 if (!(syntax
& RE_NO_BK_PARENS
))
1894 token
->type
= OP_CLOSE_SUBEXP
;
1897 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1898 token
->type
= OP_DUP_PLUS
;
1901 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1902 token
->type
= OP_DUP_QUESTION
;
1905 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1906 token
->type
= OP_OPEN_DUP_NUM
;
1909 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1910 token
->type
= OP_CLOSE_DUP_NUM
;
1918 token
->type
= CHARACTER
;
1919 #ifdef RE_ENABLE_I18N
1920 if (input
->mb_cur_max
> 1)
1922 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1923 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1927 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1932 if (syntax
& RE_NEWLINE_ALT
)
1933 token
->type
= OP_ALT
;
1936 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1937 token
->type
= OP_ALT
;
1940 token
->type
= OP_DUP_ASTERISK
;
1943 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1944 token
->type
= OP_DUP_PLUS
;
1947 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1948 token
->type
= OP_DUP_QUESTION
;
1951 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1952 token
->type
= OP_OPEN_DUP_NUM
;
1955 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1956 token
->type
= OP_CLOSE_DUP_NUM
;
1959 if (syntax
& RE_NO_BK_PARENS
)
1960 token
->type
= OP_OPEN_SUBEXP
;
1963 if (syntax
& RE_NO_BK_PARENS
)
1964 token
->type
= OP_CLOSE_SUBEXP
;
1967 token
->type
= OP_OPEN_BRACKET
;
1970 token
->type
= OP_PERIOD
;
1973 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1974 re_string_cur_idx (input
) != 0)
1976 char prev
= re_string_peek_byte (input
, -1);
1977 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1980 token
->type
= ANCHOR
;
1981 token
->opr
.ctx_type
= LINE_FIRST
;
1984 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1985 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1988 re_string_skip_bytes (input
, 1);
1989 peek_token (&next
, input
, syntax
);
1990 re_string_skip_bytes (input
, -1);
1991 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1994 token
->type
= ANCHOR
;
1995 token
->opr
.ctx_type
= LINE_LAST
;
2003 /* Peek a token from INPUT, and return the length of the token.
2004 We must not use this function out of bracket expressions. */
2008 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2011 if (re_string_eoi (input
))
2013 token
->type
= END_OF_RE
;
2016 c
= re_string_peek_byte (input
, 0);
2019 #ifdef RE_ENABLE_I18N
2020 if (input
->mb_cur_max
> 1 &&
2021 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2023 token
->type
= CHARACTER
;
2026 #endif /* RE_ENABLE_I18N */
2028 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2029 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2031 /* In this case, '\' escape a character. */
2033 re_string_skip_bytes (input
, 1);
2034 c2
= re_string_peek_byte (input
, 0);
2036 token
->type
= CHARACTER
;
2039 if (c
== '[') /* '[' is a special char in a bracket exps. */
2043 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2044 c2
= re_string_peek_byte (input
, 1);
2052 token
->type
= OP_OPEN_COLL_ELEM
;
2055 token
->type
= OP_OPEN_EQUIV_CLASS
;
2058 if (syntax
& RE_CHAR_CLASSES
)
2060 token
->type
= OP_OPEN_CHAR_CLASS
;
2063 /* else fall through. */
2065 token
->type
= CHARACTER
;
2075 token
->type
= OP_CHARSET_RANGE
;
2078 token
->type
= OP_CLOSE_BRACKET
;
2081 token
->type
= OP_NON_MATCH_LIST
;
2084 token
->type
= CHARACTER
;
2089 /* Functions for parser. */
2091 /* Entry point of the parser.
2092 Parse the regular expression REGEXP and return the structure tree.
2093 If an error is occured, ERR is set by error code, and return NULL.
2094 This function build the following tree, from regular expression <reg_exp>:
2100 CAT means concatenation.
2101 EOR means end of regular expression. */
2104 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2107 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2108 bin_tree_t
*tree
, *eor
, *root
;
2109 re_token_t current_token
;
2110 dfa
->syntax
= syntax
;
2111 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2112 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2113 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2115 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2117 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2120 if (BE (eor
== NULL
|| root
== NULL
, 0))
2128 /* This function build the following tree, from regular expression
2129 <branch1>|<branch2>:
2135 ALT means alternative, which represents the operator `|'. */
2138 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2139 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2141 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2142 bin_tree_t
*tree
, *branch
= NULL
;
2143 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2144 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2147 while (token
->type
== OP_ALT
)
2149 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2150 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2151 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2153 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2154 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2159 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2160 if (BE (tree
== NULL
, 0))
2169 /* This function build the following tree, from regular expression
2176 CAT means concatenation. */
2179 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2180 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2182 bin_tree_t
*tree
, *exp
;
2183 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2184 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2185 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2188 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2189 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2191 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2192 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2195 postorder (tree
, free_tree
, NULL
);
2198 if (tree
!= NULL
&& exp
!= NULL
)
2200 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2201 if (newtree
== NULL
)
2203 postorder (exp
, free_tree
, NULL
);
2204 postorder (tree
, free_tree
, NULL
);
2210 else if (tree
== NULL
)
2212 /* Otherwise exp == NULL, we don't need to create new tree. */
2217 /* This function build the following tree, from regular expression a*:
2224 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2225 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2227 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2229 switch (token
->type
)
2232 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2233 if (BE (tree
== NULL
, 0))
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa
->mb_cur_max
> 1)
2241 while (!re_string_eoi (regexp
)
2242 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2244 bin_tree_t
*mbc_remain
;
2245 fetch_token (token
, regexp
, syntax
);
2246 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2247 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2248 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2257 case OP_OPEN_SUBEXP
:
2258 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2259 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2262 case OP_OPEN_BRACKET
:
2263 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2264 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2268 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2273 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2274 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2275 if (BE (tree
== NULL
, 0))
2281 dfa
->has_mb_node
= 1;
2283 case OP_OPEN_DUP_NUM
:
2284 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2290 case OP_DUP_ASTERISK
:
2292 case OP_DUP_QUESTION
:
2293 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2298 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2300 fetch_token (token
, regexp
, syntax
);
2301 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP
:
2305 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2306 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM
:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token
->type
= CHARACTER
;
2317 /* mb_partial and word_char bits should be initialized already
2319 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2320 if (BE (tree
== NULL
, 0))
2327 if ((token
->opr
.ctx_type
2328 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2329 && dfa
->word_ops_used
== 0)
2330 init_word_char (dfa
);
2331 if (token
->opr
.ctx_type
== WORD_DELIM
2332 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2334 bin_tree_t
*tree_first
, *tree_last
;
2335 if (token
->opr
.ctx_type
== WORD_DELIM
)
2337 token
->opr
.ctx_type
= WORD_FIRST
;
2338 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2339 token
->opr
.ctx_type
= WORD_LAST
;
2343 token
->opr
.ctx_type
= INSIDE_WORD
;
2344 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2345 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2347 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2348 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2349 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2357 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2358 if (BE (tree
== NULL
, 0))
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token
, regexp
, syntax
);
2371 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2372 if (BE (tree
== NULL
, 0))
2377 if (dfa
->mb_cur_max
> 1)
2378 dfa
->has_mb_node
= 1;
2382 tree
= build_charclass_op (dfa
, regexp
->trans
,
2383 (const unsigned char *) "alnum",
2384 (const unsigned char *) "_",
2385 token
->type
== OP_NOTWORD
, err
);
2386 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2391 tree
= build_charclass_op (dfa
, regexp
->trans
,
2392 (const unsigned char *) "space",
2393 (const unsigned char *) "",
2394 token
->type
== OP_NOTSPACE
, err
);
2395 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2405 /* Must not happen? */
2411 fetch_token (token
, regexp
, syntax
);
2413 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2414 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2416 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2417 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2421 && (token
->type
== OP_DUP_ASTERISK
2422 || token
->type
== OP_OPEN_DUP_NUM
))
2432 /* This function build the following tree, from regular expression
2440 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2441 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2443 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2446 cur_nsub
= preg
->re_nsub
++;
2448 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2450 /* The subexpression may be a null string. */
2451 if (token
->type
== OP_CLOSE_SUBEXP
)
2455 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2456 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2459 postorder (tree
, free_tree
, NULL
);
2462 if (BE (*err
!= REG_NOERROR
, 0))
2466 if (cur_nsub
<= '9' - '1')
2467 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2469 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2470 if (BE (tree
== NULL
, 0))
2475 tree
->token
.opr
.idx
= cur_nsub
;
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2482 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2483 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2485 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2486 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2487 re_token_t start_token
= *token
;
2489 if (token
->type
== OP_OPEN_DUP_NUM
)
2492 start
= fetch_number (regexp
, token
, syntax
);
2495 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2496 start
= 0; /* We treat "{,m}" as "{0,m}". */
2499 *err
= REG_BADBR
; /* <re>{} is invalid. */
2503 if (BE (start
!= -2, 1))
2505 /* We treat "{n}" as "{n,n}". */
2506 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2507 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2508 ? fetch_number (regexp
, token
, syntax
) : -2));
2510 if (BE (start
== -2 || end
== -2, 0))
2512 /* Invalid sequence. */
2513 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2515 if (token
->type
== END_OF_RE
)
2523 /* If the syntax bit is set, rollback. */
2524 re_string_set_index (regexp
, start_idx
);
2525 *token
= start_token
;
2526 token
->type
= CHARACTER
;
2527 /* mb_partial and word_char bits should be already initialized by
2532 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2534 /* First number greater than second. */
2541 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2542 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2545 fetch_token (token
, regexp
, syntax
);
2547 if (BE (elem
== NULL
, 0))
2549 if (BE (start
== 0 && end
== 0, 0))
2551 postorder (elem
, free_tree
, NULL
);
2555 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2556 if (BE (start
> 0, 0))
2559 for (i
= 2; i
<= start
; ++i
)
2561 elem
= duplicate_tree (elem
, dfa
);
2562 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2563 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2564 goto parse_dup_op_espace
;
2570 /* Duplicate ELEM before it is marked optional. */
2571 elem
= duplicate_tree (elem
, dfa
);
2577 if (elem
->token
.type
== SUBEXP
)
2578 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2580 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2581 if (BE (tree
== NULL
, 0))
2582 goto parse_dup_op_espace
;
2584 /* This loop is actually executed only when end != -1,
2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586 already created the start+1-th copy. */
2587 for (i
= start
+ 2; i
<= end
; ++i
)
2589 elem
= duplicate_tree (elem
, dfa
);
2590 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2591 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2592 goto parse_dup_op_espace
;
2594 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2595 if (BE (tree
== NULL
, 0))
2596 goto parse_dup_op_espace
;
2600 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2604 parse_dup_op_espace
:
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610 I'm not sure, but maybe enough. */
2611 #define BRACKET_NAME_BUF_SIZE 32
2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615 Build the range expression which starts from START_ELEM, and ends
2616 at END_ELEM. The result are written to MBCSET and SBCSET.
2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618 mbcset->range_ends, is a pointer argument sinse we may
2621 static reg_errcode_t
2623 # ifdef RE_ENABLE_I18N
2624 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2625 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2626 # else /* not RE_ENABLE_I18N */
2627 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2628 bracket_elem_t
*end_elem
)
2629 # endif /* not RE_ENABLE_I18N */
2631 unsigned int start_ch
, end_ch
;
2632 /* Equivalence Classes and Character Classes can't be a range start/end. */
2633 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2634 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2638 /* We can handle no multi character collating elements without libc
2640 if (BE ((start_elem
->type
== COLL_SYM
2641 && strlen ((char *) start_elem
->opr
.name
) > 1)
2642 || (end_elem
->type
== COLL_SYM
2643 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2644 return REG_ECOLLATE
;
2646 # ifdef RE_ENABLE_I18N
2651 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2653 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2654 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2656 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2657 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2659 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2660 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2661 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2662 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2663 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2664 return REG_ECOLLATE
;
2665 cmp_buf
[0] = start_wc
;
2666 cmp_buf
[4] = end_wc
;
2667 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2670 /* Got valid collation sequence values, add them as a new entry.
2671 However, for !_LIBC we have no collation elements: if the
2672 character set is single byte, the single byte character set
2673 that we build below suffices. parse_bracket_exp passes
2674 no MBCSET if dfa->mb_cur_max == 1. */
2677 /* Check the space of the arrays. */
2678 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2680 /* There is not enough space, need realloc. */
2681 wchar_t *new_array_start
, *new_array_end
;
2684 /* +1 in case of mbcset->nranges is 0. */
2685 new_nranges
= 2 * mbcset
->nranges
+ 1;
2686 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687 are NULL if *range_alloc == 0. */
2688 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2690 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2693 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2696 mbcset
->range_starts
= new_array_start
;
2697 mbcset
->range_ends
= new_array_end
;
2698 *range_alloc
= new_nranges
;
2701 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2702 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2705 /* Build the table for single byte characters. */
2706 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2709 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2710 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2711 bitset_set (sbcset
, wc
);
2714 # else /* not RE_ENABLE_I18N */
2717 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2718 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2720 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2721 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2723 if (start_ch
> end_ch
)
2725 /* Build the table for single byte characters. */
2726 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2727 if (start_ch
<= ch
&& ch
<= end_ch
)
2728 bitset_set (sbcset
, ch
);
2730 # endif /* not RE_ENABLE_I18N */
2733 #endif /* not _LIBC */
2736 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737 Build the collating element which is represented by NAME.
2738 The result are written to MBCSET and SBCSET.
2739 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740 pointer argument since we may update it. */
2742 static reg_errcode_t
2744 # ifdef RE_ENABLE_I18N
2745 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2746 int *coll_sym_alloc
, const unsigned char *name
)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2749 # endif /* not RE_ENABLE_I18N */
2751 size_t name_len
= strlen ((const char *) name
);
2752 if (BE (name_len
!= 1, 0))
2753 return REG_ECOLLATE
;
2756 bitset_set (sbcset
, name
[0]);
2760 #endif /* not _LIBC */
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2766 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2767 reg_syntax_t syntax
, reg_errcode_t
*err
)
2770 const unsigned char *collseqmb
;
2771 const char *collseqwc
;
2774 const int32_t *symb_table
;
2775 const unsigned char *extra
;
2777 /* Local function for parse_bracket_exp used in _LIBC environement.
2778 Seek the collating symbol entry correspondings to NAME.
2779 Return the index of the symbol in the SYMB_TABLE,
2780 or -1 if not found. */
2783 __attribute ((always_inline
))
2784 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2788 for (elem
= 0; elem
< table_size
; elem
++)
2789 if (symb_table
[2 * elem
] != 0)
2791 int32_t idx
= symb_table
[2 * elem
+ 1];
2792 /* Skip the name of collating element name. */
2793 idx
+= 1 + extra
[idx
];
2794 if (/* Compare the length of the name. */
2795 name_len
== extra
[idx
]
2796 /* Compare the name. */
2797 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2798 /* Yep, this is the entry. */
2804 /* Local function for parse_bracket_exp used in _LIBC environment.
2805 Look up the collation sequence value of BR_ELEM.
2806 Return the value if succeeded, UINT_MAX otherwise. */
2808 auto inline unsigned int
2809 __attribute ((always_inline
))
2810 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2812 if (br_elem
->type
== SB_CHAR
)
2815 if (MB_CUR_MAX == 1)
2818 return collseqmb
[br_elem
->opr
.ch
];
2821 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2822 return __collseq_table_lookup (collseqwc
, wc
);
2825 else if (br_elem
->type
== MB_CHAR
)
2828 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2830 else if (br_elem
->type
== COLL_SYM
)
2832 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2836 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2840 /* We found the entry. */
2841 idx
= symb_table
[2 * elem
+ 1];
2842 /* Skip the name of collating element name. */
2843 idx
+= 1 + extra
[idx
];
2844 /* Skip the byte sequence of the collating element. */
2845 idx
+= 1 + extra
[idx
];
2846 /* Adjust for the alignment. */
2847 idx
= (idx
+ 3) & ~3;
2848 /* Skip the multibyte collation sequence value. */
2849 idx
+= sizeof (unsigned int);
2850 /* Skip the wide char sequence of the collating element. */
2851 idx
+= sizeof (unsigned int) *
2852 (1 + *(unsigned int *) (extra
+ idx
));
2853 /* Return the collation sequence value. */
2854 return *(unsigned int *) (extra
+ idx
);
2856 else if (sym_name_len
== 1)
2858 /* No valid character. Match it as a single byte
2860 return collseqmb
[br_elem
->opr
.name
[0]];
2863 else if (sym_name_len
== 1)
2864 return collseqmb
[br_elem
->opr
.name
[0]];
2869 /* Local function for parse_bracket_exp used in _LIBC environement.
2870 Build the range expression which starts from START_ELEM, and ends
2871 at END_ELEM. The result are written to MBCSET and SBCSET.
2872 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873 mbcset->range_ends, is a pointer argument sinse we may
2876 auto inline reg_errcode_t
2877 __attribute ((always_inline
))
2878 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2879 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2882 uint32_t start_collseq
;
2883 uint32_t end_collseq
;
2885 /* Equivalence Classes and Character Classes can't be a range
2887 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2888 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2892 start_collseq
= lookup_collation_sequence_value (start_elem
);
2893 end_collseq
= lookup_collation_sequence_value (end_elem
);
2894 /* Check start/end collation sequence values. */
2895 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2896 return REG_ECOLLATE
;
2897 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2900 /* Got valid collation sequence values, add them as a new entry.
2901 However, if we have no collation elements, and the character set
2902 is single byte, the single byte character set that we
2903 build below suffices. */
2904 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2906 /* Check the space of the arrays. */
2907 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2909 /* There is not enough space, need realloc. */
2910 uint32_t *new_array_start
;
2911 uint32_t *new_array_end
;
2914 /* +1 in case of mbcset->nranges is 0. */
2915 new_nranges
= 2 * mbcset
->nranges
+ 1;
2916 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2918 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2921 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2924 mbcset
->range_starts
= new_array_start
;
2925 mbcset
->range_ends
= new_array_end
;
2926 *range_alloc
= new_nranges
;
2929 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2930 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2933 /* Build the table for single byte characters. */
2934 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2936 uint32_t ch_collseq
;
2938 if (MB_CUR_MAX == 1)
2941 ch_collseq
= collseqmb
[ch
];
2943 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2944 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2945 bitset_set (sbcset
, ch
);
2950 /* Local function for parse_bracket_exp used in _LIBC environement.
2951 Build the collating element which is represented by NAME.
2952 The result are written to MBCSET and SBCSET.
2953 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954 pointer argument sinse we may update it. */
2956 auto inline reg_errcode_t
2957 __attribute ((always_inline
))
2958 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2959 int *coll_sym_alloc
, const unsigned char *name
)
2962 size_t name_len
= strlen ((const char *) name
);
2965 elem
= seek_collating_symbol_entry (name
, name_len
);
2968 /* We found the entry. */
2969 idx
= symb_table
[2 * elem
+ 1];
2970 /* Skip the name of collating element name. */
2971 idx
+= 1 + extra
[idx
];
2973 else if (name_len
== 1)
2975 /* No valid character, treat it as a normal
2977 bitset_set (sbcset
, name
[0]);
2981 return REG_ECOLLATE
;
2983 /* Got valid collation sequence, add it as a new entry. */
2984 /* Check the space of the arrays. */
2985 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2987 /* Not enough, realloc it. */
2988 /* +1 in case of mbcset->ncoll_syms is 0. */
2989 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2990 /* Use realloc since mbcset->coll_syms is NULL
2992 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2993 new_coll_sym_alloc
);
2994 if (BE (new_coll_syms
== NULL
, 0))
2996 mbcset
->coll_syms
= new_coll_syms
;
2997 *coll_sym_alloc
= new_coll_sym_alloc
;
2999 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3004 if (BE (name_len
!= 1, 0))
3005 return REG_ECOLLATE
;
3008 bitset_set (sbcset
, name
[0]);
3015 re_token_t br_token
;
3016 re_bitset_ptr_t sbcset
;
3017 #ifdef RE_ENABLE_I18N
3018 re_charset_t
*mbcset
;
3019 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3020 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3021 #endif /* not RE_ENABLE_I18N */
3023 bin_tree_t
*work_tree
;
3025 int first_round
= 1;
3027 collseqmb
= (const unsigned char *)
3028 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3029 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3035 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3036 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3037 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3038 _NL_COLLATE_SYMB_TABLEMB
);
3039 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3040 _NL_COLLATE_SYMB_EXTRAMB
);
3043 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3044 #ifdef RE_ENABLE_I18N
3045 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3046 #endif /* RE_ENABLE_I18N */
3047 #ifdef RE_ENABLE_I18N
3048 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3050 if (BE (sbcset
== NULL
, 0))
3051 #endif /* RE_ENABLE_I18N */
3054 #ifdef RE_ENABLE_I18N
3061 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3062 if (BE (token
->type
== END_OF_RE
, 0))
3065 goto parse_bracket_exp_free_return
;
3067 if (token
->type
== OP_NON_MATCH_LIST
)
3069 #ifdef RE_ENABLE_I18N
3070 mbcset
->non_match
= 1;
3071 #endif /* not RE_ENABLE_I18N */
3073 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3074 bitset_set (sbcset
, '\n');
3075 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3076 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3077 if (BE (token
->type
== END_OF_RE
, 0))
3080 goto parse_bracket_exp_free_return
;
3084 /* We treat the first ']' as a normal character. */
3085 if (token
->type
== OP_CLOSE_BRACKET
)
3086 token
->type
= CHARACTER
;
3090 bracket_elem_t start_elem
, end_elem
;
3091 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3092 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3094 int token_len2
= 0, is_range_exp
= 0;
3097 start_elem
.opr
.name
= start_name_buf
;
3098 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3099 syntax
, first_round
);
3100 if (BE (ret
!= REG_NOERROR
, 0))
3103 goto parse_bracket_exp_free_return
;
3107 /* Get information about the next token. We need it in any case. */
3108 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3110 /* Do not check for ranges if we know they are not allowed. */
3111 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3113 if (BE (token
->type
== END_OF_RE
, 0))
3116 goto parse_bracket_exp_free_return
;
3118 if (token
->type
== OP_CHARSET_RANGE
)
3120 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3121 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3122 if (BE (token2
.type
== END_OF_RE
, 0))
3125 goto parse_bracket_exp_free_return
;
3127 if (token2
.type
== OP_CLOSE_BRACKET
)
3129 /* We treat the last '-' as a normal character. */
3130 re_string_skip_bytes (regexp
, -token_len
);
3131 token
->type
= CHARACTER
;
3138 if (is_range_exp
== 1)
3140 end_elem
.opr
.name
= end_name_buf
;
3141 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3143 if (BE (ret
!= REG_NOERROR
, 0))
3146 goto parse_bracket_exp_free_return
;
3149 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3152 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3153 &start_elem
, &end_elem
);
3155 # ifdef RE_ENABLE_I18N
3156 *err
= build_range_exp (sbcset
,
3157 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3158 &range_alloc
, &start_elem
, &end_elem
);
3160 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3162 #endif /* RE_ENABLE_I18N */
3163 if (BE (*err
!= REG_NOERROR
, 0))
3164 goto parse_bracket_exp_free_return
;
3168 switch (start_elem
.type
)
3171 bitset_set (sbcset
, start_elem
.opr
.ch
);
3173 #ifdef RE_ENABLE_I18N
3175 /* Check whether the array has enough space. */
3176 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3178 wchar_t *new_mbchars
;
3179 /* Not enough, realloc it. */
3180 /* +1 in case of mbcset->nmbchars is 0. */
3181 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3182 /* Use realloc since array is NULL if *alloc == 0. */
3183 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3185 if (BE (new_mbchars
== NULL
, 0))
3186 goto parse_bracket_exp_espace
;
3187 mbcset
->mbchars
= new_mbchars
;
3189 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3191 #endif /* RE_ENABLE_I18N */
3193 *err
= build_equiv_class (sbcset
,
3194 #ifdef RE_ENABLE_I18N
3195 mbcset
, &equiv_class_alloc
,
3196 #endif /* RE_ENABLE_I18N */
3197 start_elem
.opr
.name
);
3198 if (BE (*err
!= REG_NOERROR
, 0))
3199 goto parse_bracket_exp_free_return
;
3202 *err
= build_collating_symbol (sbcset
,
3203 #ifdef RE_ENABLE_I18N
3204 mbcset
, &coll_sym_alloc
,
3205 #endif /* RE_ENABLE_I18N */
3206 start_elem
.opr
.name
);
3207 if (BE (*err
!= REG_NOERROR
, 0))
3208 goto parse_bracket_exp_free_return
;
3211 *err
= build_charclass (regexp
->trans
, sbcset
,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset
, &char_class_alloc
,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem
.opr
.name
, syntax
);
3216 if (BE (*err
!= REG_NOERROR
, 0))
3217 goto parse_bracket_exp_free_return
;
3224 if (BE (token
->type
== END_OF_RE
, 0))
3227 goto parse_bracket_exp_free_return
;
3229 if (token
->type
== OP_CLOSE_BRACKET
)
3233 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3235 /* If it is non-matching list. */
3237 bitset_not (sbcset
);
3239 #ifdef RE_ENABLE_I18N
3240 /* Ensure only single byte characters are set. */
3241 if (dfa
->mb_cur_max
> 1)
3242 bitset_mask (sbcset
, dfa
->sb_char
);
3244 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3245 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3246 || mbcset
->non_match
)))
3248 bin_tree_t
*mbc_tree
;
3250 /* Build a tree for complex bracket. */
3251 dfa
->has_mb_node
= 1;
3252 br_token
.type
= COMPLEX_BRACKET
;
3253 br_token
.opr
.mbcset
= mbcset
;
3254 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3255 if (BE (mbc_tree
== NULL
, 0))
3256 goto parse_bracket_exp_espace
;
3257 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3258 if (sbcset
[sbc_idx
])
3260 /* If there are no bits set in sbcset, there is no point
3261 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3262 if (sbc_idx
< BITSET_WORDS
)
3264 /* Build a tree for simple bracket. */
3265 br_token
.type
= SIMPLE_BRACKET
;
3266 br_token
.opr
.sbcset
= sbcset
;
3267 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3268 if (BE (work_tree
== NULL
, 0))
3269 goto parse_bracket_exp_espace
;
3271 /* Then join them by ALT node. */
3272 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3273 if (BE (work_tree
== NULL
, 0))
3274 goto parse_bracket_exp_espace
;
3279 work_tree
= mbc_tree
;
3283 #endif /* not RE_ENABLE_I18N */
3285 #ifdef RE_ENABLE_I18N
3286 free_charset (mbcset
);
3288 /* Build a tree for simple bracket. */
3289 br_token
.type
= SIMPLE_BRACKET
;
3290 br_token
.opr
.sbcset
= sbcset
;
3291 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3292 if (BE (work_tree
== NULL
, 0))
3293 goto parse_bracket_exp_espace
;
3297 parse_bracket_exp_espace
:
3299 parse_bracket_exp_free_return
:
3301 #ifdef RE_ENABLE_I18N
3302 free_charset (mbcset
);
3303 #endif /* RE_ENABLE_I18N */
3307 /* Parse an element in the bracket expression. */
3309 static reg_errcode_t
3310 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3311 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3312 reg_syntax_t syntax
, int accept_hyphen
)
3314 #ifdef RE_ENABLE_I18N
3316 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3317 if (cur_char_size
> 1)
3319 elem
->type
= MB_CHAR
;
3320 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3321 re_string_skip_bytes (regexp
, cur_char_size
);
3324 #endif /* RE_ENABLE_I18N */
3325 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3326 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3327 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3328 return parse_bracket_symbol (elem
, regexp
, token
);
3329 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3331 /* A '-' must only appear as anything but a range indicator before
3332 the closing bracket. Everything else is an error. */
3334 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3335 if (token2
.type
!= OP_CLOSE_BRACKET
)
3336 /* The actual error value is not standardized since this whole
3337 case is undefined. But ERANGE makes good sense. */
3340 elem
->type
= SB_CHAR
;
3341 elem
->opr
.ch
= token
->opr
.c
;
3345 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3346 such as [:<character_class>:], [.<collating_element>.], and
3347 [=<equivalent_class>=]. */
3349 static reg_errcode_t
3350 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3353 unsigned char ch
, delim
= token
->opr
.c
;
3355 if (re_string_eoi(regexp
))
3359 if (i
>= BRACKET_NAME_BUF_SIZE
)
3361 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3362 ch
= re_string_fetch_byte_case (regexp
);
3364 ch
= re_string_fetch_byte (regexp
);
3365 if (re_string_eoi(regexp
))
3367 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3369 elem
->opr
.name
[i
] = ch
;
3371 re_string_skip_bytes (regexp
, 1);
3372 elem
->opr
.name
[i
] = '\0';
3373 switch (token
->type
)
3375 case OP_OPEN_COLL_ELEM
:
3376 elem
->type
= COLL_SYM
;
3378 case OP_OPEN_EQUIV_CLASS
:
3379 elem
->type
= EQUIV_CLASS
;
3381 case OP_OPEN_CHAR_CLASS
:
3382 elem
->type
= CHAR_CLASS
;
3390 /* Helper function for parse_bracket_exp.
3391 Build the equivalence class which is represented by NAME.
3392 The result are written to MBCSET and SBCSET.
3393 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394 is a pointer argument sinse we may update it. */
3396 static reg_errcode_t
3397 #ifdef RE_ENABLE_I18N
3398 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3399 int *equiv_class_alloc
, const unsigned char *name
)
3400 #else /* not RE_ENABLE_I18N */
3401 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3402 #endif /* not RE_ENABLE_I18N */
3405 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3408 const int32_t *table
, *indirect
;
3409 const unsigned char *weights
, *extra
, *cp
;
3410 unsigned char char_buf
[2];
3414 /* This #include defines a local function! */
3415 # include <locale/weight.h>
3416 /* Calculate the index for equivalence class. */
3418 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3419 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3420 _NL_COLLATE_WEIGHTMB
);
3421 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3422 _NL_COLLATE_EXTRAMB
);
3423 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3424 _NL_COLLATE_INDIRECTMB
);
3425 idx1
= findidx (&cp
, -1);
3426 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3427 /* This isn't a valid character. */
3428 return REG_ECOLLATE
;
3430 /* Build single byte matcing table for this equivalence class. */
3431 len
= weights
[idx1
& 0xffffff];
3432 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3436 idx2
= findidx (&cp
, 1);
3441 /* This isn't a valid character. */
3443 /* Compare only if the length matches and the collation rule
3444 index is the same. */
3445 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3449 while (cnt
<= len
&&
3450 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3451 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3455 bitset_set (sbcset
, ch
);
3458 /* Check whether the array has enough space. */
3459 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3461 /* Not enough, realloc it. */
3462 /* +1 in case of mbcset->nequiv_classes is 0. */
3463 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3464 /* Use realloc since the array is NULL if *alloc == 0. */
3465 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3467 new_equiv_class_alloc
);
3468 if (BE (new_equiv_classes
== NULL
, 0))
3470 mbcset
->equiv_classes
= new_equiv_classes
;
3471 *equiv_class_alloc
= new_equiv_class_alloc
;
3473 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3478 if (BE (strlen ((const char *) name
) != 1, 0))
3479 return REG_ECOLLATE
;
3480 bitset_set (sbcset
, *name
);
3485 /* Helper function for parse_bracket_exp.
3486 Build the character class which is represented by NAME.
3487 The result are written to MBCSET and SBCSET.
3488 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489 is a pointer argument sinse we may update it. */
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
3493 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3494 re_charset_t
*mbcset
, int *char_class_alloc
,
3495 const unsigned char *class_name
, reg_syntax_t syntax
)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3498 const unsigned char *class_name
, reg_syntax_t syntax
)
3499 #endif /* not RE_ENABLE_I18N */
3502 const char *name
= (const char *) class_name
;
3504 /* In case of REG_ICASE "upper" and "lower" match the both of
3505 upper and lower cases. */
3506 if ((syntax
& RE_ICASE
)
3507 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3510 #ifdef RE_ENABLE_I18N
3511 /* Check the space of the arrays. */
3512 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3514 /* Not enough, realloc it. */
3515 /* +1 in case of mbcset->nchar_classes is 0. */
3516 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3517 /* Use realloc since array is NULL if *alloc == 0. */
3518 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3519 new_char_class_alloc
);
3520 if (BE (new_char_classes
== NULL
, 0))
3522 mbcset
->char_classes
= new_char_classes
;
3523 *char_class_alloc
= new_char_class_alloc
;
3525 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3526 #endif /* RE_ENABLE_I18N */
3528 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3530 if (BE (trans != NULL, 0)) \
3532 for (i = 0; i < SBC_MAX; ++i) \
3533 if (ctype_func (i)) \
3534 bitset_set (sbcset, trans[i]); \
3538 for (i = 0; i < SBC_MAX; ++i) \
3539 if (ctype_func (i)) \
3540 bitset_set (sbcset, i); \
3544 if (strcmp (name
, "alnum") == 0)
3545 BUILD_CHARCLASS_LOOP (isalnum
);
3546 else if (strcmp (name
, "cntrl") == 0)
3547 BUILD_CHARCLASS_LOOP (iscntrl
);
3548 else if (strcmp (name
, "lower") == 0)
3549 BUILD_CHARCLASS_LOOP (islower
);
3550 else if (strcmp (name
, "space") == 0)
3551 BUILD_CHARCLASS_LOOP (isspace
);
3552 else if (strcmp (name
, "alpha") == 0)
3553 BUILD_CHARCLASS_LOOP (isalpha
);
3554 else if (strcmp (name
, "digit") == 0)
3555 BUILD_CHARCLASS_LOOP (isdigit
);
3556 else if (strcmp (name
, "print") == 0)
3557 BUILD_CHARCLASS_LOOP (isprint
);
3558 else if (strcmp (name
, "upper") == 0)
3559 BUILD_CHARCLASS_LOOP (isupper
);
3560 else if (strcmp (name
, "blank") == 0)
3561 BUILD_CHARCLASS_LOOP (isblank
);
3562 else if (strcmp (name
, "graph") == 0)
3563 BUILD_CHARCLASS_LOOP (isgraph
);
3564 else if (strcmp (name
, "punct") == 0)
3565 BUILD_CHARCLASS_LOOP (ispunct
);
3566 else if (strcmp (name
, "xdigit") == 0)
3567 BUILD_CHARCLASS_LOOP (isxdigit
);
3575 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3576 const unsigned char *class_name
,
3577 const unsigned char *extra
, int non_match
,
3580 re_bitset_ptr_t sbcset
;
3581 #ifdef RE_ENABLE_I18N
3582 re_charset_t
*mbcset
;
3584 #endif /* not RE_ENABLE_I18N */
3586 re_token_t br_token
;
3589 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3590 #ifdef RE_ENABLE_I18N
3591 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3592 #endif /* RE_ENABLE_I18N */
3594 #ifdef RE_ENABLE_I18N
3595 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3596 #else /* not RE_ENABLE_I18N */
3597 if (BE (sbcset
== NULL
, 0))
3598 #endif /* not RE_ENABLE_I18N */
3606 #ifdef RE_ENABLE_I18N
3607 mbcset
->non_match
= 1;
3608 #endif /* not RE_ENABLE_I18N */
3611 /* We don't care the syntax in this case. */
3612 ret
= build_charclass (trans
, sbcset
,
3613 #ifdef RE_ENABLE_I18N
3615 #endif /* RE_ENABLE_I18N */
3618 if (BE (ret
!= REG_NOERROR
, 0))
3621 #ifdef RE_ENABLE_I18N
3622 free_charset (mbcset
);
3623 #endif /* RE_ENABLE_I18N */
3627 /* \w match '_' also. */
3628 for (; *extra
; extra
++)
3629 bitset_set (sbcset
, *extra
);
3631 /* If it is non-matching list. */
3633 bitset_not (sbcset
);
3635 #ifdef RE_ENABLE_I18N
3636 /* Ensure only single byte characters are set. */
3637 if (dfa
->mb_cur_max
> 1)
3638 bitset_mask (sbcset
, dfa
->sb_char
);
3641 /* Build a tree for simple bracket. */
3642 br_token
.type
= SIMPLE_BRACKET
;
3643 br_token
.opr
.sbcset
= sbcset
;
3644 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3645 if (BE (tree
== NULL
, 0))
3646 goto build_word_op_espace
;
3648 #ifdef RE_ENABLE_I18N
3649 if (dfa
->mb_cur_max
> 1)
3651 bin_tree_t
*mbc_tree
;
3652 /* Build a tree for complex bracket. */
3653 br_token
.type
= COMPLEX_BRACKET
;
3654 br_token
.opr
.mbcset
= mbcset
;
3655 dfa
->has_mb_node
= 1;
3656 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3657 if (BE (mbc_tree
== NULL
, 0))
3658 goto build_word_op_espace
;
3659 /* Then join them by ALT node. */
3660 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3661 if (BE (mbc_tree
!= NULL
, 1))
3666 free_charset (mbcset
);
3669 #else /* not RE_ENABLE_I18N */
3671 #endif /* not RE_ENABLE_I18N */
3673 build_word_op_espace
:
3675 #ifdef RE_ENABLE_I18N
3676 free_charset (mbcset
);
3677 #endif /* RE_ENABLE_I18N */
3682 /* This is intended for the expressions like "a{1,3}".
3683 Fetch a number from `input', and return the number.
3684 Return -1, if the number field is empty like "{,1}".
3685 Return -2, If an error is occured. */
3688 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3694 fetch_token (token
, input
, syntax
);
3696 if (BE (token
->type
== END_OF_RE
, 0))
3698 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3700 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3701 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3702 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3707 #ifdef RE_ENABLE_I18N
3709 free_charset (re_charset_t
*cset
)
3711 re_free (cset
->mbchars
);
3713 re_free (cset
->coll_syms
);
3714 re_free (cset
->equiv_classes
);
3715 re_free (cset
->range_starts
);
3716 re_free (cset
->range_ends
);
3718 re_free (cset
->char_classes
);
3721 #endif /* RE_ENABLE_I18N */
3723 /* Functions for binary tree operation. */
3725 /* Create a tree node. */
3728 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3729 re_token_type_t type
)
3733 return create_token_tree (dfa
, left
, right
, &t
);
3737 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3738 const re_token_t
*token
)
3741 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3743 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3745 if (storage
== NULL
)
3747 storage
->next
= dfa
->str_tree_storage
;
3748 dfa
->str_tree_storage
= storage
;
3749 dfa
->str_tree_storage_idx
= 0;
3751 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3753 tree
->parent
= NULL
;
3755 tree
->right
= right
;
3756 tree
->token
= *token
;
3757 tree
->token
.duplicated
= 0;
3758 tree
->token
.opt_subexp
= 0;
3761 tree
->node_idx
= -1;
3764 left
->parent
= tree
;
3766 right
->parent
= tree
;
3770 /* Mark the tree SRC as an optional subexpression.
3771 To be called from preorder or postorder. */
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3776 int idx
= (int) (long) extra
;
3777 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3778 node
->token
.opt_subexp
= 1;
3783 /* Free the allocated memory inside NODE. */
3786 free_token (re_token_t
*node
)
3788 #ifdef RE_ENABLE_I18N
3789 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3790 free_charset (node
->opr
.mbcset
);
3792 #endif /* RE_ENABLE_I18N */
3793 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3794 re_free (node
->opr
.sbcset
);
3797 /* Worker function for tree walking. Free the allocated memory inside NODE
3798 and its children. */
3800 static reg_errcode_t
3801 free_tree (void *extra
, bin_tree_t
*node
)
3803 free_token (&node
->token
);
3808 /* Duplicate the node SRC, and return new node. This is a preorder
3809 visit similar to the one implemented by the generic visitor, but
3810 we need more infrastructure to maintain two parallel trees --- so,
3811 it's easier to duplicate. */
3814 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3816 const bin_tree_t
*node
;
3817 bin_tree_t
*dup_root
;
3818 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3820 for (node
= root
; ; )
3822 /* Create a new tree and link it back to the current parent. */
3823 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3826 (*p_new
)->parent
= dup_node
;
3827 (*p_new
)->token
.duplicated
= 1;
3830 /* Go to the left node, or up and to the right. */
3834 p_new
= &dup_node
->left
;
3838 const bin_tree_t
*prev
= NULL
;
3839 while (node
->right
== prev
|| node
->right
== NULL
)
3842 node
= node
->parent
;
3843 dup_node
= dup_node
->parent
;
3848 p_new
= &dup_node
->right
;