1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
22 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
23 size_t length
, reg_syntax_t syntax
);
24 static void re_compile_fastmap_iter (regex_t
*bufp
,
25 const re_dfastate_t
*init_state
,
27 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
preorder (bin_tree_t
*root
,
38 reg_errcode_t (fn (void *, bin_tree_t
*)),
40 static reg_errcode_t
postorder (bin_tree_t
*root
,
41 reg_errcode_t (fn (void *, bin_tree_t
*)),
43 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
44 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
45 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
47 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
50 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
51 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
52 unsigned int constraint
);
53 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
54 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
56 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
57 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
59 static int peek_token (re_token_t
*token
, re_string_t
*input
,
60 reg_syntax_t syntax
) internal_function
;
61 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
62 reg_syntax_t syntax
, reg_errcode_t
*err
);
63 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
64 re_token_t
*token
, reg_syntax_t syntax
,
65 int nest
, reg_errcode_t
*err
);
66 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
67 re_token_t
*token
, reg_syntax_t syntax
,
68 int nest
, reg_errcode_t
*err
);
69 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
70 re_token_t
*token
, reg_syntax_t syntax
,
71 int nest
, reg_errcode_t
*err
);
72 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
73 re_token_t
*token
, reg_syntax_t syntax
,
74 int nest
, reg_errcode_t
*err
);
75 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
76 re_dfa_t
*dfa
, re_token_t
*token
,
77 reg_syntax_t syntax
, reg_errcode_t
*err
);
78 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
79 re_token_t
*token
, reg_syntax_t syntax
,
81 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
83 re_token_t
*token
, int token_len
,
87 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
91 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
93 int *equiv_class_alloc
,
94 const unsigned char *name
);
95 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
98 int *char_class_alloc
,
99 const unsigned char *class_name
,
100 reg_syntax_t syntax
);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
103 const unsigned char *name
);
104 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
106 const unsigned char *class_name
,
107 reg_syntax_t syntax
);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
110 RE_TRANSLATE_TYPE trans
,
111 const unsigned char *class_name
,
112 const unsigned char *extra
,
113 int non_match
, reg_errcode_t
*err
);
114 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
115 bin_tree_t
*left
, bin_tree_t
*right
,
116 re_token_type_t type
);
117 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
118 bin_tree_t
*left
, bin_tree_t
*right
,
119 const re_token_t
*token
);
120 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
121 static void free_token (re_token_t
*node
);
122 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
123 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 const char __re_error_msgid
[] attribute_hidden
=
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 const size_t __re_error_msgid_idx
[] attribute_hidden
=
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
215 re_compile_pattern (pattern
, length
, bufp
)
218 struct re_pattern_buffer
*bufp
;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
237 weak_alias (__re_compile_pattern
, re_compile_pattern
)
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options
;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (syntax
)
259 reg_syntax_t ret
= re_syntax_options
;
261 re_syntax_options
= syntax
;
265 weak_alias (__re_set_syntax
, re_set_syntax
)
269 re_compile_fastmap (bufp
)
270 struct re_pattern_buffer
*bufp
;
272 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
273 char *fastmap
= bufp
->fastmap
;
275 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
276 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_word
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_nl
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
283 bufp
->fastmap_accurate
= 1;
287 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
291 __attribute ((always_inline
))
292 re_set_fastmap (char *fastmap
, int icase
, int ch
)
296 fastmap
[tolower (ch
)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
303 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
306 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
308 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
309 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
311 int node
= init_state
->nodes
.elems
[node_cnt
];
312 re_token_type_t type
= dfa
->nodes
[node
].type
;
314 if (type
== CHARACTER
)
316 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
320 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
325 *p
++ = dfa
->nodes
[node
].opr
.c
;
326 while (++node
< dfa
->nodes_len
327 && dfa
->nodes
[node
].type
== CHARACTER
328 && dfa
->nodes
[node
].mb_partial
)
329 *p
++ = dfa
->nodes
[node
].opr
.c
;
330 memset (&state
, '\0', sizeof (state
));
331 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
333 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
335 re_set_fastmap (fastmap
, 0, buf
[0]);
339 else if (type
== SIMPLE_BRACKET
)
342 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
345 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
346 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
347 if (w
& ((bitset_word_t
) 1 << j
))
348 re_set_fastmap (fastmap
, icase
, ch
);
351 #ifdef RE_ENABLE_I18N
352 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
358 /* See if we have to try all bytes which start multiple collation
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
365 && (cset
->ncoll_syms
|| cset
->nranges
))
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa
->mb_cur_max
> 1
380 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
382 || cset
->nequiv_classes
390 memset (&mbs
, 0, sizeof (mbs
));
391 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
392 re_set_fastmap (fastmap
, false, (int) c
);
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i
= 0; i
< cset
->nmbchars
; ++i
)
404 memset (&state
, '\0', sizeof (state
));
405 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
406 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
407 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
409 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
411 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
416 #endif /* RE_ENABLE_I18N */
417 else if (type
== OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type
== OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type
== END_OF_RE
)
423 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
424 if (type
== END_OF_RE
)
425 bufp
->can_be_null
= 1;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 `buffer' to the compiled pattern;
438 `used' to the length of the compiled pattern;
439 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 `fastmap' to an allocated space for the fastmap;
444 `fastmap_accurate' to zero;
445 `re_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (preg
, pattern
, cflags
)
469 regex_t
*__restrict preg
;
470 const char *__restrict pattern
;
474 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
475 : RE_SYNTAX_POSIX_BASIC
);
481 /* Try to allocate space for the fastmap. */
482 preg
->fastmap
= re_malloc (char, SBC_MAX
);
483 if (BE (preg
->fastmap
== NULL
, 0))
486 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
488 /* If REG_NEWLINE is set, newlines are treated differently. */
489 if (cflags
& REG_NEWLINE
)
490 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
491 syntax
&= ~RE_DOT_NEWLINE
;
492 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
493 /* It also changes the matching behavior. */
494 preg
->newline_anchor
= 1;
497 preg
->newline_anchor
= 0;
498 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
499 preg
->translate
= NULL
;
501 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
503 /* POSIX doesn't distinguish between an unmatched open-group and an
504 unmatched close-group: both are REG_EPAREN. */
505 if (ret
== REG_ERPAREN
)
508 /* We have already checked preg->fastmap != NULL. */
509 if (BE (ret
== REG_NOERROR
, 1))
510 /* Compute the fastmap now, since regexec cannot modify the pattern
511 buffer. This function never fails in this implementation. */
512 (void) re_compile_fastmap (preg
);
515 /* Some error occurred while compiling the expression. */
516 re_free (preg
->fastmap
);
517 preg
->fastmap
= NULL
;
523 weak_alias (__regcomp
, regcomp
)
526 /* Returns a message corresponding to an error code, ERRCODE, returned
527 from either regcomp or regexec. We don't use PREG here. */
530 regerror (errcode
, preg
, errbuf
, errbuf_size
)
532 const regex_t
*__restrict preg
;
533 char *__restrict errbuf
;
540 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
541 / sizeof (__re_error_msgid_idx
[0])), 0))
542 /* Only error codes returned by the rest of the code should be passed
543 to this routine. If we are given anything else, or if other regex
544 code generates an invalid error code, then the program has a bug.
545 Dump core so we can fix it. */
548 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
550 msg_size
= strlen (msg
) + 1; /* Includes the null. */
552 if (BE (errbuf_size
!= 0, 1))
554 if (BE (msg_size
> errbuf_size
, 0))
556 #if defined HAVE_MEMPCPY || defined _LIBC
557 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
559 memcpy (errbuf
, msg
, errbuf_size
- 1);
560 errbuf
[errbuf_size
- 1] = 0;
564 memcpy (errbuf
, msg
, msg_size
);
570 weak_alias (__regerror
, regerror
)
574 #ifdef RE_ENABLE_I18N
575 /* This static array is used for the map to single-byte characters when
576 UTF-8 is used. Otherwise we would allocate memory just to initialize
577 it the same all the time. UTF-8 is the preferred encoding so this is
578 a worthwhile optimization. */
579 static const bitset_t utf8_sb_map
=
581 /* Set the first 128 bits. */
582 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
588 free_dfa_content (re_dfa_t
*dfa
)
593 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
594 free_token (dfa
->nodes
+ i
);
595 re_free (dfa
->nexts
);
596 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
598 if (dfa
->eclosures
!= NULL
)
599 re_node_set_free (dfa
->eclosures
+ i
);
600 if (dfa
->inveclosures
!= NULL
)
601 re_node_set_free (dfa
->inveclosures
+ i
);
602 if (dfa
->edests
!= NULL
)
603 re_node_set_free (dfa
->edests
+ i
);
605 re_free (dfa
->edests
);
606 re_free (dfa
->eclosures
);
607 re_free (dfa
->inveclosures
);
608 re_free (dfa
->nodes
);
610 if (dfa
->state_table
)
611 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
613 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
614 for (j
= 0; j
< entry
->num
; ++j
)
616 re_dfastate_t
*state
= entry
->array
[j
];
619 re_free (entry
->array
);
621 re_free (dfa
->state_table
);
622 #ifdef RE_ENABLE_I18N
623 if (dfa
->sb_char
!= utf8_sb_map
)
624 re_free (dfa
->sb_char
);
626 re_free (dfa
->subexp_map
);
628 re_free (dfa
->re_str
);
635 /* Free dynamically allocated space used by PREG. */
641 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
642 if (BE (dfa
!= NULL
, 1))
643 free_dfa_content (dfa
);
647 re_free (preg
->fastmap
);
648 preg
->fastmap
= NULL
;
650 re_free (preg
->translate
);
651 preg
->translate
= NULL
;
654 weak_alias (__regfree
, regfree
)
657 /* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
660 #if defined _REGEX_RE_COMP || defined _LIBC
662 /* BSD has one and only one pattern buffer. */
663 static struct re_pattern_buffer re_comp_buf
;
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
680 if (!re_comp_buf
.buffer
)
681 return gettext ("No previous regular expression");
685 if (re_comp_buf
.buffer
)
687 fastmap
= re_comp_buf
.fastmap
;
688 re_comp_buf
.fastmap
= NULL
;
689 __regfree (&re_comp_buf
);
690 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
691 re_comp_buf
.fastmap
= fastmap
;
694 if (re_comp_buf
.fastmap
== NULL
)
696 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
697 if (re_comp_buf
.fastmap
== NULL
)
698 return (char *) gettext (__re_error_msgid
699 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
702 /* Since `re_exec' always passes NULL for the `regs' argument, we
703 don't need to initialize the pattern buffer fields which affect it. */
705 /* Match anchors at newlines. */
706 re_comp_buf
.newline_anchor
= 1;
708 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
718 libc_freeres_fn (free_mem
)
720 __regfree (&re_comp_buf
);
724 #endif /* _REGEX_RE_COMP */
726 /* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
731 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
734 reg_errcode_t err
= REG_NOERROR
;
738 /* Initialize the pattern buffer. */
739 preg
->fastmap_accurate
= 0;
740 preg
->syntax
= syntax
;
741 preg
->not_bol
= preg
->not_eol
= 0;
744 preg
->can_be_null
= 0;
745 preg
->regs_allocated
= REGS_UNALLOCATED
;
747 /* Initialize the dfa. */
748 dfa
= (re_dfa_t
*) preg
->buffer
;
749 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
758 preg
->allocated
= sizeof (re_dfa_t
);
759 preg
->buffer
= (unsigned char *) dfa
;
761 preg
->used
= sizeof (re_dfa_t
);
763 err
= init_dfa (dfa
, length
);
764 if (BE (err
!= REG_NOERROR
, 0))
766 free_dfa_content (dfa
);
772 /* Note: length+1 will not overflow since it is checked in init_dfa. */
773 dfa
->re_str
= re_malloc (char, length
+ 1);
774 strncpy (dfa
->re_str
, pattern
, length
+ 1);
777 __libc_lock_init (dfa
->lock
);
779 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
780 syntax
& RE_ICASE
, dfa
);
781 if (BE (err
!= REG_NOERROR
, 0))
783 re_compile_internal_free_return
:
784 free_workarea_compile (preg
);
785 re_string_destruct (®exp
);
786 free_dfa_content (dfa
);
792 /* Parse the regular expression, and build a structure tree. */
794 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
795 if (BE (dfa
->str_tree
== NULL
, 0))
796 goto re_compile_internal_free_return
;
798 /* Analyze the tree and create the nfa. */
799 err
= analyze (preg
);
800 if (BE (err
!= REG_NOERROR
, 0))
801 goto re_compile_internal_free_return
;
803 #ifdef RE_ENABLE_I18N
804 /* If possible, do searching in single byte encoding to speed things up. */
805 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
809 /* Then create the initial state of the dfa. */
810 err
= create_initial_state (dfa
);
812 /* Release work areas. */
813 free_workarea_compile (preg
);
814 re_string_destruct (®exp
);
816 if (BE (err
!= REG_NOERROR
, 0))
818 free_dfa_content (dfa
);
826 /* Initialize DFA. We use the length of the regular expression PAT_LEN
827 as the initial length of some arrays. */
830 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
832 unsigned int table_size
;
837 memset (dfa
, '\0', sizeof (re_dfa_t
));
839 /* Force allocation of str_tree_storage the first time. */
840 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
842 /* Avoid overflows. */
843 if (pat_len
== SIZE_MAX
)
846 dfa
->nodes_alloc
= pat_len
+ 1;
847 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
849 /* table_size = 2 ^ ceil(log pat_len) */
850 for (table_size
= 1; ; table_size
<<= 1)
851 if (table_size
> pat_len
)
854 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
855 dfa
->state_hash_mask
= table_size
- 1;
857 dfa
->mb_cur_max
= MB_CUR_MAX
;
859 if (dfa
->mb_cur_max
== 6
860 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
862 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
865 # ifdef HAVE_LANGINFO_CODESET
866 codeset_name
= nl_langinfo (CODESET
);
868 codeset_name
= getenv ("LC_ALL");
869 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
870 codeset_name
= getenv ("LC_CTYPE");
871 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
872 codeset_name
= getenv ("LANG");
873 if (codeset_name
== NULL
)
875 else if (strchr (codeset_name
, '.') != NULL
)
876 codeset_name
= strchr (codeset_name
, '.') + 1;
879 if (strcasecmp (codeset_name
, "UTF-8") == 0
880 || strcasecmp (codeset_name
, "UTF8") == 0)
883 /* We check exhaustively in the loop below if this charset is a
884 superset of ASCII. */
885 dfa
->map_notascii
= 0;
888 #ifdef RE_ENABLE_I18N
889 if (dfa
->mb_cur_max
> 1)
892 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
897 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
898 if (BE (dfa
->sb_char
== NULL
, 0))
901 /* Set the bits corresponding to single byte chars. */
902 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
903 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
905 wint_t wch
= __btowc (ch
);
907 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
909 if (isascii (ch
) && wch
!= ch
)
910 dfa
->map_notascii
= 1;
917 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
922 /* Initialize WORD_CHAR table, which indicate which character is
923 "word". In this case "word" means that it is the word construction
924 character used by some operators like "\<", "\>", etc. */
928 init_word_char (re_dfa_t
*dfa
)
930 dfa
->word_ops_used
= 1;
933 if (BE (dfa
->map_notascii
== 0, 1))
935 if (sizeof (dfa
->word_char
[0]) == 8)
937 /* The extra temporaries here avoid "implicitly truncated"
938 warnings in the case when this is dead code, i.e. 32-bit. */
939 const uint64_t wc0
= UINT64_C (0x03ff000000000000);
940 const uint64_t wc1
= UINT64_C (0x07fffffe87fffffe);
941 dfa
->word_char
[0] = wc0
;
942 dfa
->word_char
[1] = wc1
;
945 else if (sizeof (dfa
->word_char
[0]) == 4)
947 dfa
->word_char
[0] = UINT32_C (0x00000000);
948 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
949 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
950 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
957 if (BE (dfa
->is_utf8
, 1))
959 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
964 for (; i
< BITSET_WORDS
; ++i
)
965 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
966 if (isalnum (ch
) || ch
== '_')
967 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
970 /* Free the work area which are only used while compiling. */
973 free_workarea_compile (regex_t
*preg
)
975 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
976 bin_tree_storage_t
*storage
, *next
;
977 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
979 next
= storage
->next
;
982 dfa
->str_tree_storage
= NULL
;
983 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
984 dfa
->str_tree
= NULL
;
985 re_free (dfa
->org_indices
);
986 dfa
->org_indices
= NULL
;
989 /* Create initial states for all contexts. */
992 create_initial_state (re_dfa_t
*dfa
)
996 re_node_set init_nodes
;
998 /* Initial states have the epsilon closure of the node which is
999 the first node of the regular expression. */
1000 first
= dfa
->str_tree
->first
->node_idx
;
1001 dfa
->init_node
= first
;
1002 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1003 if (BE (err
!= REG_NOERROR
, 0))
1006 /* The back-references which are in initial states can epsilon transit,
1007 since in this case all of the subexpressions can be null.
1008 Then we add epsilon closures of the nodes which are the next nodes of
1009 the back-references. */
1010 if (dfa
->nbackref
> 0)
1011 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1013 int node_idx
= init_nodes
.elems
[i
];
1014 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1017 if (type
!= OP_BACK_REF
)
1019 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1021 re_token_t
*clexp_node
;
1022 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1023 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1024 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1027 if (clexp_idx
== init_nodes
.nelem
)
1030 if (type
== OP_BACK_REF
)
1032 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1033 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1035 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1038 if (err
!= REG_NOERROR
)
1045 /* It must be the first time to invoke acquire_state. */
1046 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1047 /* We don't check ERR here, since the initial state must not be NULL. */
1048 if (BE (dfa
->init_state
== NULL
, 0))
1050 if (dfa
->init_state
->has_constraint
)
1052 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1054 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1056 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1060 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1061 || dfa
->init_state_begbuf
== NULL
, 0))
1065 dfa
->init_state_word
= dfa
->init_state_nl
1066 = dfa
->init_state_begbuf
= dfa
->init_state
;
1068 re_node_set_free (&init_nodes
);
1072 #ifdef RE_ENABLE_I18N
1073 /* If it is possible to do searching in single byte encoding instead of UTF-8
1074 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1075 DFA nodes where needed. */
1078 optimize_utf8 (re_dfa_t
*dfa
)
1080 int node
, i
, mb_chars
= 0, has_period
= 0;
1082 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1083 switch (dfa
->nodes
[node
].type
)
1086 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1090 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1098 /* Word anchors etc. cannot be handled. It's okay to test
1099 opr.ctx_type since constraints (for all DFA nodes) are
1100 created by ORing one or more opr.ctx_type values. */
1110 case OP_DUP_ASTERISK
:
1111 case OP_OPEN_SUBEXP
:
1112 case OP_CLOSE_SUBEXP
:
1114 case COMPLEX_BRACKET
:
1116 case SIMPLE_BRACKET
:
1117 /* Just double check. The non-ASCII range starts at 0x80. */
1118 assert (0x80 % BITSET_WORD_BITS
== 0);
1119 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1120 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1127 if (mb_chars
|| has_period
)
1128 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1130 if (dfa
->nodes
[node
].type
== CHARACTER
1131 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1132 dfa
->nodes
[node
].mb_partial
= 0;
1133 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1134 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1137 /* The search can be in single byte locale. */
1138 dfa
->mb_cur_max
= 1;
1140 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145 "eclosure", and "inveclosure". */
1147 static reg_errcode_t
1148 analyze (regex_t
*preg
)
1150 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1153 /* Allocate arrays. */
1154 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1155 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1156 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1157 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1158 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1159 || dfa
->eclosures
== NULL
, 0))
1162 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1163 if (dfa
->subexp_map
!= NULL
)
1166 for (i
= 0; i
< preg
->re_nsub
; i
++)
1167 dfa
->subexp_map
[i
] = i
;
1168 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1169 for (i
= 0; i
< preg
->re_nsub
; i
++)
1170 if (dfa
->subexp_map
[i
] != i
)
1172 if (i
== preg
->re_nsub
)
1174 free (dfa
->subexp_map
);
1175 dfa
->subexp_map
= NULL
;
1179 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1180 if (BE (ret
!= REG_NOERROR
, 0))
1182 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1183 if (BE (ret
!= REG_NOERROR
, 0))
1185 preorder (dfa
->str_tree
, calc_next
, dfa
);
1186 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1187 if (BE (ret
!= REG_NOERROR
, 0))
1189 ret
= calc_eclosure (dfa
);
1190 if (BE (ret
!= REG_NOERROR
, 0))
1193 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1195 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1198 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1199 if (BE (dfa
->inveclosures
== NULL
, 0))
1201 ret
= calc_inveclosure (dfa
);
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208 implement parse tree visits. Instead, we use parent pointers and
1209 some hairy code in these two functions. */
1210 static reg_errcode_t
1211 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1214 bin_tree_t
*node
, *prev
;
1216 for (node
= root
; ; )
1218 /* Descend down the tree, preferably to the left (or to the right
1219 if that's the only child). */
1220 while (node
->left
|| node
->right
)
1228 reg_errcode_t err
= fn (extra
, node
);
1229 if (BE (err
!= REG_NOERROR
, 0))
1231 if (node
->parent
== NULL
)
1234 node
= node
->parent
;
1236 /* Go up while we have a node that is reached from the right. */
1237 while (node
->right
== prev
|| node
->right
== NULL
);
1242 static reg_errcode_t
1243 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1248 for (node
= root
; ; )
1250 reg_errcode_t err
= fn (extra
, node
);
1251 if (BE (err
!= REG_NOERROR
, 0))
1254 /* Go to the left node, or up and to the right. */
1259 bin_tree_t
*prev
= NULL
;
1260 while (node
->right
== prev
|| node
->right
== NULL
)
1263 node
= node
->parent
;
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1274 backreferences as well. Requires a preorder visit. */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra
, bin_tree_t
*node
)
1278 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1280 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1282 int idx
= node
->token
.opr
.idx
;
1283 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1284 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1287 else if (node
->token
.type
== SUBEXP
1288 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1290 int other_idx
= node
->left
->token
.opr
.idx
;
1292 node
->left
= node
->left
->left
;
1294 node
->left
->parent
= node
;
1296 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1297 if (other_idx
< BITSET_WORD_BITS
)
1298 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1306 static reg_errcode_t
1307 lower_subexps (void *extra
, bin_tree_t
*node
)
1309 regex_t
*preg
= (regex_t
*) extra
;
1310 reg_errcode_t err
= REG_NOERROR
;
1312 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1314 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1316 node
->left
->parent
= node
;
1318 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1320 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1322 node
->right
->parent
= node
;
1329 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1331 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1332 bin_tree_t
*body
= node
->left
;
1333 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1336 /* We do not optimize empty subexpressions, because otherwise we may
1337 have bad CONCAT nodes with NULL children. This is obviously not
1338 very common, so we do not lose much. An example that triggers
1339 this case is the sed "script" /\(\)/x. */
1340 && node
->left
!= NULL
1341 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1342 || !(dfa
->used_bkref_map
1343 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1346 /* Convert the SUBEXP node to the concatenation of an
1347 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1348 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1349 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1350 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1351 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1352 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1358 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1359 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364 nodes. Requires a postorder visit. */
1365 static reg_errcode_t
1366 calc_first (void *extra
, bin_tree_t
*node
)
1368 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1369 if (node
->token
.type
== CONCAT
)
1371 node
->first
= node
->left
->first
;
1372 node
->node_idx
= node
->left
->node_idx
;
1377 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1378 if (BE (node
->node_idx
== -1, 0))
1380 if (node
->token
.type
== ANCHOR
)
1381 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1386 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1387 static reg_errcode_t
1388 calc_next (void *extra
, bin_tree_t
*node
)
1390 switch (node
->token
.type
)
1392 case OP_DUP_ASTERISK
:
1393 node
->left
->next
= node
;
1396 node
->left
->next
= node
->right
->first
;
1397 node
->right
->next
= node
->next
;
1401 node
->left
->next
= node
->next
;
1403 node
->right
->next
= node
->next
;
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1413 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1414 int idx
= node
->node_idx
;
1415 reg_errcode_t err
= REG_NOERROR
;
1417 switch (node
->token
.type
)
1423 assert (node
->next
== NULL
);
1426 case OP_DUP_ASTERISK
:
1430 dfa
->has_plural_match
= 1;
1431 if (node
->left
!= NULL
)
1432 left
= node
->left
->first
->node_idx
;
1434 left
= node
->next
->node_idx
;
1435 if (node
->right
!= NULL
)
1436 right
= node
->right
->first
->node_idx
;
1438 right
= node
->next
->node_idx
;
1440 assert (right
> -1);
1441 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1446 case OP_OPEN_SUBEXP
:
1447 case OP_CLOSE_SUBEXP
:
1448 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1452 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1453 if (node
->token
.type
== OP_BACK_REF
)
1454 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1458 assert (!IS_EPSILON_NODE (node
->token
.type
));
1459 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468 to their own constraint. */
1470 static reg_errcode_t
1472 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1473 int root_node
, unsigned int init_constraint
)
1475 int org_node
, clone_node
, ret
;
1476 unsigned int constraint
= init_constraint
;
1477 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1479 int org_dest
, clone_dest
;
1480 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1482 /* If the back reference epsilon-transit, its destination must
1483 also have the constraint. Then duplicate the epsilon closure
1484 of the destination of the back reference, and store it in
1485 edests of the back reference. */
1486 org_dest
= dfa
->nexts
[org_node
];
1487 re_node_set_empty (dfa
->edests
+ clone_node
);
1488 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1489 if (BE (clone_dest
== -1, 0))
1491 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1492 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1493 if (BE (ret
< 0, 0))
1496 else if (dfa
->edests
[org_node
].nelem
== 0)
1498 /* In case of the node can't epsilon-transit, don't duplicate the
1499 destination and store the original destination as the
1500 destination of the node. */
1501 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1504 else if (dfa
->edests
[org_node
].nelem
== 1)
1506 /* In case of the node can epsilon-transit, and it has only one
1508 org_dest
= dfa
->edests
[org_node
].elems
[0];
1509 re_node_set_empty (dfa
->edests
+ clone_node
);
1510 /* If the node is root_node itself, it means the epsilon clsoure
1511 has a loop. Then tie it to the destination of the root_node. */
1512 if (org_node
== root_node
&& clone_node
!= org_node
)
1514 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1515 if (BE (ret
< 0, 0))
1519 /* In case of the node has another constraint, add it. */
1520 constraint
|= dfa
->nodes
[org_node
].constraint
;
1521 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1522 if (BE (clone_dest
== -1, 0))
1524 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1525 if (BE (ret
< 0, 0))
1528 else /* dfa->edests[org_node].nelem == 2 */
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest
= dfa
->edests
[org_node
].elems
[0];
1533 re_node_set_empty (dfa
->edests
+ clone_node
);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1536 if (clone_dest
== -1)
1538 /* There is no such duplicated node, create a new one. */
1540 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1541 if (BE (clone_dest
== -1, 0))
1543 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1544 if (BE (ret
< 0, 0))
1546 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1547 root_node
, constraint
);
1548 if (BE (err
!= REG_NOERROR
, 0))
1553 /* There is a duplicated node which satisfies the constraint,
1554 use it to avoid infinite loop. */
1555 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1556 if (BE (ret
< 0, 0))
1560 org_dest
= dfa
->edests
[org_node
].elems
[1];
1561 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1562 if (BE (clone_dest
== -1, 0))
1564 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1565 if (BE (ret
< 0, 0))
1568 org_node
= org_dest
;
1569 clone_node
= clone_dest
;
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1578 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1579 unsigned int constraint
)
1582 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1584 if (org_node
== dfa
->org_indices
[idx
]
1585 && constraint
== dfa
->nodes
[idx
].constraint
)
1586 return idx
; /* Found. */
1588 return -1; /* Not found. */
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 Return the index of the new node, or -1 if insufficient storage is
1596 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1598 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1599 if (BE (dup_idx
!= -1, 1))
1601 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1602 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1603 dfa
->nodes
[dup_idx
].duplicated
= 1;
1605 /* Store the index of the original node. */
1606 dfa
->org_indices
[dup_idx
] = org_idx
;
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t
*dfa
)
1615 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1616 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1618 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1620 int *elems
= dfa
->eclosures
[src
].elems
;
1621 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1623 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1624 if (BE (ret
== -1, 0))
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t
*dfa
)
1637 int node_idx
, incomplete
;
1639 assert (dfa
->nodes_len
> 0);
1642 /* For each nodes, calculate epsilon closure. */
1643 for (node_idx
= 0; ; ++node_idx
)
1646 re_node_set eclosure_elem
;
1647 if (node_idx
== dfa
->nodes_len
)
1656 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1659 /* If we have already calculated, skip it. */
1660 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1662 /* Calculate epsilon closure of `node_idx'. */
1663 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1664 if (BE (err
!= REG_NOERROR
, 0))
1667 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1670 re_node_set_free (&eclosure_elem
);
1676 /* Calculate epsilon closure of NODE. */
1678 static reg_errcode_t
1679 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1683 re_node_set eclosure
;
1686 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1687 if (BE (err
!= REG_NOERROR
, 0))
1690 /* This indicates that we are calculating this node now.
1691 We reference this value to avoid infinite loop. */
1692 dfa
->eclosures
[node
].nelem
= -1;
1694 /* If the current node has constraints, duplicate all nodes
1695 since they must inherit the constraints. */
1696 if (dfa
->nodes
[node
].constraint
1697 && dfa
->edests
[node
].nelem
1698 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1700 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1701 dfa
->nodes
[node
].constraint
);
1702 if (BE (err
!= REG_NOERROR
, 0))
1706 /* Expand each epsilon destination nodes. */
1707 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1708 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1710 re_node_set eclosure_elem
;
1711 int edest
= dfa
->edests
[node
].elems
[i
];
1712 /* If calculating the epsilon closure of `edest' is in progress,
1713 return intermediate result. */
1714 if (dfa
->eclosures
[edest
].nelem
== -1)
1719 /* If we haven't calculated the epsilon closure of `edest' yet,
1720 calculate now. Otherwise use calculated epsilon closure. */
1721 if (dfa
->eclosures
[edest
].nelem
== 0)
1723 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1724 if (BE (err
!= REG_NOERROR
, 0))
1728 eclosure_elem
= dfa
->eclosures
[edest
];
1729 /* Merge the epsilon closure of `edest'. */
1730 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1731 if (BE (err
!= REG_NOERROR
, 0))
1733 /* If the epsilon closure of `edest' is incomplete,
1734 the epsilon closure of this node is also incomplete. */
1735 if (dfa
->eclosures
[edest
].nelem
== 0)
1738 re_node_set_free (&eclosure_elem
);
1742 /* An epsilon closure includes itself. */
1743 ret
= re_node_set_insert (&eclosure
, node
);
1744 if (BE (ret
< 0, 0))
1746 if (incomplete
&& !root
)
1747 dfa
->eclosures
[node
].nelem
= 0;
1749 dfa
->eclosures
[node
] = eclosure
;
1750 *new_set
= eclosure
;
1754 /* Functions for token which are used in the parser. */
1756 /* Fetch a token from INPUT.
1757 We must not use this function inside bracket expressions. */
1761 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1763 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1766 /* Peek a token from INPUT, and return the length of the token.
1767 We must not use this function inside bracket expressions. */
1771 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1775 if (re_string_eoi (input
))
1777 token
->type
= END_OF_RE
;
1781 c
= re_string_peek_byte (input
, 0);
1784 token
->word_char
= 0;
1785 #ifdef RE_ENABLE_I18N
1786 token
->mb_partial
= 0;
1787 if (input
->mb_cur_max
> 1 &&
1788 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1790 token
->type
= CHARACTER
;
1791 token
->mb_partial
= 1;
1798 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1800 token
->type
= BACK_SLASH
;
1804 c2
= re_string_peek_byte_case (input
, 1);
1806 token
->type
= CHARACTER
;
1807 #ifdef RE_ENABLE_I18N
1808 if (input
->mb_cur_max
> 1)
1810 wint_t wc
= re_string_wchar_at (input
,
1811 re_string_cur_idx (input
) + 1);
1812 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1816 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1821 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1822 token
->type
= OP_ALT
;
1824 case '1': case '2': case '3': case '4': case '5':
1825 case '6': case '7': case '8': case '9':
1826 if (!(syntax
& RE_NO_BK_REFS
))
1828 token
->type
= OP_BACK_REF
;
1829 token
->opr
.idx
= c2
- '1';
1833 if (!(syntax
& RE_NO_GNU_OPS
))
1835 token
->type
= ANCHOR
;
1836 token
->opr
.ctx_type
= WORD_FIRST
;
1840 if (!(syntax
& RE_NO_GNU_OPS
))
1842 token
->type
= ANCHOR
;
1843 token
->opr
.ctx_type
= WORD_LAST
;
1847 if (!(syntax
& RE_NO_GNU_OPS
))
1849 token
->type
= ANCHOR
;
1850 token
->opr
.ctx_type
= WORD_DELIM
;
1854 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= ANCHOR
;
1857 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1862 token
->type
= OP_WORD
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1866 token
->type
= OP_NOTWORD
;
1869 if (!(syntax
& RE_NO_GNU_OPS
))
1870 token
->type
= OP_SPACE
;
1873 if (!(syntax
& RE_NO_GNU_OPS
))
1874 token
->type
= OP_NOTSPACE
;
1877 if (!(syntax
& RE_NO_GNU_OPS
))
1879 token
->type
= ANCHOR
;
1880 token
->opr
.ctx_type
= BUF_FIRST
;
1884 if (!(syntax
& RE_NO_GNU_OPS
))
1886 token
->type
= ANCHOR
;
1887 token
->opr
.ctx_type
= BUF_LAST
;
1891 if (!(syntax
& RE_NO_BK_PARENS
))
1892 token
->type
= OP_OPEN_SUBEXP
;
1895 if (!(syntax
& RE_NO_BK_PARENS
))
1896 token
->type
= OP_CLOSE_SUBEXP
;
1899 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1900 token
->type
= OP_DUP_PLUS
;
1903 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1904 token
->type
= OP_DUP_QUESTION
;
1907 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1908 token
->type
= OP_OPEN_DUP_NUM
;
1911 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1912 token
->type
= OP_CLOSE_DUP_NUM
;
1920 token
->type
= CHARACTER
;
1921 #ifdef RE_ENABLE_I18N
1922 if (input
->mb_cur_max
> 1)
1924 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1925 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1929 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1934 if (syntax
& RE_NEWLINE_ALT
)
1935 token
->type
= OP_ALT
;
1938 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1939 token
->type
= OP_ALT
;
1942 token
->type
= OP_DUP_ASTERISK
;
1945 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1946 token
->type
= OP_DUP_PLUS
;
1949 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1950 token
->type
= OP_DUP_QUESTION
;
1953 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1954 token
->type
= OP_OPEN_DUP_NUM
;
1957 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1958 token
->type
= OP_CLOSE_DUP_NUM
;
1961 if (syntax
& RE_NO_BK_PARENS
)
1962 token
->type
= OP_OPEN_SUBEXP
;
1965 if (syntax
& RE_NO_BK_PARENS
)
1966 token
->type
= OP_CLOSE_SUBEXP
;
1969 token
->type
= OP_OPEN_BRACKET
;
1972 token
->type
= OP_PERIOD
;
1975 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1976 re_string_cur_idx (input
) != 0)
1978 char prev
= re_string_peek_byte (input
, -1);
1979 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1982 token
->type
= ANCHOR
;
1983 token
->opr
.ctx_type
= LINE_FIRST
;
1986 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1987 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1990 re_string_skip_bytes (input
, 1);
1991 peek_token (&next
, input
, syntax
);
1992 re_string_skip_bytes (input
, -1);
1993 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1996 token
->type
= ANCHOR
;
1997 token
->opr
.ctx_type
= LINE_LAST
;
2005 /* Peek a token from INPUT, and return the length of the token.
2006 We must not use this function out of bracket expressions. */
2010 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2013 if (re_string_eoi (input
))
2015 token
->type
= END_OF_RE
;
2018 c
= re_string_peek_byte (input
, 0);
2021 #ifdef RE_ENABLE_I18N
2022 if (input
->mb_cur_max
> 1 &&
2023 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2025 token
->type
= CHARACTER
;
2028 #endif /* RE_ENABLE_I18N */
2030 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2031 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2033 /* In this case, '\' escape a character. */
2035 re_string_skip_bytes (input
, 1);
2036 c2
= re_string_peek_byte (input
, 0);
2038 token
->type
= CHARACTER
;
2041 if (c
== '[') /* '[' is a special char in a bracket exps. */
2045 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2046 c2
= re_string_peek_byte (input
, 1);
2054 token
->type
= OP_OPEN_COLL_ELEM
;
2057 token
->type
= OP_OPEN_EQUIV_CLASS
;
2060 if (syntax
& RE_CHAR_CLASSES
)
2062 token
->type
= OP_OPEN_CHAR_CLASS
;
2065 /* else fall through. */
2067 token
->type
= CHARACTER
;
2077 token
->type
= OP_CHARSET_RANGE
;
2080 token
->type
= OP_CLOSE_BRACKET
;
2083 token
->type
= OP_NON_MATCH_LIST
;
2086 token
->type
= CHARACTER
;
2091 /* Functions for parser. */
2093 /* Entry point of the parser.
2094 Parse the regular expression REGEXP and return the structure tree.
2095 If an error is occured, ERR is set by error code, and return NULL.
2096 This function build the following tree, from regular expression <reg_exp>:
2102 CAT means concatenation.
2103 EOR means end of regular expression. */
2106 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2109 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2110 bin_tree_t
*tree
, *eor
, *root
;
2111 re_token_t current_token
;
2112 dfa
->syntax
= syntax
;
2113 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2114 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2115 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2117 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2119 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2122 if (BE (eor
== NULL
|| root
== NULL
, 0))
2130 /* This function build the following tree, from regular expression
2131 <branch1>|<branch2>:
2137 ALT means alternative, which represents the operator `|'. */
2140 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2141 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2143 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2144 bin_tree_t
*tree
, *branch
= NULL
;
2145 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2146 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2149 while (token
->type
== OP_ALT
)
2151 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2152 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2153 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2155 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2156 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2159 postorder (tree
, free_tree
, NULL
);
2165 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2166 if (BE (tree
== NULL
, 0))
2175 /* This function build the following tree, from regular expression
2182 CAT means concatenation. */
2185 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2186 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2188 bin_tree_t
*tree
, *exp
;
2189 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2190 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2191 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2194 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2195 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2197 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2198 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2201 postorder (tree
, free_tree
, NULL
);
2204 if (tree
!= NULL
&& exp
!= NULL
)
2206 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2207 if (newtree
== NULL
)
2209 postorder (exp
, free_tree
, NULL
);
2210 postorder (tree
, free_tree
, NULL
);
2216 else if (tree
== NULL
)
2218 /* Otherwise exp == NULL, we don't need to create new tree. */
2223 /* This function build the following tree, from regular expression a*:
2230 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2231 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2233 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2235 switch (token
->type
)
2238 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2239 if (BE (tree
== NULL
, 0))
2244 #ifdef RE_ENABLE_I18N
2245 if (dfa
->mb_cur_max
> 1)
2247 while (!re_string_eoi (regexp
)
2248 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2250 bin_tree_t
*mbc_remain
;
2251 fetch_token (token
, regexp
, syntax
);
2252 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2253 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2254 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2263 case OP_OPEN_SUBEXP
:
2264 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2265 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2268 case OP_OPEN_BRACKET
:
2269 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2270 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2274 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2279 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2280 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2281 if (BE (tree
== NULL
, 0))
2287 dfa
->has_mb_node
= 1;
2289 case OP_OPEN_DUP_NUM
:
2290 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2296 case OP_DUP_ASTERISK
:
2298 case OP_DUP_QUESTION
:
2299 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2304 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2306 fetch_token (token
, regexp
, syntax
);
2307 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2309 /* else fall through */
2310 case OP_CLOSE_SUBEXP
:
2311 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2312 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2317 /* else fall through */
2318 case OP_CLOSE_DUP_NUM
:
2319 /* We treat it as a normal character. */
2321 /* Then we can these characters as normal characters. */
2322 token
->type
= CHARACTER
;
2323 /* mb_partial and word_char bits should be initialized already
2325 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2326 if (BE (tree
== NULL
, 0))
2333 if ((token
->opr
.ctx_type
2334 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2335 && dfa
->word_ops_used
== 0)
2336 init_word_char (dfa
);
2337 if (token
->opr
.ctx_type
== WORD_DELIM
2338 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2340 bin_tree_t
*tree_first
, *tree_last
;
2341 if (token
->opr
.ctx_type
== WORD_DELIM
)
2343 token
->opr
.ctx_type
= WORD_FIRST
;
2344 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2345 token
->opr
.ctx_type
= WORD_LAST
;
2349 token
->opr
.ctx_type
= INSIDE_WORD
;
2350 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2351 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2353 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2354 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2355 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2363 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2364 if (BE (tree
== NULL
, 0))
2370 /* We must return here, since ANCHORs can't be followed
2371 by repetition operators.
2372 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2373 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2374 fetch_token (token
, regexp
, syntax
);
2377 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2378 if (BE (tree
== NULL
, 0))
2383 if (dfa
->mb_cur_max
> 1)
2384 dfa
->has_mb_node
= 1;
2388 tree
= build_charclass_op (dfa
, regexp
->trans
,
2389 (const unsigned char *) "alnum",
2390 (const unsigned char *) "_",
2391 token
->type
== OP_NOTWORD
, err
);
2392 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2397 tree
= build_charclass_op (dfa
, regexp
->trans
,
2398 (const unsigned char *) "space",
2399 (const unsigned char *) "",
2400 token
->type
== OP_NOTSPACE
, err
);
2401 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2411 /* Must not happen? */
2417 fetch_token (token
, regexp
, syntax
);
2419 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2420 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2422 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2423 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2426 postorder (tree
, free_tree
, NULL
);
2430 /* In BRE consecutive duplications are not allowed. */
2431 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2432 && (token
->type
== OP_DUP_ASTERISK
2433 || token
->type
== OP_OPEN_DUP_NUM
))
2436 postorder (tree
, free_tree
, NULL
);
2445 /* This function build the following tree, from regular expression
2453 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2454 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2456 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2459 cur_nsub
= preg
->re_nsub
++;
2461 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2463 /* The subexpression may be a null string. */
2464 if (token
->type
== OP_CLOSE_SUBEXP
)
2468 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2469 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2472 postorder (tree
, free_tree
, NULL
);
2475 if (BE (*err
!= REG_NOERROR
, 0))
2479 if (cur_nsub
<= '9' - '1')
2480 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2482 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2483 if (BE (tree
== NULL
, 0))
2488 tree
->token
.opr
.idx
= cur_nsub
;
2492 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2495 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2496 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2498 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2499 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2500 re_token_t start_token
= *token
;
2502 if (token
->type
== OP_OPEN_DUP_NUM
)
2505 start
= fetch_number (regexp
, token
, syntax
);
2508 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2509 start
= 0; /* We treat "{,m}" as "{0,m}". */
2512 *err
= REG_BADBR
; /* <re>{} is invalid. */
2516 if (BE (start
!= -2, 1))
2518 /* We treat "{n}" as "{n,n}". */
2519 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2520 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2521 ? fetch_number (regexp
, token
, syntax
) : -2));
2523 if (BE (start
== -2 || end
== -2, 0))
2525 /* Invalid sequence. */
2526 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2528 if (token
->type
== END_OF_RE
)
2536 /* If the syntax bit is set, rollback. */
2537 re_string_set_index (regexp
, start_idx
);
2538 *token
= start_token
;
2539 token
->type
= CHARACTER
;
2540 /* mb_partial and word_char bits should be already initialized by
2545 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2547 /* First number greater than second. */
2554 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2555 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2558 fetch_token (token
, regexp
, syntax
);
2560 if (BE (elem
== NULL
, 0))
2562 if (BE (start
== 0 && end
== 0, 0))
2564 postorder (elem
, free_tree
, NULL
);
2568 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2569 if (BE (start
> 0, 0))
2572 for (i
= 2; i
<= start
; ++i
)
2574 elem
= duplicate_tree (elem
, dfa
);
2575 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2576 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2577 goto parse_dup_op_espace
;
2583 /* Duplicate ELEM before it is marked optional. */
2584 elem
= duplicate_tree (elem
, dfa
);
2590 if (elem
->token
.type
== SUBEXP
)
2591 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2593 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2594 if (BE (tree
== NULL
, 0))
2595 goto parse_dup_op_espace
;
2597 /* This loop is actually executed only when end != -1,
2598 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2599 already created the start+1-th copy. */
2600 for (i
= start
+ 2; i
<= end
; ++i
)
2602 elem
= duplicate_tree (elem
, dfa
);
2603 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2604 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2605 goto parse_dup_op_espace
;
2607 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2608 if (BE (tree
== NULL
, 0))
2609 goto parse_dup_op_espace
;
2613 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2617 parse_dup_op_espace
:
2622 /* Size of the names for collating symbol/equivalence_class/character_class.
2623 I'm not sure, but maybe enough. */
2624 #define BRACKET_NAME_BUF_SIZE 32
2627 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2628 Build the range expression which starts from START_ELEM, and ends
2629 at END_ELEM. The result are written to MBCSET and SBCSET.
2630 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2631 mbcset->range_ends, is a pointer argument sinse we may
2634 static reg_errcode_t
2636 # ifdef RE_ENABLE_I18N
2637 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2638 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2639 # else /* not RE_ENABLE_I18N */
2640 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2641 bracket_elem_t
*end_elem
)
2642 # endif /* not RE_ENABLE_I18N */
2644 unsigned int start_ch
, end_ch
;
2645 /* Equivalence Classes and Character Classes can't be a range start/end. */
2646 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2647 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2651 /* We can handle no multi character collating elements without libc
2653 if (BE ((start_elem
->type
== COLL_SYM
2654 && strlen ((char *) start_elem
->opr
.name
) > 1)
2655 || (end_elem
->type
== COLL_SYM
2656 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2657 return REG_ECOLLATE
;
2659 # ifdef RE_ENABLE_I18N
2664 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2666 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2667 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2669 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2670 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2672 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2673 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2674 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2675 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2676 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2677 return REG_ECOLLATE
;
2678 cmp_buf
[0] = start_wc
;
2679 cmp_buf
[4] = end_wc
;
2680 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2683 /* Got valid collation sequence values, add them as a new entry.
2684 However, for !_LIBC we have no collation elements: if the
2685 character set is single byte, the single byte character set
2686 that we build below suffices. parse_bracket_exp passes
2687 no MBCSET if dfa->mb_cur_max == 1. */
2690 /* Check the space of the arrays. */
2691 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2693 /* There is not enough space, need realloc. */
2694 wchar_t *new_array_start
, *new_array_end
;
2697 /* +1 in case of mbcset->nranges is 0. */
2698 new_nranges
= 2 * mbcset
->nranges
+ 1;
2699 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2700 are NULL if *range_alloc == 0. */
2701 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2703 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2706 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2709 mbcset
->range_starts
= new_array_start
;
2710 mbcset
->range_ends
= new_array_end
;
2711 *range_alloc
= new_nranges
;
2714 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2715 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2718 /* Build the table for single byte characters. */
2719 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2722 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2723 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2724 bitset_set (sbcset
, wc
);
2727 # else /* not RE_ENABLE_I18N */
2730 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2731 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2733 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2734 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2736 if (start_ch
> end_ch
)
2738 /* Build the table for single byte characters. */
2739 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2740 if (start_ch
<= ch
&& ch
<= end_ch
)
2741 bitset_set (sbcset
, ch
);
2743 # endif /* not RE_ENABLE_I18N */
2746 #endif /* not _LIBC */
2749 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2750 Build the collating element which is represented by NAME.
2751 The result are written to MBCSET and SBCSET.
2752 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2753 pointer argument since we may update it. */
2755 static reg_errcode_t
2757 # ifdef RE_ENABLE_I18N
2758 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2759 int *coll_sym_alloc
, const unsigned char *name
)
2760 # else /* not RE_ENABLE_I18N */
2761 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2762 # endif /* not RE_ENABLE_I18N */
2764 size_t name_len
= strlen ((const char *) name
);
2765 if (BE (name_len
!= 1, 0))
2766 return REG_ECOLLATE
;
2769 bitset_set (sbcset
, name
[0]);
2773 #endif /* not _LIBC */
2775 /* This function parse bracket expression like "[abc]", "[a-c]",
2779 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2780 reg_syntax_t syntax
, reg_errcode_t
*err
)
2783 const unsigned char *collseqmb
;
2784 const char *collseqwc
;
2787 const int32_t *symb_table
;
2788 const unsigned char *extra
;
2790 /* Local function for parse_bracket_exp used in _LIBC environement.
2791 Seek the collating symbol entry correspondings to NAME.
2792 Return the index of the symbol in the SYMB_TABLE,
2793 or -1 if not found. */
2796 __attribute ((always_inline
))
2797 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2801 for (elem
= 0; elem
< table_size
; elem
++)
2802 if (symb_table
[2 * elem
] != 0)
2804 int32_t idx
= symb_table
[2 * elem
+ 1];
2805 /* Skip the name of collating element name. */
2806 idx
+= 1 + extra
[idx
];
2807 if (/* Compare the length of the name. */
2808 name_len
== extra
[idx
]
2809 /* Compare the name. */
2810 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2811 /* Yep, this is the entry. */
2817 /* Local function for parse_bracket_exp used in _LIBC environment.
2818 Look up the collation sequence value of BR_ELEM.
2819 Return the value if succeeded, UINT_MAX otherwise. */
2821 auto inline unsigned int
2822 __attribute ((always_inline
))
2823 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2825 if (br_elem
->type
== SB_CHAR
)
2828 if (MB_CUR_MAX == 1)
2831 return collseqmb
[br_elem
->opr
.ch
];
2834 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2835 return __collseq_table_lookup (collseqwc
, wc
);
2838 else if (br_elem
->type
== MB_CHAR
)
2841 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2843 else if (br_elem
->type
== COLL_SYM
)
2845 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2849 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2853 /* We found the entry. */
2854 idx
= symb_table
[2 * elem
+ 1];
2855 /* Skip the name of collating element name. */
2856 idx
+= 1 + extra
[idx
];
2857 /* Skip the byte sequence of the collating element. */
2858 idx
+= 1 + extra
[idx
];
2859 /* Adjust for the alignment. */
2860 idx
= (idx
+ 3) & ~3;
2861 /* Skip the multibyte collation sequence value. */
2862 idx
+= sizeof (unsigned int);
2863 /* Skip the wide char sequence of the collating element. */
2864 idx
+= sizeof (unsigned int) *
2865 (1 + *(unsigned int *) (extra
+ idx
));
2866 /* Return the collation sequence value. */
2867 return *(unsigned int *) (extra
+ idx
);
2869 else if (sym_name_len
== 1)
2871 /* No valid character. Match it as a single byte
2873 return collseqmb
[br_elem
->opr
.name
[0]];
2876 else if (sym_name_len
== 1)
2877 return collseqmb
[br_elem
->opr
.name
[0]];
2882 /* Local function for parse_bracket_exp used in _LIBC environement.
2883 Build the range expression which starts from START_ELEM, and ends
2884 at END_ELEM. The result are written to MBCSET and SBCSET.
2885 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2886 mbcset->range_ends, is a pointer argument sinse we may
2889 auto inline reg_errcode_t
2890 __attribute ((always_inline
))
2891 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2892 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2895 uint32_t start_collseq
;
2896 uint32_t end_collseq
;
2898 /* Equivalence Classes and Character Classes can't be a range
2900 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2901 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2905 start_collseq
= lookup_collation_sequence_value (start_elem
);
2906 end_collseq
= lookup_collation_sequence_value (end_elem
);
2907 /* Check start/end collation sequence values. */
2908 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2909 return REG_ECOLLATE
;
2910 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2913 /* Got valid collation sequence values, add them as a new entry.
2914 However, if we have no collation elements, and the character set
2915 is single byte, the single byte character set that we
2916 build below suffices. */
2917 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2919 /* Check the space of the arrays. */
2920 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2922 /* There is not enough space, need realloc. */
2923 uint32_t *new_array_start
;
2924 uint32_t *new_array_end
;
2927 /* +1 in case of mbcset->nranges is 0. */
2928 new_nranges
= 2 * mbcset
->nranges
+ 1;
2929 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2931 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2934 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2937 mbcset
->range_starts
= new_array_start
;
2938 mbcset
->range_ends
= new_array_end
;
2939 *range_alloc
= new_nranges
;
2942 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2943 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2946 /* Build the table for single byte characters. */
2947 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2949 uint32_t ch_collseq
;
2951 if (MB_CUR_MAX == 1)
2954 ch_collseq
= collseqmb
[ch
];
2956 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2957 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2958 bitset_set (sbcset
, ch
);
2963 /* Local function for parse_bracket_exp used in _LIBC environement.
2964 Build the collating element which is represented by NAME.
2965 The result are written to MBCSET and SBCSET.
2966 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2967 pointer argument sinse we may update it. */
2969 auto inline reg_errcode_t
2970 __attribute ((always_inline
))
2971 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2972 int *coll_sym_alloc
, const unsigned char *name
)
2975 size_t name_len
= strlen ((const char *) name
);
2978 elem
= seek_collating_symbol_entry (name
, name_len
);
2981 /* We found the entry. */
2982 idx
= symb_table
[2 * elem
+ 1];
2983 /* Skip the name of collating element name. */
2984 idx
+= 1 + extra
[idx
];
2986 else if (name_len
== 1)
2988 /* No valid character, treat it as a normal
2990 bitset_set (sbcset
, name
[0]);
2994 return REG_ECOLLATE
;
2996 /* Got valid collation sequence, add it as a new entry. */
2997 /* Check the space of the arrays. */
2998 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3000 /* Not enough, realloc it. */
3001 /* +1 in case of mbcset->ncoll_syms is 0. */
3002 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3003 /* Use realloc since mbcset->coll_syms is NULL
3005 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3006 new_coll_sym_alloc
);
3007 if (BE (new_coll_syms
== NULL
, 0))
3009 mbcset
->coll_syms
= new_coll_syms
;
3010 *coll_sym_alloc
= new_coll_sym_alloc
;
3012 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3017 if (BE (name_len
!= 1, 0))
3018 return REG_ECOLLATE
;
3021 bitset_set (sbcset
, name
[0]);
3028 re_token_t br_token
;
3029 re_bitset_ptr_t sbcset
;
3030 #ifdef RE_ENABLE_I18N
3031 re_charset_t
*mbcset
;
3032 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3033 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3034 #endif /* not RE_ENABLE_I18N */
3036 bin_tree_t
*work_tree
;
3038 int first_round
= 1;
3040 collseqmb
= (const unsigned char *)
3041 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3042 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3048 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3049 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3050 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3051 _NL_COLLATE_SYMB_TABLEMB
);
3052 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3053 _NL_COLLATE_SYMB_EXTRAMB
);
3056 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3057 #ifdef RE_ENABLE_I18N
3058 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3059 #endif /* RE_ENABLE_I18N */
3060 #ifdef RE_ENABLE_I18N
3061 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3063 if (BE (sbcset
== NULL
, 0))
3064 #endif /* RE_ENABLE_I18N */
3067 #ifdef RE_ENABLE_I18N
3074 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3075 if (BE (token
->type
== END_OF_RE
, 0))
3078 goto parse_bracket_exp_free_return
;
3080 if (token
->type
== OP_NON_MATCH_LIST
)
3082 #ifdef RE_ENABLE_I18N
3083 mbcset
->non_match
= 1;
3084 #endif /* not RE_ENABLE_I18N */
3086 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3087 bitset_set (sbcset
, '\n');
3088 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3089 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3090 if (BE (token
->type
== END_OF_RE
, 0))
3093 goto parse_bracket_exp_free_return
;
3097 /* We treat the first ']' as a normal character. */
3098 if (token
->type
== OP_CLOSE_BRACKET
)
3099 token
->type
= CHARACTER
;
3103 bracket_elem_t start_elem
, end_elem
;
3104 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3105 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3107 int token_len2
= 0, is_range_exp
= 0;
3110 start_elem
.opr
.name
= start_name_buf
;
3111 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3112 syntax
, first_round
);
3113 if (BE (ret
!= REG_NOERROR
, 0))
3116 goto parse_bracket_exp_free_return
;
3120 /* Get information about the next token. We need it in any case. */
3121 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3123 /* Do not check for ranges if we know they are not allowed. */
3124 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3126 if (BE (token
->type
== END_OF_RE
, 0))
3129 goto parse_bracket_exp_free_return
;
3131 if (token
->type
== OP_CHARSET_RANGE
)
3133 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3134 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3135 if (BE (token2
.type
== END_OF_RE
, 0))
3138 goto parse_bracket_exp_free_return
;
3140 if (token2
.type
== OP_CLOSE_BRACKET
)
3142 /* We treat the last '-' as a normal character. */
3143 re_string_skip_bytes (regexp
, -token_len
);
3144 token
->type
= CHARACTER
;
3151 if (is_range_exp
== 1)
3153 end_elem
.opr
.name
= end_name_buf
;
3154 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3156 if (BE (ret
!= REG_NOERROR
, 0))
3159 goto parse_bracket_exp_free_return
;
3162 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3165 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3166 &start_elem
, &end_elem
);
3168 # ifdef RE_ENABLE_I18N
3169 *err
= build_range_exp (sbcset
,
3170 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3171 &range_alloc
, &start_elem
, &end_elem
);
3173 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3175 #endif /* RE_ENABLE_I18N */
3176 if (BE (*err
!= REG_NOERROR
, 0))
3177 goto parse_bracket_exp_free_return
;
3181 switch (start_elem
.type
)
3184 bitset_set (sbcset
, start_elem
.opr
.ch
);
3186 #ifdef RE_ENABLE_I18N
3188 /* Check whether the array has enough space. */
3189 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3191 wchar_t *new_mbchars
;
3192 /* Not enough, realloc it. */
3193 /* +1 in case of mbcset->nmbchars is 0. */
3194 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3195 /* Use realloc since array is NULL if *alloc == 0. */
3196 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3198 if (BE (new_mbchars
== NULL
, 0))
3199 goto parse_bracket_exp_espace
;
3200 mbcset
->mbchars
= new_mbchars
;
3202 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3204 #endif /* RE_ENABLE_I18N */
3206 *err
= build_equiv_class (sbcset
,
3207 #ifdef RE_ENABLE_I18N
3208 mbcset
, &equiv_class_alloc
,
3209 #endif /* RE_ENABLE_I18N */
3210 start_elem
.opr
.name
);
3211 if (BE (*err
!= REG_NOERROR
, 0))
3212 goto parse_bracket_exp_free_return
;
3215 *err
= build_collating_symbol (sbcset
,
3216 #ifdef RE_ENABLE_I18N
3217 mbcset
, &coll_sym_alloc
,
3218 #endif /* RE_ENABLE_I18N */
3219 start_elem
.opr
.name
);
3220 if (BE (*err
!= REG_NOERROR
, 0))
3221 goto parse_bracket_exp_free_return
;
3224 *err
= build_charclass (regexp
->trans
, sbcset
,
3225 #ifdef RE_ENABLE_I18N
3226 mbcset
, &char_class_alloc
,
3227 #endif /* RE_ENABLE_I18N */
3228 start_elem
.opr
.name
, syntax
);
3229 if (BE (*err
!= REG_NOERROR
, 0))
3230 goto parse_bracket_exp_free_return
;
3237 if (BE (token
->type
== END_OF_RE
, 0))
3240 goto parse_bracket_exp_free_return
;
3242 if (token
->type
== OP_CLOSE_BRACKET
)
3246 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3248 /* If it is non-matching list. */
3250 bitset_not (sbcset
);
3252 #ifdef RE_ENABLE_I18N
3253 /* Ensure only single byte characters are set. */
3254 if (dfa
->mb_cur_max
> 1)
3255 bitset_mask (sbcset
, dfa
->sb_char
);
3257 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3258 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3259 || mbcset
->non_match
)))
3261 bin_tree_t
*mbc_tree
;
3263 /* Build a tree for complex bracket. */
3264 dfa
->has_mb_node
= 1;
3265 br_token
.type
= COMPLEX_BRACKET
;
3266 br_token
.opr
.mbcset
= mbcset
;
3267 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3268 if (BE (mbc_tree
== NULL
, 0))
3269 goto parse_bracket_exp_espace
;
3270 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3271 if (sbcset
[sbc_idx
])
3273 /* If there are no bits set in sbcset, there is no point
3274 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3275 if (sbc_idx
< BITSET_WORDS
)
3277 /* Build a tree for simple bracket. */
3278 br_token
.type
= SIMPLE_BRACKET
;
3279 br_token
.opr
.sbcset
= sbcset
;
3280 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3281 if (BE (work_tree
== NULL
, 0))
3282 goto parse_bracket_exp_espace
;
3284 /* Then join them by ALT node. */
3285 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3286 if (BE (work_tree
== NULL
, 0))
3287 goto parse_bracket_exp_espace
;
3292 work_tree
= mbc_tree
;
3296 #endif /* not RE_ENABLE_I18N */
3298 #ifdef RE_ENABLE_I18N
3299 free_charset (mbcset
);
3301 /* Build a tree for simple bracket. */
3302 br_token
.type
= SIMPLE_BRACKET
;
3303 br_token
.opr
.sbcset
= sbcset
;
3304 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3305 if (BE (work_tree
== NULL
, 0))
3306 goto parse_bracket_exp_espace
;
3310 parse_bracket_exp_espace
:
3312 parse_bracket_exp_free_return
:
3314 #ifdef RE_ENABLE_I18N
3315 free_charset (mbcset
);
3316 #endif /* RE_ENABLE_I18N */
3320 /* Parse an element in the bracket expression. */
3322 static reg_errcode_t
3323 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3324 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3325 reg_syntax_t syntax
, int accept_hyphen
)
3327 #ifdef RE_ENABLE_I18N
3329 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3330 if (cur_char_size
> 1)
3332 elem
->type
= MB_CHAR
;
3333 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3334 re_string_skip_bytes (regexp
, cur_char_size
);
3337 #endif /* RE_ENABLE_I18N */
3338 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3339 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3340 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3341 return parse_bracket_symbol (elem
, regexp
, token
);
3342 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3344 /* A '-' must only appear as anything but a range indicator before
3345 the closing bracket. Everything else is an error. */
3347 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3348 if (token2
.type
!= OP_CLOSE_BRACKET
)
3349 /* The actual error value is not standardized since this whole
3350 case is undefined. But ERANGE makes good sense. */
3353 elem
->type
= SB_CHAR
;
3354 elem
->opr
.ch
= token
->opr
.c
;
3358 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3359 such as [:<character_class>:], [.<collating_element>.], and
3360 [=<equivalent_class>=]. */
3362 static reg_errcode_t
3363 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3366 unsigned char ch
, delim
= token
->opr
.c
;
3368 if (re_string_eoi(regexp
))
3372 if (i
>= BRACKET_NAME_BUF_SIZE
)
3374 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3375 ch
= re_string_fetch_byte_case (regexp
);
3377 ch
= re_string_fetch_byte (regexp
);
3378 if (re_string_eoi(regexp
))
3380 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3382 elem
->opr
.name
[i
] = ch
;
3384 re_string_skip_bytes (regexp
, 1);
3385 elem
->opr
.name
[i
] = '\0';
3386 switch (token
->type
)
3388 case OP_OPEN_COLL_ELEM
:
3389 elem
->type
= COLL_SYM
;
3391 case OP_OPEN_EQUIV_CLASS
:
3392 elem
->type
= EQUIV_CLASS
;
3394 case OP_OPEN_CHAR_CLASS
:
3395 elem
->type
= CHAR_CLASS
;
3403 /* Helper function for parse_bracket_exp.
3404 Build the equivalence class which is represented by NAME.
3405 The result are written to MBCSET and SBCSET.
3406 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3407 is a pointer argument sinse we may update it. */
3409 static reg_errcode_t
3410 #ifdef RE_ENABLE_I18N
3411 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3412 int *equiv_class_alloc
, const unsigned char *name
)
3413 #else /* not RE_ENABLE_I18N */
3414 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3415 #endif /* not RE_ENABLE_I18N */
3418 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3421 const int32_t *table
, *indirect
;
3422 const unsigned char *weights
, *extra
, *cp
;
3423 unsigned char char_buf
[2];
3427 /* This #include defines a local function! */
3428 # include <locale/weight.h>
3429 /* Calculate the index for equivalence class. */
3431 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3432 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3433 _NL_COLLATE_WEIGHTMB
);
3434 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3435 _NL_COLLATE_EXTRAMB
);
3436 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3437 _NL_COLLATE_INDIRECTMB
);
3438 idx1
= findidx (&cp
, -1);
3439 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3440 /* This isn't a valid character. */
3441 return REG_ECOLLATE
;
3443 /* Build single byte matcing table for this equivalence class. */
3444 len
= weights
[idx1
& 0xffffff];
3445 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3449 idx2
= findidx (&cp
, 1);
3454 /* This isn't a valid character. */
3456 /* Compare only if the length matches and the collation rule
3457 index is the same. */
3458 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3462 while (cnt
<= len
&&
3463 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3464 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3468 bitset_set (sbcset
, ch
);
3471 /* Check whether the array has enough space. */
3472 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3474 /* Not enough, realloc it. */
3475 /* +1 in case of mbcset->nequiv_classes is 0. */
3476 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3477 /* Use realloc since the array is NULL if *alloc == 0. */
3478 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3480 new_equiv_class_alloc
);
3481 if (BE (new_equiv_classes
== NULL
, 0))
3483 mbcset
->equiv_classes
= new_equiv_classes
;
3484 *equiv_class_alloc
= new_equiv_class_alloc
;
3486 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3491 if (BE (strlen ((const char *) name
) != 1, 0))
3492 return REG_ECOLLATE
;
3493 bitset_set (sbcset
, *name
);
3498 /* Helper function for parse_bracket_exp.
3499 Build the character class which is represented by NAME.
3500 The result are written to MBCSET and SBCSET.
3501 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3502 is a pointer argument sinse we may update it. */
3504 static reg_errcode_t
3505 #ifdef RE_ENABLE_I18N
3506 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3507 re_charset_t
*mbcset
, int *char_class_alloc
,
3508 const unsigned char *class_name
, reg_syntax_t syntax
)
3509 #else /* not RE_ENABLE_I18N */
3510 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3511 const unsigned char *class_name
, reg_syntax_t syntax
)
3512 #endif /* not RE_ENABLE_I18N */
3515 const char *name
= (const char *) class_name
;
3517 /* In case of REG_ICASE "upper" and "lower" match the both of
3518 upper and lower cases. */
3519 if ((syntax
& RE_ICASE
)
3520 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3523 #ifdef RE_ENABLE_I18N
3524 /* Check the space of the arrays. */
3525 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3527 /* Not enough, realloc it. */
3528 /* +1 in case of mbcset->nchar_classes is 0. */
3529 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3530 /* Use realloc since array is NULL if *alloc == 0. */
3531 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3532 new_char_class_alloc
);
3533 if (BE (new_char_classes
== NULL
, 0))
3535 mbcset
->char_classes
= new_char_classes
;
3536 *char_class_alloc
= new_char_class_alloc
;
3538 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3539 #endif /* RE_ENABLE_I18N */
3541 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3543 if (BE (trans != NULL, 0)) \
3545 for (i = 0; i < SBC_MAX; ++i) \
3546 if (ctype_func (i)) \
3547 bitset_set (sbcset, trans[i]); \
3551 for (i = 0; i < SBC_MAX; ++i) \
3552 if (ctype_func (i)) \
3553 bitset_set (sbcset, i); \
3557 if (strcmp (name
, "alnum") == 0)
3558 BUILD_CHARCLASS_LOOP (isalnum
);
3559 else if (strcmp (name
, "cntrl") == 0)
3560 BUILD_CHARCLASS_LOOP (iscntrl
);
3561 else if (strcmp (name
, "lower") == 0)
3562 BUILD_CHARCLASS_LOOP (islower
);
3563 else if (strcmp (name
, "space") == 0)
3564 BUILD_CHARCLASS_LOOP (isspace
);
3565 else if (strcmp (name
, "alpha") == 0)
3566 BUILD_CHARCLASS_LOOP (isalpha
);
3567 else if (strcmp (name
, "digit") == 0)
3568 BUILD_CHARCLASS_LOOP (isdigit
);
3569 else if (strcmp (name
, "print") == 0)
3570 BUILD_CHARCLASS_LOOP (isprint
);
3571 else if (strcmp (name
, "upper") == 0)
3572 BUILD_CHARCLASS_LOOP (isupper
);
3573 else if (strcmp (name
, "blank") == 0)
3574 BUILD_CHARCLASS_LOOP (isblank
);
3575 else if (strcmp (name
, "graph") == 0)
3576 BUILD_CHARCLASS_LOOP (isgraph
);
3577 else if (strcmp (name
, "punct") == 0)
3578 BUILD_CHARCLASS_LOOP (ispunct
);
3579 else if (strcmp (name
, "xdigit") == 0)
3580 BUILD_CHARCLASS_LOOP (isxdigit
);
3588 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3589 const unsigned char *class_name
,
3590 const unsigned char *extra
, int non_match
,
3593 re_bitset_ptr_t sbcset
;
3594 #ifdef RE_ENABLE_I18N
3595 re_charset_t
*mbcset
;
3597 #endif /* not RE_ENABLE_I18N */
3599 re_token_t br_token
;
3602 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3603 #ifdef RE_ENABLE_I18N
3604 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3605 #endif /* RE_ENABLE_I18N */
3607 #ifdef RE_ENABLE_I18N
3608 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3609 #else /* not RE_ENABLE_I18N */
3610 if (BE (sbcset
== NULL
, 0))
3611 #endif /* not RE_ENABLE_I18N */
3619 #ifdef RE_ENABLE_I18N
3620 mbcset
->non_match
= 1;
3621 #endif /* not RE_ENABLE_I18N */
3624 /* We don't care the syntax in this case. */
3625 ret
= build_charclass (trans
, sbcset
,
3626 #ifdef RE_ENABLE_I18N
3628 #endif /* RE_ENABLE_I18N */
3631 if (BE (ret
!= REG_NOERROR
, 0))
3634 #ifdef RE_ENABLE_I18N
3635 free_charset (mbcset
);
3636 #endif /* RE_ENABLE_I18N */
3640 /* \w match '_' also. */
3641 for (; *extra
; extra
++)
3642 bitset_set (sbcset
, *extra
);
3644 /* If it is non-matching list. */
3646 bitset_not (sbcset
);
3648 #ifdef RE_ENABLE_I18N
3649 /* Ensure only single byte characters are set. */
3650 if (dfa
->mb_cur_max
> 1)
3651 bitset_mask (sbcset
, dfa
->sb_char
);
3654 /* Build a tree for simple bracket. */
3655 br_token
.type
= SIMPLE_BRACKET
;
3656 br_token
.opr
.sbcset
= sbcset
;
3657 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3658 if (BE (tree
== NULL
, 0))
3659 goto build_word_op_espace
;
3661 #ifdef RE_ENABLE_I18N
3662 if (dfa
->mb_cur_max
> 1)
3664 bin_tree_t
*mbc_tree
;
3665 /* Build a tree for complex bracket. */
3666 br_token
.type
= COMPLEX_BRACKET
;
3667 br_token
.opr
.mbcset
= mbcset
;
3668 dfa
->has_mb_node
= 1;
3669 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3670 if (BE (mbc_tree
== NULL
, 0))
3671 goto build_word_op_espace
;
3672 /* Then join them by ALT node. */
3673 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3674 if (BE (mbc_tree
!= NULL
, 1))
3679 free_charset (mbcset
);
3682 #else /* not RE_ENABLE_I18N */
3684 #endif /* not RE_ENABLE_I18N */
3686 build_word_op_espace
:
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset
);
3690 #endif /* RE_ENABLE_I18N */
3695 /* This is intended for the expressions like "a{1,3}".
3696 Fetch a number from `input', and return the number.
3697 Return -1, if the number field is empty like "{,1}".
3698 Return -2, If an error is occured. */
3701 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3707 fetch_token (token
, input
, syntax
);
3709 if (BE (token
->type
== END_OF_RE
, 0))
3711 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3713 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3714 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3715 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3720 #ifdef RE_ENABLE_I18N
3722 free_charset (re_charset_t
*cset
)
3724 re_free (cset
->mbchars
);
3726 re_free (cset
->coll_syms
);
3727 re_free (cset
->equiv_classes
);
3728 re_free (cset
->range_starts
);
3729 re_free (cset
->range_ends
);
3731 re_free (cset
->char_classes
);
3734 #endif /* RE_ENABLE_I18N */
3736 /* Functions for binary tree operation. */
3738 /* Create a tree node. */
3741 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3742 re_token_type_t type
)
3746 return create_token_tree (dfa
, left
, right
, &t
);
3750 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3751 const re_token_t
*token
)
3754 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3756 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3758 if (storage
== NULL
)
3760 storage
->next
= dfa
->str_tree_storage
;
3761 dfa
->str_tree_storage
= storage
;
3762 dfa
->str_tree_storage_idx
= 0;
3764 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3766 tree
->parent
= NULL
;
3768 tree
->right
= right
;
3769 tree
->token
= *token
;
3770 tree
->token
.duplicated
= 0;
3771 tree
->token
.opt_subexp
= 0;
3774 tree
->node_idx
= -1;
3777 left
->parent
= tree
;
3779 right
->parent
= tree
;
3783 /* Mark the tree SRC as an optional subexpression.
3784 To be called from preorder or postorder. */
3786 static reg_errcode_t
3787 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3789 int idx
= (int) (long) extra
;
3790 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3791 node
->token
.opt_subexp
= 1;
3796 /* Free the allocated memory inside NODE. */
3799 free_token (re_token_t
*node
)
3801 #ifdef RE_ENABLE_I18N
3802 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3803 free_charset (node
->opr
.mbcset
);
3805 #endif /* RE_ENABLE_I18N */
3806 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3807 re_free (node
->opr
.sbcset
);
3810 /* Worker function for tree walking. Free the allocated memory inside NODE
3811 and its children. */
3813 static reg_errcode_t
3814 free_tree (void *extra
, bin_tree_t
*node
)
3816 free_token (&node
->token
);
3821 /* Duplicate the node SRC, and return new node. This is a preorder
3822 visit similar to the one implemented by the generic visitor, but
3823 we need more infrastructure to maintain two parallel trees --- so,
3824 it's easier to duplicate. */
3827 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3829 const bin_tree_t
*node
;
3830 bin_tree_t
*dup_root
;
3831 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3833 for (node
= root
; ; )
3835 /* Create a new tree and link it back to the current parent. */
3836 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3839 (*p_new
)->parent
= dup_node
;
3840 (*p_new
)->token
.duplicated
= 1;
3843 /* Go to the left node, or up and to the right. */
3847 p_new
= &dup_node
->left
;
3851 const bin_tree_t
*prev
= NULL
;
3852 while (node
->right
== prev
|| node
->right
== NULL
)
3855 node
= node
->parent
;
3856 dup_node
= dup_node
->parent
;
3861 p_new
= &dup_node
->right
;