1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007,2009
3 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, write to the Free
19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
23 size_t length
, reg_syntax_t syntax
);
24 static void re_compile_fastmap_iter (regex_t
*bufp
,
25 const re_dfastate_t
*init_state
,
27 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
preorder (bin_tree_t
*root
,
38 reg_errcode_t (fn (void *, bin_tree_t
*)),
40 static reg_errcode_t
postorder (bin_tree_t
*root
,
41 reg_errcode_t (fn (void *, bin_tree_t
*)),
43 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
44 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
45 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
47 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
50 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
51 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
52 unsigned int constraint
);
53 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
54 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
56 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
57 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
59 static int peek_token (re_token_t
*token
, re_string_t
*input
,
60 reg_syntax_t syntax
) internal_function
;
61 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
62 reg_syntax_t syntax
, reg_errcode_t
*err
);
63 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
64 re_token_t
*token
, reg_syntax_t syntax
,
65 int nest
, reg_errcode_t
*err
);
66 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
67 re_token_t
*token
, reg_syntax_t syntax
,
68 int nest
, reg_errcode_t
*err
);
69 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
70 re_token_t
*token
, reg_syntax_t syntax
,
71 int nest
, reg_errcode_t
*err
);
72 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
73 re_token_t
*token
, reg_syntax_t syntax
,
74 int nest
, reg_errcode_t
*err
);
75 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
76 re_dfa_t
*dfa
, re_token_t
*token
,
77 reg_syntax_t syntax
, reg_errcode_t
*err
);
78 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
79 re_token_t
*token
, reg_syntax_t syntax
,
81 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
83 re_token_t
*token
, int token_len
,
87 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
91 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
93 int *equiv_class_alloc
,
94 const unsigned char *name
);
95 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
98 int *char_class_alloc
,
99 const unsigned char *class_name
,
100 reg_syntax_t syntax
);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
103 const unsigned char *name
);
104 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
106 const unsigned char *class_name
,
107 reg_syntax_t syntax
);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
110 RE_TRANSLATE_TYPE trans
,
111 const unsigned char *class_name
,
112 const unsigned char *extra
,
113 int non_match
, reg_errcode_t
*err
);
114 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
115 bin_tree_t
*left
, bin_tree_t
*right
,
116 re_token_type_t type
);
117 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
118 bin_tree_t
*left
, bin_tree_t
*right
,
119 const re_token_t
*token
);
120 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
121 static void free_token (re_token_t
*node
);
122 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
123 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 const char __re_error_msgid
[] attribute_hidden
=
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 const size_t __re_error_msgid_idx
[] attribute_hidden
=
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
215 re_compile_pattern (pattern
, length
, bufp
)
218 struct re_pattern_buffer
*bufp
;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
237 weak_alias (__re_compile_pattern
, re_compile_pattern
)
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options
;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (syntax
)
259 reg_syntax_t ret
= re_syntax_options
;
261 re_syntax_options
= syntax
;
265 weak_alias (__re_set_syntax
, re_set_syntax
)
269 re_compile_fastmap (bufp
)
270 struct re_pattern_buffer
*bufp
;
272 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
273 char *fastmap
= bufp
->fastmap
;
275 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
276 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_word
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_nl
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
281 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
282 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
283 bufp
->fastmap_accurate
= 1;
287 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
291 __attribute ((always_inline
))
292 re_set_fastmap (char *fastmap
, int icase
, int ch
)
296 fastmap
[tolower (ch
)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
303 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
306 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
308 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
309 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
311 int node
= init_state
->nodes
.elems
[node_cnt
];
312 re_token_type_t type
= dfa
->nodes
[node
].type
;
314 if (type
== CHARACTER
)
316 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
320 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
325 *p
++ = dfa
->nodes
[node
].opr
.c
;
326 while (++node
< dfa
->nodes_len
327 && dfa
->nodes
[node
].type
== CHARACTER
328 && dfa
->nodes
[node
].mb_partial
)
329 *p
++ = dfa
->nodes
[node
].opr
.c
;
330 memset (&state
, '\0', sizeof (state
));
331 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
333 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
335 re_set_fastmap (fastmap
, 0, buf
[0]);
339 else if (type
== SIMPLE_BRACKET
)
342 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
345 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
346 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
347 if (w
& ((bitset_word_t
) 1 << j
))
348 re_set_fastmap (fastmap
, icase
, ch
);
351 #ifdef RE_ENABLE_I18N
352 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
358 /* See if we have to try all bytes which start multiple collation
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
365 && (cset
->ncoll_syms
|| cset
->nranges
))
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa
->mb_cur_max
> 1
380 && (cset
->nchar_classes
|| cset
->non_match
382 || cset
->nequiv_classes
390 memset (&mbs
, 0, sizeof (mbs
));
391 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
392 re_set_fastmap (fastmap
, false, (int) c
);
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i
= 0; i
< cset
->nmbchars
; ++i
)
404 memset (&state
, '\0', sizeof (state
));
405 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
406 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
407 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
409 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
411 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
416 #endif /* RE_ENABLE_I18N */
417 else if (type
== OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type
== OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type
== END_OF_RE
)
423 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
424 if (type
== END_OF_RE
)
425 bufp
->can_be_null
= 1;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 `buffer' to the compiled pattern;
438 `used' to the length of the compiled pattern;
439 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 `fastmap' to an allocated space for the fastmap;
444 `fastmap_accurate' to zero;
445 `re_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (preg
, pattern
, cflags
)
469 regex_t
*__restrict preg
;
470 const char *__restrict pattern
;
474 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
475 : RE_SYNTAX_POSIX_BASIC
);
481 /* Try to allocate space for the fastmap. */
482 preg
->fastmap
= re_malloc (char, SBC_MAX
);
483 if (BE (preg
->fastmap
== NULL
, 0))
486 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
488 /* If REG_NEWLINE is set, newlines are treated differently. */
489 if (cflags
& REG_NEWLINE
)
490 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
491 syntax
&= ~RE_DOT_NEWLINE
;
492 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
493 /* It also changes the matching behavior. */
494 preg
->newline_anchor
= 1;
497 preg
->newline_anchor
= 0;
498 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
499 preg
->translate
= NULL
;
501 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
503 /* POSIX doesn't distinguish between an unmatched open-group and an
504 unmatched close-group: both are REG_EPAREN. */
505 if (ret
== REG_ERPAREN
)
508 /* We have already checked preg->fastmap != NULL. */
509 if (BE (ret
== REG_NOERROR
, 1))
510 /* Compute the fastmap now, since regexec cannot modify the pattern
511 buffer. This function never fails in this implementation. */
512 (void) re_compile_fastmap (preg
);
515 /* Some error occurred while compiling the expression. */
516 re_free (preg
->fastmap
);
517 preg
->fastmap
= NULL
;
523 weak_alias (__regcomp
, regcomp
)
526 /* Returns a message corresponding to an error code, ERRCODE, returned
527 from either regcomp or regexec. We don't use PREG here. */
530 regerror (errcode
, preg
, errbuf
, errbuf_size
)
532 const regex_t
*__restrict preg
;
533 char *__restrict errbuf
;
540 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
541 / sizeof (__re_error_msgid_idx
[0])), 0))
542 /* Only error codes returned by the rest of the code should be passed
543 to this routine. If we are given anything else, or if other regex
544 code generates an invalid error code, then the program has a bug.
545 Dump core so we can fix it. */
548 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
550 msg_size
= strlen (msg
) + 1; /* Includes the null. */
552 if (BE (errbuf_size
!= 0, 1))
554 if (BE (msg_size
> errbuf_size
, 0))
556 #if defined HAVE_MEMPCPY || defined _LIBC
557 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
559 memcpy (errbuf
, msg
, errbuf_size
- 1);
560 errbuf
[errbuf_size
- 1] = 0;
564 memcpy (errbuf
, msg
, msg_size
);
570 weak_alias (__regerror
, regerror
)
574 #ifdef RE_ENABLE_I18N
575 /* This static array is used for the map to single-byte characters when
576 UTF-8 is used. Otherwise we would allocate memory just to initialize
577 it the same all the time. UTF-8 is the preferred encoding so this is
578 a worthwhile optimization. */
579 static const bitset_t utf8_sb_map
=
581 /* Set the first 128 bits. */
582 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
588 free_dfa_content (re_dfa_t
*dfa
)
593 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
594 free_token (dfa
->nodes
+ i
);
595 re_free (dfa
->nexts
);
596 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
598 if (dfa
->eclosures
!= NULL
)
599 re_node_set_free (dfa
->eclosures
+ i
);
600 if (dfa
->inveclosures
!= NULL
)
601 re_node_set_free (dfa
->inveclosures
+ i
);
602 if (dfa
->edests
!= NULL
)
603 re_node_set_free (dfa
->edests
+ i
);
605 re_free (dfa
->edests
);
606 re_free (dfa
->eclosures
);
607 re_free (dfa
->inveclosures
);
608 re_free (dfa
->nodes
);
610 if (dfa
->state_table
)
611 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
613 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
614 for (j
= 0; j
< entry
->num
; ++j
)
616 re_dfastate_t
*state
= entry
->array
[j
];
619 re_free (entry
->array
);
621 re_free (dfa
->state_table
);
622 #ifdef RE_ENABLE_I18N
623 if (dfa
->sb_char
!= utf8_sb_map
)
624 re_free (dfa
->sb_char
);
626 re_free (dfa
->subexp_map
);
628 re_free (dfa
->re_str
);
635 /* Free dynamically allocated space used by PREG. */
641 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
642 if (BE (dfa
!= NULL
, 1))
643 free_dfa_content (dfa
);
647 re_free (preg
->fastmap
);
648 preg
->fastmap
= NULL
;
650 re_free (preg
->translate
);
651 preg
->translate
= NULL
;
654 weak_alias (__regfree
, regfree
)
657 /* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
660 #if defined _REGEX_RE_COMP || defined _LIBC
662 /* BSD has one and only one pattern buffer. */
663 static struct re_pattern_buffer re_comp_buf
;
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
680 if (!re_comp_buf
.buffer
)
681 return gettext ("No previous regular expression");
685 if (re_comp_buf
.buffer
)
687 fastmap
= re_comp_buf
.fastmap
;
688 re_comp_buf
.fastmap
= NULL
;
689 __regfree (&re_comp_buf
);
690 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
691 re_comp_buf
.fastmap
= fastmap
;
694 if (re_comp_buf
.fastmap
== NULL
)
696 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
697 if (re_comp_buf
.fastmap
== NULL
)
698 return (char *) gettext (__re_error_msgid
699 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
702 /* Since `re_exec' always passes NULL for the `regs' argument, we
703 don't need to initialize the pattern buffer fields which affect it. */
705 /* Match anchors at newlines. */
706 re_comp_buf
.newline_anchor
= 1;
708 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
718 libc_freeres_fn (free_mem
)
720 __regfree (&re_comp_buf
);
724 #endif /* _REGEX_RE_COMP */
726 /* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
731 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
734 reg_errcode_t err
= REG_NOERROR
;
738 /* Initialize the pattern buffer. */
739 preg
->fastmap_accurate
= 0;
740 preg
->syntax
= syntax
;
741 preg
->not_bol
= preg
->not_eol
= 0;
744 preg
->can_be_null
= 0;
745 preg
->regs_allocated
= REGS_UNALLOCATED
;
747 /* Initialize the dfa. */
748 dfa
= (re_dfa_t
*) preg
->buffer
;
749 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
758 preg
->allocated
= sizeof (re_dfa_t
);
759 preg
->buffer
= (unsigned char *) dfa
;
761 preg
->used
= sizeof (re_dfa_t
);
763 err
= init_dfa (dfa
, length
);
764 if (BE (err
!= REG_NOERROR
, 0))
766 free_dfa_content (dfa
);
772 /* Note: length+1 will not overflow since it is checked in init_dfa. */
773 dfa
->re_str
= re_malloc (char, length
+ 1);
774 strncpy (dfa
->re_str
, pattern
, length
+ 1);
777 __libc_lock_init (dfa
->lock
);
779 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
780 syntax
& RE_ICASE
, dfa
);
781 if (BE (err
!= REG_NOERROR
, 0))
783 re_compile_internal_free_return
:
784 free_workarea_compile (preg
);
785 re_string_destruct (®exp
);
786 free_dfa_content (dfa
);
792 /* Parse the regular expression, and build a structure tree. */
794 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
795 if (BE (dfa
->str_tree
== NULL
, 0))
796 goto re_compile_internal_free_return
;
798 /* Analyze the tree and create the nfa. */
799 err
= analyze (preg
);
800 if (BE (err
!= REG_NOERROR
, 0))
801 goto re_compile_internal_free_return
;
803 #ifdef RE_ENABLE_I18N
804 /* If possible, do searching in single byte encoding to speed things up. */
805 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
809 /* Then create the initial state of the dfa. */
810 err
= create_initial_state (dfa
);
812 /* Release work areas. */
813 free_workarea_compile (preg
);
814 re_string_destruct (®exp
);
816 if (BE (err
!= REG_NOERROR
, 0))
818 free_dfa_content (dfa
);
826 /* Initialize DFA. We use the length of the regular expression PAT_LEN
827 as the initial length of some arrays. */
830 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
832 unsigned int table_size
;
837 memset (dfa
, '\0', sizeof (re_dfa_t
));
839 /* Force allocation of str_tree_storage the first time. */
840 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
842 /* Avoid overflows. */
843 if (pat_len
== SIZE_MAX
)
846 dfa
->nodes_alloc
= pat_len
+ 1;
847 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
849 /* table_size = 2 ^ ceil(log pat_len) */
850 for (table_size
= 1; ; table_size
<<= 1)
851 if (table_size
> pat_len
)
854 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
855 dfa
->state_hash_mask
= table_size
- 1;
857 dfa
->mb_cur_max
= MB_CUR_MAX
;
859 if (dfa
->mb_cur_max
== 6
860 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
862 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
865 # ifdef HAVE_LANGINFO_CODESET
866 codeset_name
= nl_langinfo (CODESET
);
868 codeset_name
= getenv ("LC_ALL");
869 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
870 codeset_name
= getenv ("LC_CTYPE");
871 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
872 codeset_name
= getenv ("LANG");
873 if (codeset_name
== NULL
)
875 else if (strchr (codeset_name
, '.') != NULL
)
876 codeset_name
= strchr (codeset_name
, '.') + 1;
879 if (strcasecmp (codeset_name
, "UTF-8") == 0
880 || strcasecmp (codeset_name
, "UTF8") == 0)
883 /* We check exhaustively in the loop below if this charset is a
884 superset of ASCII. */
885 dfa
->map_notascii
= 0;
888 #ifdef RE_ENABLE_I18N
889 if (dfa
->mb_cur_max
> 1)
892 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
897 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
898 if (BE (dfa
->sb_char
== NULL
, 0))
901 /* Set the bits corresponding to single byte chars. */
902 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
903 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
905 wint_t wch
= __btowc (ch
);
907 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
909 if (isascii (ch
) && wch
!= ch
)
910 dfa
->map_notascii
= 1;
917 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
922 /* Initialize WORD_CHAR table, which indicate which character is
923 "word". In this case "word" means that it is the word construction
924 character used by some operators like "\<", "\>", etc. */
928 init_word_char (re_dfa_t
*dfa
)
931 dfa
->word_ops_used
= 1;
932 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
933 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
934 if (isalnum (ch
) || ch
== '_')
935 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
938 /* Free the work area which are only used while compiling. */
941 free_workarea_compile (regex_t
*preg
)
943 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
944 bin_tree_storage_t
*storage
, *next
;
945 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
947 next
= storage
->next
;
950 dfa
->str_tree_storage
= NULL
;
951 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
952 dfa
->str_tree
= NULL
;
953 re_free (dfa
->org_indices
);
954 dfa
->org_indices
= NULL
;
957 /* Create initial states for all contexts. */
960 create_initial_state (re_dfa_t
*dfa
)
964 re_node_set init_nodes
;
966 /* Initial states have the epsilon closure of the node which is
967 the first node of the regular expression. */
968 first
= dfa
->str_tree
->first
->node_idx
;
969 dfa
->init_node
= first
;
970 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
971 if (BE (err
!= REG_NOERROR
, 0))
974 /* The back-references which are in initial states can epsilon transit,
975 since in this case all of the subexpressions can be null.
976 Then we add epsilon closures of the nodes which are the next nodes of
977 the back-references. */
978 if (dfa
->nbackref
> 0)
979 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
981 int node_idx
= init_nodes
.elems
[i
];
982 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
985 if (type
!= OP_BACK_REF
)
987 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
989 re_token_t
*clexp_node
;
990 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
991 if (clexp_node
->type
== OP_CLOSE_SUBEXP
992 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
995 if (clexp_idx
== init_nodes
.nelem
)
998 if (type
== OP_BACK_REF
)
1000 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1001 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1003 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1009 /* It must be the first time to invoke acquire_state. */
1010 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1011 /* We don't check ERR here, since the initial state must not be NULL. */
1012 if (BE (dfa
->init_state
== NULL
, 0))
1014 if (dfa
->init_state
->has_constraint
)
1016 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1018 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1020 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1024 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1025 || dfa
->init_state_begbuf
== NULL
, 0))
1029 dfa
->init_state_word
= dfa
->init_state_nl
1030 = dfa
->init_state_begbuf
= dfa
->init_state
;
1032 re_node_set_free (&init_nodes
);
1036 #ifdef RE_ENABLE_I18N
1037 /* If it is possible to do searching in single byte encoding instead of UTF-8
1038 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039 DFA nodes where needed. */
1042 optimize_utf8 (re_dfa_t
*dfa
)
1044 int node
, i
, mb_chars
= 0, has_period
= 0;
1046 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1047 switch (dfa
->nodes
[node
].type
)
1050 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1054 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1062 /* Word anchors etc. cannot be handled. It's okay to test
1063 opr.ctx_type since constraints (for all DFA nodes) are
1064 created by ORing one or more opr.ctx_type values. */
1074 case OP_DUP_ASTERISK
:
1075 case OP_OPEN_SUBEXP
:
1076 case OP_CLOSE_SUBEXP
:
1078 case COMPLEX_BRACKET
:
1080 case SIMPLE_BRACKET
:
1081 /* Just double check. The non-ASCII range starts at 0x80. */
1082 assert (0x80 % BITSET_WORD_BITS
== 0);
1083 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1084 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1091 if (mb_chars
|| has_period
)
1092 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1094 if (dfa
->nodes
[node
].type
== CHARACTER
1095 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1096 dfa
->nodes
[node
].mb_partial
= 0;
1097 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1098 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1101 /* The search can be in single byte locale. */
1102 dfa
->mb_cur_max
= 1;
1104 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1108 /* Analyze the structure tree, and calculate "first", "next", "edest",
1109 "eclosure", and "inveclosure". */
1111 static reg_errcode_t
1112 analyze (regex_t
*preg
)
1114 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1117 /* Allocate arrays. */
1118 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1119 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1120 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1121 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1122 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1123 || dfa
->eclosures
== NULL
, 0))
1126 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1127 if (dfa
->subexp_map
!= NULL
)
1130 for (i
= 0; i
< preg
->re_nsub
; i
++)
1131 dfa
->subexp_map
[i
] = i
;
1132 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1133 for (i
= 0; i
< preg
->re_nsub
; i
++)
1134 if (dfa
->subexp_map
[i
] != i
)
1136 if (i
== preg
->re_nsub
)
1138 free (dfa
->subexp_map
);
1139 dfa
->subexp_map
= NULL
;
1143 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1144 if (BE (ret
!= REG_NOERROR
, 0))
1146 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1147 if (BE (ret
!= REG_NOERROR
, 0))
1149 preorder (dfa
->str_tree
, calc_next
, dfa
);
1150 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1151 if (BE (ret
!= REG_NOERROR
, 0))
1153 ret
= calc_eclosure (dfa
);
1154 if (BE (ret
!= REG_NOERROR
, 0))
1157 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1159 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1162 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1163 if (BE (dfa
->inveclosures
== NULL
, 0))
1165 ret
= calc_inveclosure (dfa
);
1171 /* Our parse trees are very unbalanced, so we cannot use a stack to
1172 implement parse tree visits. Instead, we use parent pointers and
1173 some hairy code in these two functions. */
1174 static reg_errcode_t
1175 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1178 bin_tree_t
*node
, *prev
;
1180 for (node
= root
; ; )
1182 /* Descend down the tree, preferably to the left (or to the right
1183 if that's the only child). */
1184 while (node
->left
|| node
->right
)
1192 reg_errcode_t err
= fn (extra
, node
);
1193 if (BE (err
!= REG_NOERROR
, 0))
1195 if (node
->parent
== NULL
)
1198 node
= node
->parent
;
1200 /* Go up while we have a node that is reached from the right. */
1201 while (node
->right
== prev
|| node
->right
== NULL
);
1206 static reg_errcode_t
1207 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1212 for (node
= root
; ; )
1214 reg_errcode_t err
= fn (extra
, node
);
1215 if (BE (err
!= REG_NOERROR
, 0))
1218 /* Go to the left node, or up and to the right. */
1223 bin_tree_t
*prev
= NULL
;
1224 while (node
->right
== prev
|| node
->right
== NULL
)
1227 node
= node
->parent
;
1236 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1238 backreferences as well. Requires a preorder visit. */
1239 static reg_errcode_t
1240 optimize_subexps (void *extra
, bin_tree_t
*node
)
1242 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1244 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1246 int idx
= node
->token
.opr
.idx
;
1247 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1248 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1251 else if (node
->token
.type
== SUBEXP
1252 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1254 int other_idx
= node
->left
->token
.opr
.idx
;
1256 node
->left
= node
->left
->left
;
1258 node
->left
->parent
= node
;
1260 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1261 if (other_idx
< BITSET_WORD_BITS
)
1262 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1268 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1270 static reg_errcode_t
1271 lower_subexps (void *extra
, bin_tree_t
*node
)
1273 regex_t
*preg
= (regex_t
*) extra
;
1274 reg_errcode_t err
= REG_NOERROR
;
1276 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1278 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1280 node
->left
->parent
= node
;
1282 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1284 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1286 node
->right
->parent
= node
;
1293 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1295 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1296 bin_tree_t
*body
= node
->left
;
1297 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1300 /* We do not optimize empty subexpressions, because otherwise we may
1301 have bad CONCAT nodes with NULL children. This is obviously not
1302 very common, so we do not lose much. An example that triggers
1303 this case is the sed "script" /\(\)/x. */
1304 && node
->left
!= NULL
1305 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1306 || !(dfa
->used_bkref_map
1307 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1310 /* Convert the SUBEXP node to the concatenation of an
1311 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1312 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1313 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1314 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1315 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1316 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1322 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1323 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1327 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328 nodes. Requires a postorder visit. */
1329 static reg_errcode_t
1330 calc_first (void *extra
, bin_tree_t
*node
)
1332 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1333 if (node
->token
.type
== CONCAT
)
1335 node
->first
= node
->left
->first
;
1336 node
->node_idx
= node
->left
->node_idx
;
1341 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1342 if (BE (node
->node_idx
== -1, 0))
1344 if (node
->token
.type
== ANCHOR
)
1345 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1350 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1351 static reg_errcode_t
1352 calc_next (void *extra
, bin_tree_t
*node
)
1354 switch (node
->token
.type
)
1356 case OP_DUP_ASTERISK
:
1357 node
->left
->next
= node
;
1360 node
->left
->next
= node
->right
->first
;
1361 node
->right
->next
= node
->next
;
1365 node
->left
->next
= node
->next
;
1367 node
->right
->next
= node
->next
;
1373 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1374 static reg_errcode_t
1375 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1377 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1378 int idx
= node
->node_idx
;
1379 reg_errcode_t err
= REG_NOERROR
;
1381 switch (node
->token
.type
)
1387 assert (node
->next
== NULL
);
1390 case OP_DUP_ASTERISK
:
1394 dfa
->has_plural_match
= 1;
1395 if (node
->left
!= NULL
)
1396 left
= node
->left
->first
->node_idx
;
1398 left
= node
->next
->node_idx
;
1399 if (node
->right
!= NULL
)
1400 right
= node
->right
->first
->node_idx
;
1402 right
= node
->next
->node_idx
;
1404 assert (right
> -1);
1405 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1410 case OP_OPEN_SUBEXP
:
1411 case OP_CLOSE_SUBEXP
:
1412 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1416 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1417 if (node
->token
.type
== OP_BACK_REF
)
1418 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1422 assert (!IS_EPSILON_NODE (node
->token
.type
));
1423 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1430 /* Duplicate the epsilon closure of the node ROOT_NODE.
1431 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432 to their own constraint. */
1434 static reg_errcode_t
1436 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1437 int root_node
, unsigned int init_constraint
)
1439 int org_node
, clone_node
, ret
;
1440 unsigned int constraint
= init_constraint
;
1441 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1443 int org_dest
, clone_dest
;
1444 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1446 /* If the back reference epsilon-transit, its destination must
1447 also have the constraint. Then duplicate the epsilon closure
1448 of the destination of the back reference, and store it in
1449 edests of the back reference. */
1450 org_dest
= dfa
->nexts
[org_node
];
1451 re_node_set_empty (dfa
->edests
+ clone_node
);
1452 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1453 if (BE (clone_dest
== -1, 0))
1455 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1456 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1457 if (BE (ret
< 0, 0))
1460 else if (dfa
->edests
[org_node
].nelem
== 0)
1462 /* In case of the node can't epsilon-transit, don't duplicate the
1463 destination and store the original destination as the
1464 destination of the node. */
1465 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1468 else if (dfa
->edests
[org_node
].nelem
== 1)
1470 /* In case of the node can epsilon-transit, and it has only one
1472 org_dest
= dfa
->edests
[org_node
].elems
[0];
1473 re_node_set_empty (dfa
->edests
+ clone_node
);
1474 /* If the node is root_node itself, it means the epsilon clsoure
1475 has a loop. Then tie it to the destination of the root_node. */
1476 if (org_node
== root_node
&& clone_node
!= org_node
)
1478 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1479 if (BE (ret
< 0, 0))
1483 /* In case of the node has another constraint, add it. */
1484 constraint
|= dfa
->nodes
[org_node
].constraint
;
1485 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1486 if (BE (clone_dest
== -1, 0))
1488 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1489 if (BE (ret
< 0, 0))
1492 else /* dfa->edests[org_node].nelem == 2 */
1494 /* In case of the node can epsilon-transit, and it has two
1495 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1496 org_dest
= dfa
->edests
[org_node
].elems
[0];
1497 re_node_set_empty (dfa
->edests
+ clone_node
);
1498 /* Search for a duplicated node which satisfies the constraint. */
1499 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1500 if (clone_dest
== -1)
1502 /* There is no such duplicated node, create a new one. */
1504 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1505 if (BE (clone_dest
== -1, 0))
1507 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1508 if (BE (ret
< 0, 0))
1510 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1511 root_node
, constraint
);
1512 if (BE (err
!= REG_NOERROR
, 0))
1517 /* There is a duplicated node which satisfies the constraint,
1518 use it to avoid infinite loop. */
1519 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1520 if (BE (ret
< 0, 0))
1524 org_dest
= dfa
->edests
[org_node
].elems
[1];
1525 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1526 if (BE (clone_dest
== -1, 0))
1528 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1529 if (BE (ret
< 0, 0))
1532 org_node
= org_dest
;
1533 clone_node
= clone_dest
;
1538 /* Search for a node which is duplicated from the node ORG_NODE, and
1539 satisfies the constraint CONSTRAINT. */
1542 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1543 unsigned int constraint
)
1546 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1548 if (org_node
== dfa
->org_indices
[idx
]
1549 && constraint
== dfa
->nodes
[idx
].constraint
)
1550 return idx
; /* Found. */
1552 return -1; /* Not found. */
1555 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556 Return the index of the new node, or -1 if insufficient storage is
1560 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1562 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1563 if (BE (dup_idx
!= -1, 1))
1565 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1566 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1567 dfa
->nodes
[dup_idx
].duplicated
= 1;
1569 /* Store the index of the original node. */
1570 dfa
->org_indices
[dup_idx
] = org_idx
;
1575 static reg_errcode_t
1576 calc_inveclosure (re_dfa_t
*dfa
)
1579 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1580 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1582 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1584 int *elems
= dfa
->eclosures
[src
].elems
;
1585 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1587 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1588 if (BE (ret
== -1, 0))
1596 /* Calculate "eclosure" for all the node in DFA. */
1598 static reg_errcode_t
1599 calc_eclosure (re_dfa_t
*dfa
)
1601 int node_idx
, incomplete
;
1603 assert (dfa
->nodes_len
> 0);
1606 /* For each nodes, calculate epsilon closure. */
1607 for (node_idx
= 0; ; ++node_idx
)
1610 re_node_set eclosure_elem
;
1611 if (node_idx
== dfa
->nodes_len
)
1620 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1623 /* If we have already calculated, skip it. */
1624 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1626 /* Calculate epsilon closure of `node_idx'. */
1627 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1628 if (BE (err
!= REG_NOERROR
, 0))
1631 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1634 re_node_set_free (&eclosure_elem
);
1640 /* Calculate epsilon closure of NODE. */
1642 static reg_errcode_t
1643 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1647 re_node_set eclosure
;
1649 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1650 if (BE (err
!= REG_NOERROR
, 0))
1653 /* This indicates that we are calculating this node now.
1654 We reference this value to avoid infinite loop. */
1655 dfa
->eclosures
[node
].nelem
= -1;
1657 /* If the current node has constraints, duplicate all nodes
1658 since they must inherit the constraints. */
1659 if (dfa
->nodes
[node
].constraint
1660 && dfa
->edests
[node
].nelem
1661 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1663 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1664 dfa
->nodes
[node
].constraint
);
1665 if (BE (err
!= REG_NOERROR
, 0))
1669 /* Expand each epsilon destination nodes. */
1670 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1671 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1673 re_node_set eclosure_elem
;
1674 int edest
= dfa
->edests
[node
].elems
[i
];
1675 /* If calculating the epsilon closure of `edest' is in progress,
1676 return intermediate result. */
1677 if (dfa
->eclosures
[edest
].nelem
== -1)
1682 /* If we haven't calculated the epsilon closure of `edest' yet,
1683 calculate now. Otherwise use calculated epsilon closure. */
1684 if (dfa
->eclosures
[edest
].nelem
== 0)
1686 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1687 if (BE (err
!= REG_NOERROR
, 0))
1691 eclosure_elem
= dfa
->eclosures
[edest
];
1692 /* Merge the epsilon closure of `edest'. */
1693 re_node_set_merge (&eclosure
, &eclosure_elem
);
1694 /* If the epsilon closure of `edest' is incomplete,
1695 the epsilon closure of this node is also incomplete. */
1696 if (dfa
->eclosures
[edest
].nelem
== 0)
1699 re_node_set_free (&eclosure_elem
);
1703 /* Epsilon closures include itself. */
1704 re_node_set_insert (&eclosure
, node
);
1705 if (incomplete
&& !root
)
1706 dfa
->eclosures
[node
].nelem
= 0;
1708 dfa
->eclosures
[node
] = eclosure
;
1709 *new_set
= eclosure
;
1713 /* Functions for token which are used in the parser. */
1715 /* Fetch a token from INPUT.
1716 We must not use this function inside bracket expressions. */
1720 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1722 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1725 /* Peek a token from INPUT, and return the length of the token.
1726 We must not use this function inside bracket expressions. */
1730 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1734 if (re_string_eoi (input
))
1736 token
->type
= END_OF_RE
;
1740 c
= re_string_peek_byte (input
, 0);
1743 token
->word_char
= 0;
1744 #ifdef RE_ENABLE_I18N
1745 token
->mb_partial
= 0;
1746 if (input
->mb_cur_max
> 1 &&
1747 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1749 token
->type
= CHARACTER
;
1750 token
->mb_partial
= 1;
1757 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1759 token
->type
= BACK_SLASH
;
1763 c2
= re_string_peek_byte_case (input
, 1);
1765 token
->type
= CHARACTER
;
1766 #ifdef RE_ENABLE_I18N
1767 if (input
->mb_cur_max
> 1)
1769 wint_t wc
= re_string_wchar_at (input
,
1770 re_string_cur_idx (input
) + 1);
1771 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1775 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1780 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1781 token
->type
= OP_ALT
;
1783 case '1': case '2': case '3': case '4': case '5':
1784 case '6': case '7': case '8': case '9':
1785 if (!(syntax
& RE_NO_BK_REFS
))
1787 token
->type
= OP_BACK_REF
;
1788 token
->opr
.idx
= c2
- '1';
1792 if (!(syntax
& RE_NO_GNU_OPS
))
1794 token
->type
= ANCHOR
;
1795 token
->opr
.ctx_type
= WORD_FIRST
;
1799 if (!(syntax
& RE_NO_GNU_OPS
))
1801 token
->type
= ANCHOR
;
1802 token
->opr
.ctx_type
= WORD_LAST
;
1806 if (!(syntax
& RE_NO_GNU_OPS
))
1808 token
->type
= ANCHOR
;
1809 token
->opr
.ctx_type
= WORD_DELIM
;
1813 if (!(syntax
& RE_NO_GNU_OPS
))
1815 token
->type
= ANCHOR
;
1816 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1820 if (!(syntax
& RE_NO_GNU_OPS
))
1821 token
->type
= OP_WORD
;
1824 if (!(syntax
& RE_NO_GNU_OPS
))
1825 token
->type
= OP_NOTWORD
;
1828 if (!(syntax
& RE_NO_GNU_OPS
))
1829 token
->type
= OP_SPACE
;
1832 if (!(syntax
& RE_NO_GNU_OPS
))
1833 token
->type
= OP_NOTSPACE
;
1836 if (!(syntax
& RE_NO_GNU_OPS
))
1838 token
->type
= ANCHOR
;
1839 token
->opr
.ctx_type
= BUF_FIRST
;
1843 if (!(syntax
& RE_NO_GNU_OPS
))
1845 token
->type
= ANCHOR
;
1846 token
->opr
.ctx_type
= BUF_LAST
;
1850 if (!(syntax
& RE_NO_BK_PARENS
))
1851 token
->type
= OP_OPEN_SUBEXP
;
1854 if (!(syntax
& RE_NO_BK_PARENS
))
1855 token
->type
= OP_CLOSE_SUBEXP
;
1858 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1859 token
->type
= OP_DUP_PLUS
;
1862 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1863 token
->type
= OP_DUP_QUESTION
;
1866 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1867 token
->type
= OP_OPEN_DUP_NUM
;
1870 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1871 token
->type
= OP_CLOSE_DUP_NUM
;
1879 token
->type
= CHARACTER
;
1880 #ifdef RE_ENABLE_I18N
1881 if (input
->mb_cur_max
> 1)
1883 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1884 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1888 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1893 if (syntax
& RE_NEWLINE_ALT
)
1894 token
->type
= OP_ALT
;
1897 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1898 token
->type
= OP_ALT
;
1901 token
->type
= OP_DUP_ASTERISK
;
1904 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1905 token
->type
= OP_DUP_PLUS
;
1908 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1909 token
->type
= OP_DUP_QUESTION
;
1912 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1913 token
->type
= OP_OPEN_DUP_NUM
;
1916 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1917 token
->type
= OP_CLOSE_DUP_NUM
;
1920 if (syntax
& RE_NO_BK_PARENS
)
1921 token
->type
= OP_OPEN_SUBEXP
;
1924 if (syntax
& RE_NO_BK_PARENS
)
1925 token
->type
= OP_CLOSE_SUBEXP
;
1928 token
->type
= OP_OPEN_BRACKET
;
1931 token
->type
= OP_PERIOD
;
1934 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1935 re_string_cur_idx (input
) != 0)
1937 char prev
= re_string_peek_byte (input
, -1);
1938 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1941 token
->type
= ANCHOR
;
1942 token
->opr
.ctx_type
= LINE_FIRST
;
1945 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1946 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1949 re_string_skip_bytes (input
, 1);
1950 peek_token (&next
, input
, syntax
);
1951 re_string_skip_bytes (input
, -1);
1952 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1955 token
->type
= ANCHOR
;
1956 token
->opr
.ctx_type
= LINE_LAST
;
1964 /* Peek a token from INPUT, and return the length of the token.
1965 We must not use this function out of bracket expressions. */
1969 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1972 if (re_string_eoi (input
))
1974 token
->type
= END_OF_RE
;
1977 c
= re_string_peek_byte (input
, 0);
1980 #ifdef RE_ENABLE_I18N
1981 if (input
->mb_cur_max
> 1 &&
1982 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1984 token
->type
= CHARACTER
;
1987 #endif /* RE_ENABLE_I18N */
1989 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1990 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1992 /* In this case, '\' escape a character. */
1994 re_string_skip_bytes (input
, 1);
1995 c2
= re_string_peek_byte (input
, 0);
1997 token
->type
= CHARACTER
;
2000 if (c
== '[') /* '[' is a special char in a bracket exps. */
2004 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2005 c2
= re_string_peek_byte (input
, 1);
2013 token
->type
= OP_OPEN_COLL_ELEM
;
2016 token
->type
= OP_OPEN_EQUIV_CLASS
;
2019 if (syntax
& RE_CHAR_CLASSES
)
2021 token
->type
= OP_OPEN_CHAR_CLASS
;
2024 /* else fall through. */
2026 token
->type
= CHARACTER
;
2036 token
->type
= OP_CHARSET_RANGE
;
2039 token
->type
= OP_CLOSE_BRACKET
;
2042 token
->type
= OP_NON_MATCH_LIST
;
2045 token
->type
= CHARACTER
;
2050 /* Functions for parser. */
2052 /* Entry point of the parser.
2053 Parse the regular expression REGEXP and return the structure tree.
2054 If an error is occured, ERR is set by error code, and return NULL.
2055 This function build the following tree, from regular expression <reg_exp>:
2061 CAT means concatenation.
2062 EOR means end of regular expression. */
2065 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2068 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2069 bin_tree_t
*tree
, *eor
, *root
;
2070 re_token_t current_token
;
2071 dfa
->syntax
= syntax
;
2072 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2073 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2074 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2076 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2078 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2081 if (BE (eor
== NULL
|| root
== NULL
, 0))
2089 /* This function build the following tree, from regular expression
2090 <branch1>|<branch2>:
2096 ALT means alternative, which represents the operator `|'. */
2099 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2100 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2102 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2103 bin_tree_t
*tree
, *branch
= NULL
;
2104 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2105 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2108 while (token
->type
== OP_ALT
)
2110 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2111 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2112 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2114 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2115 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2120 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2121 if (BE (tree
== NULL
, 0))
2130 /* This function build the following tree, from regular expression
2137 CAT means concatenation. */
2140 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2141 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2143 bin_tree_t
*tree
, *exp
;
2144 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2145 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2146 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2149 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2150 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2152 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2153 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2157 if (tree
!= NULL
&& exp
!= NULL
)
2159 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2166 else if (tree
== NULL
)
2168 /* Otherwise exp == NULL, we don't need to create new tree. */
2173 /* This function build the following tree, from regular expression a*:
2180 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2181 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2183 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2185 switch (token
->type
)
2188 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2189 if (BE (tree
== NULL
, 0))
2194 #ifdef RE_ENABLE_I18N
2195 if (dfa
->mb_cur_max
> 1)
2197 while (!re_string_eoi (regexp
)
2198 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2200 bin_tree_t
*mbc_remain
;
2201 fetch_token (token
, regexp
, syntax
);
2202 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2203 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2204 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2213 case OP_OPEN_SUBEXP
:
2214 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2215 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2218 case OP_OPEN_BRACKET
:
2219 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2220 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2224 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2229 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2230 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2231 if (BE (tree
== NULL
, 0))
2237 dfa
->has_mb_node
= 1;
2239 case OP_OPEN_DUP_NUM
:
2240 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2246 case OP_DUP_ASTERISK
:
2248 case OP_DUP_QUESTION
:
2249 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2254 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2256 fetch_token (token
, regexp
, syntax
);
2257 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2259 /* else fall through */
2260 case OP_CLOSE_SUBEXP
:
2261 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2262 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2267 /* else fall through */
2268 case OP_CLOSE_DUP_NUM
:
2269 /* We treat it as a normal character. */
2271 /* Then we can these characters as normal characters. */
2272 token
->type
= CHARACTER
;
2273 /* mb_partial and word_char bits should be initialized already
2275 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2276 if (BE (tree
== NULL
, 0))
2283 if ((token
->opr
.ctx_type
2284 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2285 && dfa
->word_ops_used
== 0)
2286 init_word_char (dfa
);
2287 if (token
->opr
.ctx_type
== WORD_DELIM
2288 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2290 bin_tree_t
*tree_first
, *tree_last
;
2291 if (token
->opr
.ctx_type
== WORD_DELIM
)
2293 token
->opr
.ctx_type
= WORD_FIRST
;
2294 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2295 token
->opr
.ctx_type
= WORD_LAST
;
2299 token
->opr
.ctx_type
= INSIDE_WORD
;
2300 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2301 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2303 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2304 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2305 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2313 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2314 if (BE (tree
== NULL
, 0))
2320 /* We must return here, since ANCHORs can't be followed
2321 by repetition operators.
2322 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2324 fetch_token (token
, regexp
, syntax
);
2327 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2328 if (BE (tree
== NULL
, 0))
2333 if (dfa
->mb_cur_max
> 1)
2334 dfa
->has_mb_node
= 1;
2338 tree
= build_charclass_op (dfa
, regexp
->trans
,
2339 (const unsigned char *) "alnum",
2340 (const unsigned char *) "_",
2341 token
->type
== OP_NOTWORD
, err
);
2342 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2347 tree
= build_charclass_op (dfa
, regexp
->trans
,
2348 (const unsigned char *) "space",
2349 (const unsigned char *) "",
2350 token
->type
== OP_NOTSPACE
, err
);
2351 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2361 /* Must not happen? */
2367 fetch_token (token
, regexp
, syntax
);
2369 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2370 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2372 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2373 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2375 /* In BRE consecutive duplications are not allowed. */
2376 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2377 && (token
->type
== OP_DUP_ASTERISK
2378 || token
->type
== OP_OPEN_DUP_NUM
))
2388 /* This function build the following tree, from regular expression
2396 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2397 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2399 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2402 cur_nsub
= preg
->re_nsub
++;
2404 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2406 /* The subexpression may be a null string. */
2407 if (token
->type
== OP_CLOSE_SUBEXP
)
2411 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2412 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2414 if (BE (*err
!= REG_NOERROR
, 0))
2418 if (cur_nsub
<= '9' - '1')
2419 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2421 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2422 if (BE (tree
== NULL
, 0))
2427 tree
->token
.opr
.idx
= cur_nsub
;
2431 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2434 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2435 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2437 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2438 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2439 re_token_t start_token
= *token
;
2441 if (token
->type
== OP_OPEN_DUP_NUM
)
2444 start
= fetch_number (regexp
, token
, syntax
);
2447 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2448 start
= 0; /* We treat "{,m}" as "{0,m}". */
2451 *err
= REG_BADBR
; /* <re>{} is invalid. */
2455 if (BE (start
!= -2, 1))
2457 /* We treat "{n}" as "{n,n}". */
2458 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2459 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2460 ? fetch_number (regexp
, token
, syntax
) : -2));
2462 if (BE (start
== -2 || end
== -2, 0))
2464 /* Invalid sequence. */
2465 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2467 if (token
->type
== END_OF_RE
)
2475 /* If the syntax bit is set, rollback. */
2476 re_string_set_index (regexp
, start_idx
);
2477 *token
= start_token
;
2478 token
->type
= CHARACTER
;
2479 /* mb_partial and word_char bits should be already initialized by
2484 if (BE (end
!= -1 && start
> end
, 0))
2486 /* First number greater than second. */
2493 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2494 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2497 fetch_token (token
, regexp
, syntax
);
2499 if (BE (elem
== NULL
, 0))
2501 if (BE (start
== 0 && end
== 0, 0))
2503 postorder (elem
, free_tree
, NULL
);
2507 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2508 if (BE (start
> 0, 0))
2511 for (i
= 2; i
<= start
; ++i
)
2513 elem
= duplicate_tree (elem
, dfa
);
2514 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2515 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2516 goto parse_dup_op_espace
;
2522 /* Duplicate ELEM before it is marked optional. */
2523 elem
= duplicate_tree (elem
, dfa
);
2529 if (elem
->token
.type
== SUBEXP
)
2530 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2532 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2533 if (BE (tree
== NULL
, 0))
2534 goto parse_dup_op_espace
;
2536 /* This loop is actually executed only when end != -1,
2537 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2538 already created the start+1-th copy. */
2539 for (i
= start
+ 2; i
<= end
; ++i
)
2541 elem
= duplicate_tree (elem
, dfa
);
2542 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2543 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2544 goto parse_dup_op_espace
;
2546 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2547 if (BE (tree
== NULL
, 0))
2548 goto parse_dup_op_espace
;
2552 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2556 parse_dup_op_espace
:
2561 /* Size of the names for collating symbol/equivalence_class/character_class.
2562 I'm not sure, but maybe enough. */
2563 #define BRACKET_NAME_BUF_SIZE 32
2566 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567 Build the range expression which starts from START_ELEM, and ends
2568 at END_ELEM. The result are written to MBCSET and SBCSET.
2569 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570 mbcset->range_ends, is a pointer argument sinse we may
2573 static reg_errcode_t
2575 # ifdef RE_ENABLE_I18N
2576 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2577 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2578 # else /* not RE_ENABLE_I18N */
2579 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2580 bracket_elem_t
*end_elem
)
2581 # endif /* not RE_ENABLE_I18N */
2583 unsigned int start_ch
, end_ch
;
2584 /* Equivalence Classes and Character Classes can't be a range start/end. */
2585 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2586 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2590 /* We can handle no multi character collating elements without libc
2592 if (BE ((start_elem
->type
== COLL_SYM
2593 && strlen ((char *) start_elem
->opr
.name
) > 1)
2594 || (end_elem
->type
== COLL_SYM
2595 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2596 return REG_ECOLLATE
;
2598 # ifdef RE_ENABLE_I18N
2603 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2605 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2606 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2608 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2609 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2611 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2612 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2613 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2614 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2615 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2616 return REG_ECOLLATE
;
2617 cmp_buf
[0] = start_wc
;
2618 cmp_buf
[4] = end_wc
;
2619 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2622 /* Got valid collation sequence values, add them as a new entry.
2623 However, for !_LIBC we have no collation elements: if the
2624 character set is single byte, the single byte character set
2625 that we build below suffices. parse_bracket_exp passes
2626 no MBCSET if dfa->mb_cur_max == 1. */
2629 /* Check the space of the arrays. */
2630 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2632 /* There is not enough space, need realloc. */
2633 wchar_t *new_array_start
, *new_array_end
;
2636 /* +1 in case of mbcset->nranges is 0. */
2637 new_nranges
= 2 * mbcset
->nranges
+ 1;
2638 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639 are NULL if *range_alloc == 0. */
2640 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2642 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2645 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2648 mbcset
->range_starts
= new_array_start
;
2649 mbcset
->range_ends
= new_array_end
;
2650 *range_alloc
= new_nranges
;
2653 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2654 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2657 /* Build the table for single byte characters. */
2658 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2661 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2662 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2663 bitset_set (sbcset
, wc
);
2666 # else /* not RE_ENABLE_I18N */
2669 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2670 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2672 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2673 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2675 if (start_ch
> end_ch
)
2677 /* Build the table for single byte characters. */
2678 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2679 if (start_ch
<= ch
&& ch
<= end_ch
)
2680 bitset_set (sbcset
, ch
);
2682 # endif /* not RE_ENABLE_I18N */
2685 #endif /* not _LIBC */
2688 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689 Build the collating element which is represented by NAME.
2690 The result are written to MBCSET and SBCSET.
2691 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692 pointer argument since we may update it. */
2694 static reg_errcode_t
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2698 int *coll_sym_alloc
, const unsigned char *name
)
2699 # else /* not RE_ENABLE_I18N */
2700 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2701 # endif /* not RE_ENABLE_I18N */
2703 size_t name_len
= strlen ((const char *) name
);
2704 if (BE (name_len
!= 1, 0))
2705 return REG_ECOLLATE
;
2708 bitset_set (sbcset
, name
[0]);
2712 #endif /* not _LIBC */
2714 /* This function parse bracket expression like "[abc]", "[a-c]",
2718 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2719 reg_syntax_t syntax
, reg_errcode_t
*err
)
2722 const unsigned char *collseqmb
;
2723 const char *collseqwc
;
2726 const int32_t *symb_table
;
2727 const unsigned char *extra
;
2729 /* Local function for parse_bracket_exp used in _LIBC environement.
2730 Seek the collating symbol entry correspondings to NAME.
2731 Return the index of the symbol in the SYMB_TABLE. */
2734 __attribute ((always_inline
))
2735 seek_collating_symbol_entry (name
, name_len
)
2736 const unsigned char *name
;
2739 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2740 int32_t elem
= hash
% table_size
;
2741 if (symb_table
[2 * elem
] != 0)
2743 int32_t second
= hash
% (table_size
- 2) + 1;
2747 /* First compare the hashing value. */
2748 if (symb_table
[2 * elem
] == hash
2749 /* Compare the length of the name. */
2750 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2751 /* Compare the name. */
2752 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2755 /* Yep, this is the entry. */
2762 while (symb_table
[2 * elem
] != 0);
2767 /* Local function for parse_bracket_exp used in _LIBC environment.
2768 Look up the collation sequence value of BR_ELEM.
2769 Return the value if succeeded, UINT_MAX otherwise. */
2771 auto inline unsigned int
2772 __attribute ((always_inline
))
2773 lookup_collation_sequence_value (br_elem
)
2774 bracket_elem_t
*br_elem
;
2776 if (br_elem
->type
== SB_CHAR
)
2779 if (MB_CUR_MAX == 1)
2782 return collseqmb
[br_elem
->opr
.ch
];
2785 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2786 return __collseq_table_lookup (collseqwc
, wc
);
2789 else if (br_elem
->type
== MB_CHAR
)
2792 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2794 else if (br_elem
->type
== COLL_SYM
)
2796 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2800 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2802 if (symb_table
[2 * elem
] != 0)
2804 /* We found the entry. */
2805 idx
= symb_table
[2 * elem
+ 1];
2806 /* Skip the name of collating element name. */
2807 idx
+= 1 + extra
[idx
];
2808 /* Skip the byte sequence of the collating element. */
2809 idx
+= 1 + extra
[idx
];
2810 /* Adjust for the alignment. */
2811 idx
= (idx
+ 3) & ~3;
2812 /* Skip the multibyte collation sequence value. */
2813 idx
+= sizeof (unsigned int);
2814 /* Skip the wide char sequence of the collating element. */
2815 idx
+= sizeof (unsigned int) *
2816 (1 + *(unsigned int *) (extra
+ idx
));
2817 /* Return the collation sequence value. */
2818 return *(unsigned int *) (extra
+ idx
);
2820 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2822 /* No valid character. Match it as a single byte
2824 return collseqmb
[br_elem
->opr
.name
[0]];
2827 else if (sym_name_len
== 1)
2828 return collseqmb
[br_elem
->opr
.name
[0]];
2833 /* Local function for parse_bracket_exp used in _LIBC environement.
2834 Build the range expression which starts from START_ELEM, and ends
2835 at END_ELEM. The result are written to MBCSET and SBCSET.
2836 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837 mbcset->range_ends, is a pointer argument sinse we may
2840 auto inline reg_errcode_t
2841 __attribute ((always_inline
))
2842 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2843 re_charset_t
*mbcset
;
2846 bracket_elem_t
*start_elem
, *end_elem
;
2849 uint32_t start_collseq
;
2850 uint32_t end_collseq
;
2852 /* Equivalence Classes and Character Classes can't be a range
2854 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2855 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2859 start_collseq
= lookup_collation_sequence_value (start_elem
);
2860 end_collseq
= lookup_collation_sequence_value (end_elem
);
2861 /* Check start/end collation sequence values. */
2862 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2863 return REG_ECOLLATE
;
2864 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2867 /* Got valid collation sequence values, add them as a new entry.
2868 However, if we have no collation elements, and the character set
2869 is single byte, the single byte character set that we
2870 build below suffices. */
2871 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2873 /* Check the space of the arrays. */
2874 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2876 /* There is not enough space, need realloc. */
2877 uint32_t *new_array_start
;
2878 uint32_t *new_array_end
;
2881 /* +1 in case of mbcset->nranges is 0. */
2882 new_nranges
= 2 * mbcset
->nranges
+ 1;
2883 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2885 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2888 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2891 mbcset
->range_starts
= new_array_start
;
2892 mbcset
->range_ends
= new_array_end
;
2893 *range_alloc
= new_nranges
;
2896 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2897 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2900 /* Build the table for single byte characters. */
2901 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2903 uint32_t ch_collseq
;
2905 if (MB_CUR_MAX == 1)
2908 ch_collseq
= collseqmb
[ch
];
2910 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2911 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2912 bitset_set (sbcset
, ch
);
2917 /* Local function for parse_bracket_exp used in _LIBC environement.
2918 Build the collating element which is represented by NAME.
2919 The result are written to MBCSET and SBCSET.
2920 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921 pointer argument sinse we may update it. */
2923 auto inline reg_errcode_t
2924 __attribute ((always_inline
))
2925 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2926 re_charset_t
*mbcset
;
2927 int *coll_sym_alloc
;
2929 const unsigned char *name
;
2932 size_t name_len
= strlen ((const char *) name
);
2935 elem
= seek_collating_symbol_entry (name
, name_len
);
2936 if (symb_table
[2 * elem
] != 0)
2938 /* We found the entry. */
2939 idx
= symb_table
[2 * elem
+ 1];
2940 /* Skip the name of collating element name. */
2941 idx
+= 1 + extra
[idx
];
2943 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2945 /* No valid character, treat it as a normal
2947 bitset_set (sbcset
, name
[0]);
2951 return REG_ECOLLATE
;
2953 /* Got valid collation sequence, add it as a new entry. */
2954 /* Check the space of the arrays. */
2955 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2957 /* Not enough, realloc it. */
2958 /* +1 in case of mbcset->ncoll_syms is 0. */
2959 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2960 /* Use realloc since mbcset->coll_syms is NULL
2962 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2963 new_coll_sym_alloc
);
2964 if (BE (new_coll_syms
== NULL
, 0))
2966 mbcset
->coll_syms
= new_coll_syms
;
2967 *coll_sym_alloc
= new_coll_sym_alloc
;
2969 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2974 if (BE (name_len
!= 1, 0))
2975 return REG_ECOLLATE
;
2978 bitset_set (sbcset
, name
[0]);
2985 re_token_t br_token
;
2986 re_bitset_ptr_t sbcset
;
2987 #ifdef RE_ENABLE_I18N
2988 re_charset_t
*mbcset
;
2989 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2990 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2991 #endif /* not RE_ENABLE_I18N */
2993 bin_tree_t
*work_tree
;
2995 int first_round
= 1;
2997 collseqmb
= (const unsigned char *)
2998 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2999 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3005 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3006 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3007 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3008 _NL_COLLATE_SYMB_TABLEMB
);
3009 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3010 _NL_COLLATE_SYMB_EXTRAMB
);
3013 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3014 #ifdef RE_ENABLE_I18N
3015 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3016 #endif /* RE_ENABLE_I18N */
3017 #ifdef RE_ENABLE_I18N
3018 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3020 if (BE (sbcset
== NULL
, 0))
3021 #endif /* RE_ENABLE_I18N */
3027 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3028 if (BE (token
->type
== END_OF_RE
, 0))
3031 goto parse_bracket_exp_free_return
;
3033 if (token
->type
== OP_NON_MATCH_LIST
)
3035 #ifdef RE_ENABLE_I18N
3036 mbcset
->non_match
= 1;
3037 #endif /* not RE_ENABLE_I18N */
3039 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3040 bitset_set (sbcset
, '\n');
3041 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3042 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3043 if (BE (token
->type
== END_OF_RE
, 0))
3046 goto parse_bracket_exp_free_return
;
3050 /* We treat the first ']' as a normal character. */
3051 if (token
->type
== OP_CLOSE_BRACKET
)
3052 token
->type
= CHARACTER
;
3056 bracket_elem_t start_elem
, end_elem
;
3057 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3058 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3060 int token_len2
= 0, is_range_exp
= 0;
3063 start_elem
.opr
.name
= start_name_buf
;
3064 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3065 syntax
, first_round
);
3066 if (BE (ret
!= REG_NOERROR
, 0))
3069 goto parse_bracket_exp_free_return
;
3073 /* Get information about the next token. We need it in any case. */
3074 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3076 /* Do not check for ranges if we know they are not allowed. */
3077 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3079 if (BE (token
->type
== END_OF_RE
, 0))
3082 goto parse_bracket_exp_free_return
;
3084 if (token
->type
== OP_CHARSET_RANGE
)
3086 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3087 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3088 if (BE (token2
.type
== END_OF_RE
, 0))
3091 goto parse_bracket_exp_free_return
;
3093 if (token2
.type
== OP_CLOSE_BRACKET
)
3095 /* We treat the last '-' as a normal character. */
3096 re_string_skip_bytes (regexp
, -token_len
);
3097 token
->type
= CHARACTER
;
3104 if (is_range_exp
== 1)
3106 end_elem
.opr
.name
= end_name_buf
;
3107 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3109 if (BE (ret
!= REG_NOERROR
, 0))
3112 goto parse_bracket_exp_free_return
;
3115 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3118 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3119 &start_elem
, &end_elem
);
3121 # ifdef RE_ENABLE_I18N
3122 *err
= build_range_exp (sbcset
,
3123 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3124 &range_alloc
, &start_elem
, &end_elem
);
3126 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3128 #endif /* RE_ENABLE_I18N */
3129 if (BE (*err
!= REG_NOERROR
, 0))
3130 goto parse_bracket_exp_free_return
;
3134 switch (start_elem
.type
)
3137 bitset_set (sbcset
, start_elem
.opr
.ch
);
3139 #ifdef RE_ENABLE_I18N
3141 /* Check whether the array has enough space. */
3142 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3144 wchar_t *new_mbchars
;
3145 /* Not enough, realloc it. */
3146 /* +1 in case of mbcset->nmbchars is 0. */
3147 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3148 /* Use realloc since array is NULL if *alloc == 0. */
3149 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3151 if (BE (new_mbchars
== NULL
, 0))
3152 goto parse_bracket_exp_espace
;
3153 mbcset
->mbchars
= new_mbchars
;
3155 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3157 #endif /* RE_ENABLE_I18N */
3159 *err
= build_equiv_class (sbcset
,
3160 #ifdef RE_ENABLE_I18N
3161 mbcset
, &equiv_class_alloc
,
3162 #endif /* RE_ENABLE_I18N */
3163 start_elem
.opr
.name
);
3164 if (BE (*err
!= REG_NOERROR
, 0))
3165 goto parse_bracket_exp_free_return
;
3168 *err
= build_collating_symbol (sbcset
,
3169 #ifdef RE_ENABLE_I18N
3170 mbcset
, &coll_sym_alloc
,
3171 #endif /* RE_ENABLE_I18N */
3172 start_elem
.opr
.name
);
3173 if (BE (*err
!= REG_NOERROR
, 0))
3174 goto parse_bracket_exp_free_return
;
3177 *err
= build_charclass (regexp
->trans
, sbcset
,
3178 #ifdef RE_ENABLE_I18N
3179 mbcset
, &char_class_alloc
,
3180 #endif /* RE_ENABLE_I18N */
3181 start_elem
.opr
.name
, syntax
);
3182 if (BE (*err
!= REG_NOERROR
, 0))
3183 goto parse_bracket_exp_free_return
;
3190 if (BE (token
->type
== END_OF_RE
, 0))
3193 goto parse_bracket_exp_free_return
;
3195 if (token
->type
== OP_CLOSE_BRACKET
)
3199 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3201 /* If it is non-matching list. */
3203 bitset_not (sbcset
);
3205 #ifdef RE_ENABLE_I18N
3206 /* Ensure only single byte characters are set. */
3207 if (dfa
->mb_cur_max
> 1)
3208 bitset_mask (sbcset
, dfa
->sb_char
);
3210 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3211 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3212 || mbcset
->non_match
)))
3214 bin_tree_t
*mbc_tree
;
3216 /* Build a tree for complex bracket. */
3217 dfa
->has_mb_node
= 1;
3218 br_token
.type
= COMPLEX_BRACKET
;
3219 br_token
.opr
.mbcset
= mbcset
;
3220 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3221 if (BE (mbc_tree
== NULL
, 0))
3222 goto parse_bracket_exp_espace
;
3223 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3224 if (sbcset
[sbc_idx
])
3226 /* If there are no bits set in sbcset, there is no point
3227 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3228 if (sbc_idx
< BITSET_WORDS
)
3230 /* Build a tree for simple bracket. */
3231 br_token
.type
= SIMPLE_BRACKET
;
3232 br_token
.opr
.sbcset
= sbcset
;
3233 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3234 if (BE (work_tree
== NULL
, 0))
3235 goto parse_bracket_exp_espace
;
3237 /* Then join them by ALT node. */
3238 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3239 if (BE (work_tree
== NULL
, 0))
3240 goto parse_bracket_exp_espace
;
3245 work_tree
= mbc_tree
;
3249 #endif /* not RE_ENABLE_I18N */
3251 #ifdef RE_ENABLE_I18N
3252 free_charset (mbcset
);
3254 /* Build a tree for simple bracket. */
3255 br_token
.type
= SIMPLE_BRACKET
;
3256 br_token
.opr
.sbcset
= sbcset
;
3257 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3258 if (BE (work_tree
== NULL
, 0))
3259 goto parse_bracket_exp_espace
;
3263 parse_bracket_exp_espace
:
3265 parse_bracket_exp_free_return
:
3267 #ifdef RE_ENABLE_I18N
3268 free_charset (mbcset
);
3269 #endif /* RE_ENABLE_I18N */
3273 /* Parse an element in the bracket expression. */
3275 static reg_errcode_t
3276 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3277 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3278 reg_syntax_t syntax
, int accept_hyphen
)
3280 #ifdef RE_ENABLE_I18N
3282 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3283 if (cur_char_size
> 1)
3285 elem
->type
= MB_CHAR
;
3286 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3287 re_string_skip_bytes (regexp
, cur_char_size
);
3290 #endif /* RE_ENABLE_I18N */
3291 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3292 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3293 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3294 return parse_bracket_symbol (elem
, regexp
, token
);
3295 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3297 /* A '-' must only appear as anything but a range indicator before
3298 the closing bracket. Everything else is an error. */
3300 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3301 if (token2
.type
!= OP_CLOSE_BRACKET
)
3302 /* The actual error value is not standardized since this whole
3303 case is undefined. But ERANGE makes good sense. */
3306 elem
->type
= SB_CHAR
;
3307 elem
->opr
.ch
= token
->opr
.c
;
3311 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3312 such as [:<character_class>:], [.<collating_element>.], and
3313 [=<equivalent_class>=]. */
3315 static reg_errcode_t
3316 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3319 unsigned char ch
, delim
= token
->opr
.c
;
3321 if (re_string_eoi(regexp
))
3325 if (i
>= BRACKET_NAME_BUF_SIZE
)
3327 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3328 ch
= re_string_fetch_byte_case (regexp
);
3330 ch
= re_string_fetch_byte (regexp
);
3331 if (re_string_eoi(regexp
))
3333 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3335 elem
->opr
.name
[i
] = ch
;
3337 re_string_skip_bytes (regexp
, 1);
3338 elem
->opr
.name
[i
] = '\0';
3339 switch (token
->type
)
3341 case OP_OPEN_COLL_ELEM
:
3342 elem
->type
= COLL_SYM
;
3344 case OP_OPEN_EQUIV_CLASS
:
3345 elem
->type
= EQUIV_CLASS
;
3347 case OP_OPEN_CHAR_CLASS
:
3348 elem
->type
= CHAR_CLASS
;
3356 /* Helper function for parse_bracket_exp.
3357 Build the equivalence class which is represented by NAME.
3358 The result are written to MBCSET and SBCSET.
3359 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360 is a pointer argument sinse we may update it. */
3362 static reg_errcode_t
3363 #ifdef RE_ENABLE_I18N
3364 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3365 int *equiv_class_alloc
, const unsigned char *name
)
3366 #else /* not RE_ENABLE_I18N */
3367 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3368 #endif /* not RE_ENABLE_I18N */
3371 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3374 const int32_t *table
, *indirect
;
3375 const unsigned char *weights
, *extra
, *cp
;
3376 unsigned char char_buf
[2];
3380 /* This #include defines a local function! */
3381 # include <locale/weight.h>
3382 /* Calculate the index for equivalence class. */
3384 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3385 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3386 _NL_COLLATE_WEIGHTMB
);
3387 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3388 _NL_COLLATE_EXTRAMB
);
3389 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3390 _NL_COLLATE_INDIRECTMB
);
3391 idx1
= findidx (&cp
);
3392 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3393 /* This isn't a valid character. */
3394 return REG_ECOLLATE
;
3396 /* Build single byte matcing table for this equivalence class. */
3397 char_buf
[1] = (unsigned char) '\0';
3398 len
= weights
[idx1
& 0xffffff];
3399 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3403 idx2
= findidx (&cp
);
3408 /* This isn't a valid character. */
3410 /* Compare only if the length matches and the collation rule
3411 index is the same. */
3412 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3416 while (cnt
<= len
&&
3417 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3418 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3422 bitset_set (sbcset
, ch
);
3425 /* Check whether the array has enough space. */
3426 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3428 /* Not enough, realloc it. */
3429 /* +1 in case of mbcset->nequiv_classes is 0. */
3430 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3431 /* Use realloc since the array is NULL if *alloc == 0. */
3432 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3434 new_equiv_class_alloc
);
3435 if (BE (new_equiv_classes
== NULL
, 0))
3437 mbcset
->equiv_classes
= new_equiv_classes
;
3438 *equiv_class_alloc
= new_equiv_class_alloc
;
3440 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3445 if (BE (strlen ((const char *) name
) != 1, 0))
3446 return REG_ECOLLATE
;
3447 bitset_set (sbcset
, *name
);
3452 /* Helper function for parse_bracket_exp.
3453 Build the character class which is represented by NAME.
3454 The result are written to MBCSET and SBCSET.
3455 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456 is a pointer argument sinse we may update it. */
3458 static reg_errcode_t
3459 #ifdef RE_ENABLE_I18N
3460 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3461 re_charset_t
*mbcset
, int *char_class_alloc
,
3462 const unsigned char *class_name
, reg_syntax_t syntax
)
3463 #else /* not RE_ENABLE_I18N */
3464 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3465 const unsigned char *class_name
, reg_syntax_t syntax
)
3466 #endif /* not RE_ENABLE_I18N */
3469 const char *name
= (const char *) class_name
;
3471 /* In case of REG_ICASE "upper" and "lower" match the both of
3472 upper and lower cases. */
3473 if ((syntax
& RE_ICASE
)
3474 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3477 #ifdef RE_ENABLE_I18N
3478 /* Check the space of the arrays. */
3479 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3481 /* Not enough, realloc it. */
3482 /* +1 in case of mbcset->nchar_classes is 0. */
3483 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3484 /* Use realloc since array is NULL if *alloc == 0. */
3485 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3486 new_char_class_alloc
);
3487 if (BE (new_char_classes
== NULL
, 0))
3489 mbcset
->char_classes
= new_char_classes
;
3490 *char_class_alloc
= new_char_class_alloc
;
3492 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3493 #endif /* RE_ENABLE_I18N */
3495 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3497 if (BE (trans != NULL, 0)) \
3499 for (i = 0; i < SBC_MAX; ++i) \
3500 if (ctype_func (i)) \
3501 bitset_set (sbcset, trans[i]); \
3505 for (i = 0; i < SBC_MAX; ++i) \
3506 if (ctype_func (i)) \
3507 bitset_set (sbcset, i); \
3511 if (strcmp (name
, "alnum") == 0)
3512 BUILD_CHARCLASS_LOOP (isalnum
);
3513 else if (strcmp (name
, "cntrl") == 0)
3514 BUILD_CHARCLASS_LOOP (iscntrl
);
3515 else if (strcmp (name
, "lower") == 0)
3516 BUILD_CHARCLASS_LOOP (islower
);
3517 else if (strcmp (name
, "space") == 0)
3518 BUILD_CHARCLASS_LOOP (isspace
);
3519 else if (strcmp (name
, "alpha") == 0)
3520 BUILD_CHARCLASS_LOOP (isalpha
);
3521 else if (strcmp (name
, "digit") == 0)
3522 BUILD_CHARCLASS_LOOP (isdigit
);
3523 else if (strcmp (name
, "print") == 0)
3524 BUILD_CHARCLASS_LOOP (isprint
);
3525 else if (strcmp (name
, "upper") == 0)
3526 BUILD_CHARCLASS_LOOP (isupper
);
3527 else if (strcmp (name
, "blank") == 0)
3528 BUILD_CHARCLASS_LOOP (isblank
);
3529 else if (strcmp (name
, "graph") == 0)
3530 BUILD_CHARCLASS_LOOP (isgraph
);
3531 else if (strcmp (name
, "punct") == 0)
3532 BUILD_CHARCLASS_LOOP (ispunct
);
3533 else if (strcmp (name
, "xdigit") == 0)
3534 BUILD_CHARCLASS_LOOP (isxdigit
);
3542 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3543 const unsigned char *class_name
,
3544 const unsigned char *extra
, int non_match
,
3547 re_bitset_ptr_t sbcset
;
3548 #ifdef RE_ENABLE_I18N
3549 re_charset_t
*mbcset
;
3551 #endif /* not RE_ENABLE_I18N */
3553 re_token_t br_token
;
3556 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3557 #ifdef RE_ENABLE_I18N
3558 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3559 #endif /* RE_ENABLE_I18N */
3561 #ifdef RE_ENABLE_I18N
3562 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3563 #else /* not RE_ENABLE_I18N */
3564 if (BE (sbcset
== NULL
, 0))
3565 #endif /* not RE_ENABLE_I18N */
3573 #ifdef RE_ENABLE_I18N
3574 mbcset
->non_match
= 1;
3575 #endif /* not RE_ENABLE_I18N */
3578 /* We don't care the syntax in this case. */
3579 ret
= build_charclass (trans
, sbcset
,
3580 #ifdef RE_ENABLE_I18N
3582 #endif /* RE_ENABLE_I18N */
3585 if (BE (ret
!= REG_NOERROR
, 0))
3588 #ifdef RE_ENABLE_I18N
3589 free_charset (mbcset
);
3590 #endif /* RE_ENABLE_I18N */
3594 /* \w match '_' also. */
3595 for (; *extra
; extra
++)
3596 bitset_set (sbcset
, *extra
);
3598 /* If it is non-matching list. */
3600 bitset_not (sbcset
);
3602 #ifdef RE_ENABLE_I18N
3603 /* Ensure only single byte characters are set. */
3604 if (dfa
->mb_cur_max
> 1)
3605 bitset_mask (sbcset
, dfa
->sb_char
);
3608 /* Build a tree for simple bracket. */
3609 br_token
.type
= SIMPLE_BRACKET
;
3610 br_token
.opr
.sbcset
= sbcset
;
3611 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3612 if (BE (tree
== NULL
, 0))
3613 goto build_word_op_espace
;
3615 #ifdef RE_ENABLE_I18N
3616 if (dfa
->mb_cur_max
> 1)
3618 bin_tree_t
*mbc_tree
;
3619 /* Build a tree for complex bracket. */
3620 br_token
.type
= COMPLEX_BRACKET
;
3621 br_token
.opr
.mbcset
= mbcset
;
3622 dfa
->has_mb_node
= 1;
3623 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3624 if (BE (mbc_tree
== NULL
, 0))
3625 goto build_word_op_espace
;
3626 /* Then join them by ALT node. */
3627 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3628 if (BE (mbc_tree
!= NULL
, 1))
3633 free_charset (mbcset
);
3636 #else /* not RE_ENABLE_I18N */
3638 #endif /* not RE_ENABLE_I18N */
3640 build_word_op_espace
:
3642 #ifdef RE_ENABLE_I18N
3643 free_charset (mbcset
);
3644 #endif /* RE_ENABLE_I18N */
3649 /* This is intended for the expressions like "a{1,3}".
3650 Fetch a number from `input', and return the number.
3651 Return -1, if the number field is empty like "{,1}".
3652 Return -2, If an error is occured. */
3655 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3661 fetch_token (token
, input
, syntax
);
3663 if (BE (token
->type
== END_OF_RE
, 0))
3665 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3667 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3668 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3669 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3674 #ifdef RE_ENABLE_I18N
3676 free_charset (re_charset_t
*cset
)
3678 re_free (cset
->mbchars
);
3680 re_free (cset
->coll_syms
);
3681 re_free (cset
->equiv_classes
);
3682 re_free (cset
->range_starts
);
3683 re_free (cset
->range_ends
);
3685 re_free (cset
->char_classes
);
3688 #endif /* RE_ENABLE_I18N */
3690 /* Functions for binary tree operation. */
3692 /* Create a tree node. */
3695 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3696 re_token_type_t type
)
3700 return create_token_tree (dfa
, left
, right
, &t
);
3704 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3705 const re_token_t
*token
)
3708 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3710 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3712 if (storage
== NULL
)
3714 storage
->next
= dfa
->str_tree_storage
;
3715 dfa
->str_tree_storage
= storage
;
3716 dfa
->str_tree_storage_idx
= 0;
3718 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3720 tree
->parent
= NULL
;
3722 tree
->right
= right
;
3723 tree
->token
= *token
;
3724 tree
->token
.duplicated
= 0;
3725 tree
->token
.opt_subexp
= 0;
3728 tree
->node_idx
= -1;
3731 left
->parent
= tree
;
3733 right
->parent
= tree
;
3737 /* Mark the tree SRC as an optional subexpression.
3738 To be called from preorder or postorder. */
3740 static reg_errcode_t
3741 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3743 int idx
= (int) (long) extra
;
3744 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3745 node
->token
.opt_subexp
= 1;
3750 /* Free the allocated memory inside NODE. */
3753 free_token (re_token_t
*node
)
3755 #ifdef RE_ENABLE_I18N
3756 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3757 free_charset (node
->opr
.mbcset
);
3759 #endif /* RE_ENABLE_I18N */
3760 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3761 re_free (node
->opr
.sbcset
);
3764 /* Worker function for tree walking. Free the allocated memory inside NODE
3765 and its children. */
3767 static reg_errcode_t
3768 free_tree (void *extra
, bin_tree_t
*node
)
3770 free_token (&node
->token
);
3775 /* Duplicate the node SRC, and return new node. This is a preorder
3776 visit similar to the one implemented by the generic visitor, but
3777 we need more infrastructure to maintain two parallel trees --- so,
3778 it's easier to duplicate. */
3781 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3783 const bin_tree_t
*node
;
3784 bin_tree_t
*dup_root
;
3785 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3787 for (node
= root
; ; )
3789 /* Create a new tree and link it back to the current parent. */
3790 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3793 (*p_new
)->parent
= dup_node
;
3794 (*p_new
)->token
.duplicated
= 1;
3797 /* Go to the left node, or up and to the right. */
3801 p_new
= &dup_node
->left
;
3805 const bin_tree_t
*prev
= NULL
;
3806 while (node
->right
== prev
|| node
->right
== NULL
)
3809 node
= node
->parent
;
3810 dup_node
= dup_node
->parent
;
3815 p_new
= &dup_node
->right
;