1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static reg_errcode_t
preorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
postorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
45 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
46 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
48 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
52 int top_clone_node
, int root_node
,
53 unsigned int constraint
);
54 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
55 unsigned int constraint
);
56 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
57 unsigned int constraint
);
58 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
59 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
61 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
62 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
64 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
66 static int peek_token (re_token_t
*token
, re_string_t
*input
,
68 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
70 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
71 reg_syntax_t syntax
, reg_errcode_t
*err
);
72 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
73 re_token_t
*token
, reg_syntax_t syntax
,
74 int nest
, reg_errcode_t
*err
);
75 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
76 re_token_t
*token
, reg_syntax_t syntax
,
77 int nest
, reg_errcode_t
*err
);
78 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
79 re_token_t
*token
, reg_syntax_t syntax
,
80 int nest
, reg_errcode_t
*err
);
81 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
82 re_token_t
*token
, reg_syntax_t syntax
,
83 int nest
, reg_errcode_t
*err
);
84 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
85 re_dfa_t
*dfa
, re_token_t
*token
,
86 reg_syntax_t syntax
, reg_errcode_t
*err
);
87 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
88 re_token_t
*token
, reg_syntax_t syntax
,
90 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
92 re_token_t
*token
, int token_len
,
96 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
102 re_charset_t
*mbcset
, int *range_alloc
,
103 bracket_elem_t
*start_elem
,
104 bracket_elem_t
*end_elem
);
105 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
106 re_charset_t
*mbcset
,
108 const unsigned char *name
);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
111 bracket_elem_t
*start_elem
,
112 bracket_elem_t
*end_elem
);
113 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
114 const unsigned char *name
);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
119 re_charset_t
*mbcset
,
120 int *equiv_class_alloc
,
121 const unsigned char *name
);
122 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
123 re_bitset_ptr_t sbcset
,
124 re_charset_t
*mbcset
,
125 int *char_class_alloc
,
126 const unsigned char *class_name
,
127 reg_syntax_t syntax
);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
130 const unsigned char *name
);
131 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
132 re_bitset_ptr_t sbcset
,
133 const unsigned char *class_name
,
134 reg_syntax_t syntax
);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
137 unsigned RE_TRANSLATE_TYPE trans
,
138 const unsigned char *class_name
,
139 const unsigned char *extra
,
140 int non_match
, reg_errcode_t
*err
);
141 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
142 bin_tree_t
*left
, bin_tree_t
*right
,
143 re_token_type_t type
);
144 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
145 bin_tree_t
*left
, bin_tree_t
*right
,
146 const re_token_t
*token
);
147 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
148 static void free_token (re_token_t
*node
);
149 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
150 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid
[] attribute_hidden
=
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx
[] attribute_hidden
=
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
242 re_compile_pattern (pattern
, length
, bufp
)
245 struct re_pattern_buffer
*bufp
;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
254 /* Match anchors at newline. */
255 bufp
->newline_anchor
= 1;
257 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
261 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
264 weak_alias (__re_compile_pattern
, re_compile_pattern
)
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options
;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
283 re_set_syntax (syntax
)
286 reg_syntax_t ret
= re_syntax_options
;
288 re_syntax_options
= syntax
;
292 weak_alias (__re_set_syntax
, re_set_syntax
)
296 re_compile_fastmap (bufp
)
297 struct re_pattern_buffer
*bufp
;
299 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
300 char *fastmap
= bufp
->fastmap
;
302 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
303 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_word
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_nl
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
308 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
309 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
310 bufp
->fastmap_accurate
= 1;
314 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
318 __attribute ((always_inline
))
319 re_set_fastmap (char *fastmap
, int icase
, int ch
)
323 fastmap
[tolower (ch
)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
330 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
332 const re_dfastate_t
*init_state
;
335 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
337 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
338 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
340 int node
= init_state
->nodes
.elems
[node_cnt
];
341 re_token_type_t type
= dfa
->nodes
[node
].type
;
343 if (type
== CHARACTER
)
345 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
349 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
354 *p
++ = dfa
->nodes
[node
].opr
.c
;
355 while (++node
< dfa
->nodes_len
356 && dfa
->nodes
[node
].type
== CHARACTER
357 && dfa
->nodes
[node
].mb_partial
)
358 *p
++ = dfa
->nodes
[node
].opr
.c
;
359 memset (&state
, 0, sizeof (state
));
360 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
362 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
363 re_set_fastmap (fastmap
, 0, buf
[0]);
367 else if (type
== SIMPLE_BRACKET
)
370 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
371 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
372 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
373 re_set_fastmap (fastmap
, icase
, ch
);
375 #ifdef RE_ENABLE_I18N
376 else if (type
== COMPLEX_BRACKET
)
379 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
380 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
381 || cset
->nranges
|| cset
->nchar_classes
)
384 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
393 const int32_t *table
= (const int32_t *)
394 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
395 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
396 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
398 re_set_fastmap (fastmap
, icase
, ch
);
401 if (dfa
->mb_cur_max
> 1)
402 for (i
= 0; i
< SBC_MAX
; ++i
)
403 if (__btowc (i
) == WEOF
)
404 re_set_fastmap (fastmap
, icase
, i
);
405 # endif /* not _LIBC */
407 for (i
= 0; i
< cset
->nmbchars
; ++i
)
411 memset (&state
, '\0', sizeof (state
));
412 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
413 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
414 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
416 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
417 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
421 #endif /* RE_ENABLE_I18N */
422 else if (type
== OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type
== OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type
== END_OF_RE
)
428 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
429 if (type
== END_OF_RE
)
430 bufp
->can_be_null
= 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg
, pattern
, cflags
)
474 regex_t
*__restrict preg
;
475 const char *__restrict pattern
;
479 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC
);
486 /* Try to allocate space for the fastmap. */
487 preg
->fastmap
= re_malloc (char, SBC_MAX
);
488 if (BE (preg
->fastmap
== NULL
, 0))
491 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags
& REG_NEWLINE
)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax
&= ~RE_DOT_NEWLINE
;
497 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
498 /* It also changes the matching behavior. */
499 preg
->newline_anchor
= 1;
502 preg
->newline_anchor
= 0;
503 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
504 preg
->translate
= NULL
;
506 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret
== REG_ERPAREN
)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret
== REG_NOERROR
, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg
);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg
->fastmap
);
522 preg
->fastmap
= NULL
;
528 weak_alias (__regcomp
, regcomp
)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode
, preg
, errbuf
, errbuf_size
)
545 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
546 / sizeof (__re_error_msgid_idx
[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
553 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
555 msg_size
= strlen (msg
) + 1; /* Includes the null. */
557 if (BE (errbuf_size
!= 0, 1))
559 if (BE (msg_size
> errbuf_size
, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
564 memcpy (errbuf
, msg
, errbuf_size
- 1);
565 errbuf
[errbuf_size
- 1] = 0;
569 memcpy (errbuf
, msg
, msg_size
);
575 weak_alias (__regerror
, regerror
)
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map
=
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590 # error "Add case for new unsigned int size"
597 free_dfa_content (re_dfa_t
*dfa
)
602 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 free_token (dfa
->nodes
+ i
);
604 re_free (dfa
->nexts
);
605 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
607 if (dfa
->eclosures
!= NULL
)
608 re_node_set_free (dfa
->eclosures
+ i
);
609 if (dfa
->inveclosures
!= NULL
)
610 re_node_set_free (dfa
->inveclosures
+ i
);
611 if (dfa
->edests
!= NULL
)
612 re_node_set_free (dfa
->edests
+ i
);
614 re_free (dfa
->edests
);
615 re_free (dfa
->eclosures
);
616 re_free (dfa
->inveclosures
);
617 re_free (dfa
->nodes
);
619 if (dfa
->state_table
)
620 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
622 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
623 for (j
= 0; j
< entry
->num
; ++j
)
625 re_dfastate_t
*state
= entry
->array
[j
];
628 re_free (entry
->array
);
630 re_free (dfa
->state_table
);
631 #ifdef RE_ENABLE_I18N
632 if (dfa
->sb_char
!= utf8_sb_map
)
633 re_free (dfa
->sb_char
);
635 re_free (dfa
->subexp_map
);
637 re_free (dfa
->re_str
);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
651 if (BE (dfa
!= NULL
, 1))
652 free_dfa_content (dfa
);
656 re_free (preg
->fastmap
);
657 preg
->fastmap
= NULL
;
659 re_free (preg
->translate
);
660 preg
->translate
= NULL
;
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf
.buffer
)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf
.buffer
)
696 fastmap
= re_comp_buf
.fastmap
;
697 re_comp_buf
.fastmap
= NULL
;
698 __regfree (&re_comp_buf
);
699 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
700 re_comp_buf
.fastmap
= fastmap
;
703 if (re_comp_buf
.fastmap
== NULL
)
705 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
706 if (re_comp_buf
.fastmap
== NULL
)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf
.newline_anchor
= 1;
717 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
727 libc_freeres_fn (free_mem
)
729 __regfree (&re_comp_buf
);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg
, pattern
, length
, syntax
)
742 const char * pattern
;
746 reg_errcode_t err
= REG_NOERROR
;
750 /* Initialize the pattern buffer. */
751 preg
->fastmap_accurate
= 0;
752 preg
->syntax
= syntax
;
753 preg
->not_bol
= preg
->not_eol
= 0;
756 preg
->can_be_null
= 0;
757 preg
->regs_allocated
= REGS_UNALLOCATED
;
759 /* Initialize the dfa. */
760 dfa
= (re_dfa_t
*) preg
->buffer
;
761 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
770 preg
->allocated
= sizeof (re_dfa_t
);
771 preg
->buffer
= (unsigned char *) dfa
;
773 preg
->used
= sizeof (re_dfa_t
);
775 err
= init_dfa (dfa
, length
);
776 if (BE (err
!= REG_NOERROR
, 0))
778 free_dfa_content (dfa
);
784 dfa
->re_str
= re_malloc (char, length
+ 1);
785 strncpy (dfa
->re_str
, pattern
, length
+ 1);
788 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
789 syntax
& RE_ICASE
, dfa
);
790 if (BE (err
!= REG_NOERROR
, 0))
792 re_compile_internal_free_return
:
793 free_workarea_compile (preg
);
794 re_string_destruct (®exp
);
795 free_dfa_content (dfa
);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
804 if (BE (dfa
->str_tree
== NULL
, 0))
805 goto re_compile_internal_free_return
;
807 /* Analyze the tree and create the nfa. */
808 err
= analyze (preg
);
809 if (BE (err
!= REG_NOERROR
, 0))
810 goto re_compile_internal_free_return
;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
818 /* Then create the initial state of the dfa. */
819 err
= create_initial_state (dfa
);
821 /* Release work areas. */
822 free_workarea_compile (preg
);
823 re_string_destruct (®exp
);
825 if (BE (err
!= REG_NOERROR
, 0))
827 free_dfa_content (dfa
);
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
839 init_dfa (dfa
, pat_len
)
848 memset (dfa
, '\0', sizeof (re_dfa_t
));
850 /* Force allocation of str_tree_storage the first time. */
851 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
853 dfa
->nodes_alloc
= pat_len
+ 1;
854 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
856 dfa
->states_alloc
= pat_len
+ 1;
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
860 if (table_size
> pat_len
)
863 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
864 dfa
->state_hash_mask
= table_size
- 1;
866 dfa
->mb_cur_max
= MB_CUR_MAX
;
868 if (dfa
->mb_cur_max
== 6
869 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
871 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name
= nl_langinfo (CODESET
);
877 codeset_name
= getenv ("LC_ALL");
878 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
879 codeset_name
= getenv ("LC_CTYPE");
880 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
881 codeset_name
= getenv ("LANG");
882 if (codeset_name
== NULL
)
884 else if (strchr (codeset_name
, '.') != NULL
)
885 codeset_name
= strchr (codeset_name
, '.') + 1;
888 if (strcasecmp (codeset_name
, "UTF-8") == 0
889 || strcasecmp (codeset_name
, "UTF8") == 0)
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa
->map_notascii
= 0;
897 #ifdef RE_ENABLE_I18N
898 if (dfa
->mb_cur_max
> 1)
901 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
906 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
907 if (BE (dfa
->sb_char
== NULL
, 0))
910 /* Clear all bits by, then set those corresponding to single
912 bitset_empty (dfa
->sb_char
);
914 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
915 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
917 wchar_t wch
= __btowc (ch
);
919 dfa
->sb_char
[i
] |= 1 << j
;
921 if (isascii (ch
) && wch
!= (wchar_t) ch
)
922 dfa
->map_notascii
= 1;
929 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
943 dfa
->word_ops_used
= 1;
944 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
945 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
946 if (isalnum (ch
) || ch
== '_')
947 dfa
->word_char
[i
] |= 1 << j
;
950 /* Free the work area which are only used while compiling. */
953 free_workarea_compile (preg
)
956 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
957 bin_tree_storage_t
*storage
, *next
;
958 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
960 next
= storage
->next
;
963 dfa
->str_tree_storage
= NULL
;
964 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
965 dfa
->str_tree
= NULL
;
966 re_free (dfa
->org_indices
);
967 dfa
->org_indices
= NULL
;
970 /* Create initial states for all contexts. */
973 create_initial_state (dfa
)
978 re_node_set init_nodes
;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first
= dfa
->str_tree
->first
->node_idx
;
983 dfa
->init_node
= first
;
984 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
985 if (BE (err
!= REG_NOERROR
, 0))
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa
->nbackref
> 0)
993 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
995 int node_idx
= init_nodes
.elems
[i
];
996 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
999 if (type
!= OP_BACK_REF
)
1001 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1003 re_token_t
*clexp_node
;
1004 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1005 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1006 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1009 if (clexp_idx
== init_nodes
.nelem
)
1012 if (type
== OP_BACK_REF
)
1014 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1015 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1017 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa
->init_state
== NULL
, 0))
1028 if (dfa
->init_state
->has_constraint
)
1030 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1032 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1034 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1038 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1039 || dfa
->init_state_begbuf
== NULL
, 0))
1043 dfa
->init_state_word
= dfa
->init_state_nl
1044 = dfa
->init_state_begbuf
= dfa
->init_state
;
1046 re_node_set_free (&init_nodes
);
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1059 int node
, i
, mb_chars
= 0, has_period
= 0;
1061 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1062 switch (dfa
->nodes
[node
].type
)
1065 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1069 switch (dfa
->nodes
[node
].opr
.idx
)
1077 /* Word anchors etc. cannot be handled. */
1087 case OP_DUP_ASTERISK
:
1088 case OP_OPEN_SUBEXP
:
1089 case OP_CLOSE_SUBEXP
:
1091 case COMPLEX_BRACKET
:
1093 case SIMPLE_BRACKET
:
1094 /* Just double check. */
1095 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1096 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1103 if (mb_chars
|| has_period
)
1104 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1106 if (dfa
->nodes
[node
].type
== CHARACTER
1107 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1108 dfa
->nodes
[node
].mb_partial
= 0;
1109 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1110 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1113 /* The search can be in single byte locale. */
1114 dfa
->mb_cur_max
= 1;
1116 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1123 static reg_errcode_t
1127 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1130 /* Allocate arrays. */
1131 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1132 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1133 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1134 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1135 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1136 || dfa
->eclosures
== NULL
, 0))
1139 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1140 if (dfa
->subexp_map
!= NULL
)
1143 for (i
= 0; i
< preg
->re_nsub
; i
++)
1144 dfa
->subexp_map
[i
] = i
;
1145 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1146 for (i
= 0; i
< preg
->re_nsub
; i
++)
1147 if (dfa
->subexp_map
[i
] != i
)
1149 if (i
== preg
->re_nsub
)
1151 free (dfa
->subexp_map
);
1152 dfa
->subexp_map
= NULL
;
1156 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1157 if (BE (ret
!= REG_NOERROR
, 0))
1159 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1160 if (BE (ret
!= REG_NOERROR
, 0))
1162 preorder (dfa
->str_tree
, calc_next
, dfa
);
1163 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1164 if (BE (ret
!= REG_NOERROR
, 0))
1166 ret
= calc_eclosure (dfa
);
1167 if (BE (ret
!= REG_NOERROR
, 0))
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1175 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1176 if (BE (dfa
->inveclosures
== NULL
, 0))
1178 ret
= calc_inveclosure (dfa
);
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (root
, fn
, extra
)
1190 reg_errcode_t (fn (void *, bin_tree_t
*));
1193 bin_tree_t
*node
, *prev
;
1195 for (node
= root
; ; )
1197 /* Descend down the tree, preferably to the left (or to the right
1198 if that's the only child). */
1199 while (node
->left
|| node
->right
)
1207 reg_errcode_t err
= fn (extra
, node
);
1208 if (BE (err
!= REG_NOERROR
, 0))
1210 if (node
->parent
== NULL
)
1213 node
= node
->parent
;
1215 /* Go up while we have a node that is reached from the right. */
1216 while (node
->right
== prev
|| node
->right
== NULL
);
1221 static reg_errcode_t
1222 preorder (root
, fn
, extra
)
1224 reg_errcode_t (fn (void *, bin_tree_t
*));
1229 for (node
= root
; ; )
1231 reg_errcode_t err
= fn (extra
, node
);
1232 if (BE (err
!= REG_NOERROR
, 0))
1235 /* Go to the left node, or up and to the right. */
1240 bin_tree_t
*prev
= NULL
;
1241 while (node
->right
== prev
|| node
->right
== NULL
)
1244 node
= node
->parent
;
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1255 backreferences as well. Requires a preorder visit. */
1256 static reg_errcode_t
1257 optimize_subexps (extra
, node
)
1261 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1263 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1265 int idx
= node
->token
.opr
.idx
;
1266 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1267 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1270 else if (node
->token
.type
== SUBEXP
1271 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1273 int other_idx
= node
->left
->token
.opr
.idx
;
1275 node
->left
= node
->left
->left
;
1277 node
->left
->parent
= node
;
1279 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1280 if (other_idx
< 8 * sizeof (dfa
->used_bkref_map
))
1281 dfa
->used_bkref_map
&= ~(1 << other_idx
);
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1289 static reg_errcode_t
1290 lower_subexps (extra
, node
)
1294 regex_t
*preg
= (regex_t
*) extra
;
1295 reg_errcode_t err
= REG_NOERROR
;
1297 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1299 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1301 node
->left
->parent
= node
;
1303 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1305 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1307 node
->right
->parent
= node
;
1314 lower_subexp (err
, preg
, node
)
1319 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1320 bin_tree_t
*body
= node
->left
;
1321 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1324 && (node
->token
.opr
.idx
>= 8 * sizeof (dfa
->used_bkref_map
)
1325 || !(dfa
->used_bkref_map
& (1 << node
->token
.opr
.idx
))))
1328 /* Convert the SUBEXP node to the concatenation of an
1329 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1330 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1331 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1332 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1333 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1334 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1340 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1341 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1345 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1346 nodes. Requires a postorder visit. */
1347 static reg_errcode_t
1348 calc_first (extra
, node
)
1352 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1353 if (node
->token
.type
== CONCAT
)
1355 node
->first
= node
->left
->first
;
1356 node
->node_idx
= node
->left
->node_idx
;
1361 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1362 if (BE (node
->node_idx
== -1, 0))
1368 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1369 static reg_errcode_t
1370 calc_next (extra
, node
)
1374 switch (node
->token
.type
)
1376 case OP_DUP_ASTERISK
:
1377 node
->left
->next
= node
;
1380 node
->left
->next
= node
->right
->first
;
1381 node
->right
->next
= node
->next
;
1385 node
->left
->next
= node
->next
;
1387 node
->right
->next
= node
->next
;
1393 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1394 static reg_errcode_t
1395 link_nfa_nodes (extra
, node
)
1399 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1400 int idx
= node
->node_idx
;
1401 reg_errcode_t err
= REG_NOERROR
;
1403 switch (node
->token
.type
)
1409 assert (node
->next
== NULL
);
1412 case OP_DUP_ASTERISK
:
1416 dfa
->has_plural_match
= 1;
1417 if (node
->left
!= NULL
)
1418 left
= node
->left
->first
->node_idx
;
1420 left
= node
->next
->node_idx
;
1421 if (node
->right
!= NULL
)
1422 right
= node
->right
->first
->node_idx
;
1424 right
= node
->next
->node_idx
;
1426 assert (right
> -1);
1427 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1432 case OP_OPEN_SUBEXP
:
1433 case OP_CLOSE_SUBEXP
:
1434 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1438 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1439 if (node
->token
.type
== OP_BACK_REF
)
1440 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1444 assert (!IS_EPSILON_NODE (node
->token
.type
));
1445 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454 to their own constraint. */
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1460 int top_org_node
, top_clone_node
, root_node
;
1461 unsigned int init_constraint
;
1464 int org_node
, clone_node
, ret
;
1465 unsigned int constraint
= init_constraint
;
1466 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1468 int org_dest
, clone_dest
;
1469 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1471 /* If the back reference epsilon-transit, its destination must
1472 also have the constraint. Then duplicate the epsilon closure
1473 of the destination of the back reference, and store it in
1474 edests of the back reference. */
1475 org_dest
= dfa
->nexts
[org_node
];
1476 re_node_set_empty (dfa
->edests
+ clone_node
);
1477 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1478 if (BE (err
!= REG_NOERROR
, 0))
1480 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1481 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1482 if (BE (ret
< 0, 0))
1485 else if (dfa
->edests
[org_node
].nelem
== 0)
1487 /* In case of the node can't epsilon-transit, don't duplicate the
1488 destination and store the original destination as the
1489 destination of the node. */
1490 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1493 else if (dfa
->edests
[org_node
].nelem
== 1)
1495 /* In case of the node can epsilon-transit, and it has only one
1497 org_dest
= dfa
->edests
[org_node
].elems
[0];
1498 re_node_set_empty (dfa
->edests
+ clone_node
);
1499 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1501 /* In case of the node has another constraint, append it. */
1502 if (org_node
== root_node
&& clone_node
!= org_node
)
1504 /* ...but if the node is root_node itself, it means the
1505 epsilon closure have a loop, then tie it to the
1506 destination of the root_node. */
1507 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1509 if (BE (ret
< 0, 0))
1513 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1515 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1516 if (BE (err
!= REG_NOERROR
, 0))
1518 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1519 if (BE (ret
< 0, 0))
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1526 org_dest
= dfa
->edests
[org_node
].elems
[0];
1527 re_node_set_empty (dfa
->edests
+ clone_node
);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1530 if (clone_dest
== -1)
1532 /* There are no such a duplicated node, create a new one. */
1533 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1534 if (BE (err
!= REG_NOERROR
, 0))
1536 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1537 if (BE (ret
< 0, 0))
1539 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1540 root_node
, constraint
);
1541 if (BE (err
!= REG_NOERROR
, 0))
1546 /* There are a duplicated node which satisfy the constraint,
1547 use it to avoid infinite loop. */
1548 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1549 if (BE (ret
< 0, 0))
1553 org_dest
= dfa
->edests
[org_node
].elems
[1];
1554 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1555 if (BE (err
!= REG_NOERROR
, 0))
1557 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1558 if (BE (ret
< 0, 0))
1561 org_node
= org_dest
;
1562 clone_node
= clone_dest
;
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568 satisfies the constraint CONSTRAINT. */
1571 search_duplicated_node (dfa
, org_node
, constraint
)
1574 unsigned int constraint
;
1577 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1579 if (org_node
== dfa
->org_indices
[idx
]
1580 && constraint
== dfa
->nodes
[idx
].constraint
)
1581 return idx
; /* Found. */
1583 return -1; /* Not found. */
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588 otherwise return the error code. */
1590 static reg_errcode_t
1591 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1593 int *new_idx
, org_idx
;
1594 unsigned int constraint
;
1596 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1597 if (BE (dup_idx
== -1, 0))
1599 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1600 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1601 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1602 dfa
->nodes
[dup_idx
].duplicated
= 1;
1604 /* Store the index of the original node. */
1605 dfa
->org_indices
[dup_idx
] = org_idx
;
1610 static reg_errcode_t
1611 calc_inveclosure (dfa
)
1615 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1616 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1618 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1620 int *elems
= dfa
->eclosures
[src
].elems
;
1621 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1623 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1624 if (BE (ret
== -1, 0))
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1638 int node_idx
, incomplete
;
1640 assert (dfa
->nodes_len
> 0);
1643 /* For each nodes, calculate epsilon closure. */
1644 for (node_idx
= 0; ; ++node_idx
)
1647 re_node_set eclosure_elem
;
1648 if (node_idx
== dfa
->nodes_len
)
1657 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1660 /* If we have already calculated, skip it. */
1661 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1665 if (BE (err
!= REG_NOERROR
, 0))
1668 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1671 re_node_set_free (&eclosure_elem
);
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1681 re_node_set
*new_set
;
1686 unsigned int constraint
;
1688 re_node_set eclosure
;
1690 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1691 if (BE (err
!= REG_NOERROR
, 0))
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa
->eclosures
[node
].nelem
= -1;
1698 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1699 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1700 /* If the current node has constraints, duplicate all nodes.
1701 Since they must inherit the constraints. */
1703 && dfa
->edests
[node
].nelem
1704 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1706 int org_node
, cur_node
;
1707 org_node
= cur_node
= node
;
1708 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1709 if (BE (err
!= REG_NOERROR
, 0))
1713 /* Expand each epsilon destination nodes. */
1714 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1715 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1717 re_node_set eclosure_elem
;
1718 int edest
= dfa
->edests
[node
].elems
[i
];
1719 /* If calculating the epsilon closure of `edest' is in progress,
1720 return intermediate result. */
1721 if (dfa
->eclosures
[edest
].nelem
== -1)
1726 /* If we haven't calculated the epsilon closure of `edest' yet,
1727 calculate now. Otherwise use calculated epsilon closure. */
1728 if (dfa
->eclosures
[edest
].nelem
== 0)
1730 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1731 if (BE (err
!= REG_NOERROR
, 0))
1735 eclosure_elem
= dfa
->eclosures
[edest
];
1736 /* Merge the epsilon closure of `edest'. */
1737 re_node_set_merge (&eclosure
, &eclosure_elem
);
1738 /* If the epsilon closure of `edest' is incomplete,
1739 the epsilon closure of this node is also incomplete. */
1740 if (dfa
->eclosures
[edest
].nelem
== 0)
1743 re_node_set_free (&eclosure_elem
);
1747 /* Epsilon closures include itself. */
1748 re_node_set_insert (&eclosure
, node
);
1749 if (incomplete
&& !root
)
1750 dfa
->eclosures
[node
].nelem
= 0;
1752 dfa
->eclosures
[node
] = eclosure
;
1753 *new_set
= eclosure
;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1763 fetch_token (result
, input
, syntax
)
1766 reg_syntax_t syntax
;
1768 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1771 /* Peek a token from INPUT, and return the length of the token.
1772 We must not use this function inside bracket expressions. */
1775 peek_token (token
, input
, syntax
)
1778 reg_syntax_t syntax
;
1782 if (re_string_eoi (input
))
1784 token
->type
= END_OF_RE
;
1788 c
= re_string_peek_byte (input
, 0);
1791 token
->word_char
= 0;
1792 #ifdef RE_ENABLE_I18N
1793 token
->mb_partial
= 0;
1794 if (input
->mb_cur_max
> 1 &&
1795 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1797 token
->type
= CHARACTER
;
1798 token
->mb_partial
= 1;
1805 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1807 token
->type
= BACK_SLASH
;
1811 c2
= re_string_peek_byte_case (input
, 1);
1813 token
->type
= CHARACTER
;
1814 #ifdef RE_ENABLE_I18N
1815 if (input
->mb_cur_max
> 1)
1817 wint_t wc
= re_string_wchar_at (input
,
1818 re_string_cur_idx (input
) + 1);
1819 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1823 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1828 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1829 token
->type
= OP_ALT
;
1831 case '1': case '2': case '3': case '4': case '5':
1832 case '6': case '7': case '8': case '9':
1833 if (!(syntax
& RE_NO_BK_REFS
))
1835 token
->type
= OP_BACK_REF
;
1836 token
->opr
.idx
= c2
- '1';
1840 if (!(syntax
& RE_NO_GNU_OPS
))
1842 token
->type
= ANCHOR
;
1843 token
->opr
.ctx_type
= WORD_FIRST
;
1847 if (!(syntax
& RE_NO_GNU_OPS
))
1849 token
->type
= ANCHOR
;
1850 token
->opr
.ctx_type
= WORD_LAST
;
1854 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= ANCHOR
;
1857 token
->opr
.ctx_type
= WORD_DELIM
;
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= ANCHOR
;
1864 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1869 token
->type
= OP_WORD
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= OP_NOTWORD
;
1876 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= OP_SPACE
;
1880 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= OP_NOTSPACE
;
1884 if (!(syntax
& RE_NO_GNU_OPS
))
1886 token
->type
= ANCHOR
;
1887 token
->opr
.ctx_type
= BUF_FIRST
;
1891 if (!(syntax
& RE_NO_GNU_OPS
))
1893 token
->type
= ANCHOR
;
1894 token
->opr
.ctx_type
= BUF_LAST
;
1898 if (!(syntax
& RE_NO_BK_PARENS
))
1899 token
->type
= OP_OPEN_SUBEXP
;
1902 if (!(syntax
& RE_NO_BK_PARENS
))
1903 token
->type
= OP_CLOSE_SUBEXP
;
1906 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1907 token
->type
= OP_DUP_PLUS
;
1910 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1911 token
->type
= OP_DUP_QUESTION
;
1914 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1915 token
->type
= OP_OPEN_DUP_NUM
;
1918 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1919 token
->type
= OP_CLOSE_DUP_NUM
;
1927 token
->type
= CHARACTER
;
1928 #ifdef RE_ENABLE_I18N
1929 if (input
->mb_cur_max
> 1)
1931 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1932 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1936 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1941 if (syntax
& RE_NEWLINE_ALT
)
1942 token
->type
= OP_ALT
;
1945 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1946 token
->type
= OP_ALT
;
1949 token
->type
= OP_DUP_ASTERISK
;
1952 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1953 token
->type
= OP_DUP_PLUS
;
1956 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1957 token
->type
= OP_DUP_QUESTION
;
1960 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1961 token
->type
= OP_OPEN_DUP_NUM
;
1964 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1965 token
->type
= OP_CLOSE_DUP_NUM
;
1968 if (syntax
& RE_NO_BK_PARENS
)
1969 token
->type
= OP_OPEN_SUBEXP
;
1972 if (syntax
& RE_NO_BK_PARENS
)
1973 token
->type
= OP_CLOSE_SUBEXP
;
1976 token
->type
= OP_OPEN_BRACKET
;
1979 token
->type
= OP_PERIOD
;
1982 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1983 re_string_cur_idx (input
) != 0)
1985 char prev
= re_string_peek_byte (input
, -1);
1986 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1989 token
->type
= ANCHOR
;
1990 token
->opr
.ctx_type
= LINE_FIRST
;
1993 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1994 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1997 re_string_skip_bytes (input
, 1);
1998 peek_token (&next
, input
, syntax
);
1999 re_string_skip_bytes (input
, -1);
2000 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2003 token
->type
= ANCHOR
;
2004 token
->opr
.ctx_type
= LINE_LAST
;
2012 /* Peek a token from INPUT, and return the length of the token.
2013 We must not use this function out of bracket expressions. */
2016 peek_token_bracket (token
, input
, syntax
)
2019 reg_syntax_t syntax
;
2022 if (re_string_eoi (input
))
2024 token
->type
= END_OF_RE
;
2027 c
= re_string_peek_byte (input
, 0);
2030 #ifdef RE_ENABLE_I18N
2031 if (input
->mb_cur_max
> 1 &&
2032 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2034 token
->type
= CHARACTER
;
2037 #endif /* RE_ENABLE_I18N */
2039 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2040 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2042 /* In this case, '\' escape a character. */
2044 re_string_skip_bytes (input
, 1);
2045 c2
= re_string_peek_byte (input
, 0);
2047 token
->type
= CHARACTER
;
2050 if (c
== '[') /* '[' is a special char in a bracket exps. */
2054 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2055 c2
= re_string_peek_byte (input
, 1);
2063 token
->type
= OP_OPEN_COLL_ELEM
;
2066 token
->type
= OP_OPEN_EQUIV_CLASS
;
2069 if (syntax
& RE_CHAR_CLASSES
)
2071 token
->type
= OP_OPEN_CHAR_CLASS
;
2074 /* else fall through. */
2076 token
->type
= CHARACTER
;
2086 token
->type
= OP_CHARSET_RANGE
;
2089 token
->type
= OP_CLOSE_BRACKET
;
2092 token
->type
= OP_NON_MATCH_LIST
;
2095 token
->type
= CHARACTER
;
2100 /* Functions for parser. */
2102 /* Entry point of the parser.
2103 Parse the regular expression REGEXP and return the structure tree.
2104 If an error is occured, ERR is set by error code, and return NULL.
2105 This function build the following tree, from regular expression <reg_exp>:
2111 CAT means concatenation.
2112 EOR means end of regular expression. */
2115 parse (regexp
, preg
, syntax
, err
)
2116 re_string_t
*regexp
;
2118 reg_syntax_t syntax
;
2121 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2122 bin_tree_t
*tree
, *eor
, *root
;
2123 re_token_t current_token
;
2124 dfa
->syntax
= syntax
;
2125 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2126 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2127 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2129 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2131 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2134 if (BE (eor
== NULL
|| root
== NULL
, 0))
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2149 ALT means alternative, which represents the operator `|'. */
2152 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2153 re_string_t
*regexp
;
2156 reg_syntax_t syntax
;
2160 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2161 bin_tree_t
*tree
, *branch
= NULL
;
2162 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2163 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2166 while (token
->type
== OP_ALT
)
2168 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2169 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2170 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2172 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2173 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2178 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2179 if (BE (tree
== NULL
, 0))
2188 /* This function build the following tree, from regular expression
2195 CAT means concatenation. */
2198 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2199 re_string_t
*regexp
;
2202 reg_syntax_t syntax
;
2206 bin_tree_t
*tree
, *exp
;
2207 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2208 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2209 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2212 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2213 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2215 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2216 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2220 if (tree
!= NULL
&& exp
!= NULL
)
2222 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2229 else if (tree
== NULL
)
2231 /* Otherwise exp == NULL, we don't need to create new tree. */
2236 /* This function build the following tree, from regular expression a*:
2243 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2244 re_string_t
*regexp
;
2247 reg_syntax_t syntax
;
2251 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2253 switch (token
->type
)
2256 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2257 if (BE (tree
== NULL
, 0))
2262 #ifdef RE_ENABLE_I18N
2263 if (dfa
->mb_cur_max
> 1)
2265 while (!re_string_eoi (regexp
)
2266 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2268 bin_tree_t
*mbc_remain
;
2269 fetch_token (token
, regexp
, syntax
);
2270 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2271 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2272 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2281 case OP_OPEN_SUBEXP
:
2282 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2283 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2286 case OP_OPEN_BRACKET
:
2287 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2288 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2292 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2297 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2298 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2299 if (BE (tree
== NULL
, 0))
2305 dfa
->has_mb_node
= 1;
2307 case OP_OPEN_DUP_NUM
:
2308 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2314 case OP_DUP_ASTERISK
:
2316 case OP_DUP_QUESTION
:
2317 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2322 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2324 fetch_token (token
, regexp
, syntax
);
2325 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2327 /* else fall through */
2328 case OP_CLOSE_SUBEXP
:
2329 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2330 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2335 /* else fall through */
2336 case OP_CLOSE_DUP_NUM
:
2337 /* We treat it as a normal character. */
2339 /* Then we can these characters as normal characters. */
2340 token
->type
= CHARACTER
;
2341 /* mb_partial and word_char bits should be initialized already
2343 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2344 if (BE (tree
== NULL
, 0))
2351 if ((token
->opr
.ctx_type
2352 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2353 && dfa
->word_ops_used
== 0)
2354 init_word_char (dfa
);
2355 if (token
->opr
.ctx_type
== WORD_DELIM
2356 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2358 bin_tree_t
*tree_first
, *tree_last
;
2359 if (token
->opr
.ctx_type
== WORD_DELIM
)
2361 token
->opr
.ctx_type
= WORD_FIRST
;
2362 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2363 token
->opr
.ctx_type
= WORD_LAST
;
2367 token
->opr
.ctx_type
= INSIDE_WORD
;
2368 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2369 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2371 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2372 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2373 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2381 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2382 if (BE (tree
== NULL
, 0))
2388 /* We must return here, since ANCHORs can't be followed
2389 by repetition operators.
2390 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2391 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2392 fetch_token (token
, regexp
, syntax
);
2395 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2396 if (BE (tree
== NULL
, 0))
2401 if (dfa
->mb_cur_max
> 1)
2402 dfa
->has_mb_node
= 1;
2406 tree
= build_charclass_op (dfa
, regexp
->trans
,
2407 (const unsigned char *) "alnum",
2408 (const unsigned char *) "_",
2409 token
->type
== OP_NOTWORD
, err
);
2410 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2415 tree
= build_charclass_op (dfa
, regexp
->trans
,
2416 (const unsigned char *) "space",
2417 (const unsigned char *) "",
2418 token
->type
== OP_NOTSPACE
, err
);
2419 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2429 /* Must not happen? */
2435 fetch_token (token
, regexp
, syntax
);
2437 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2438 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2440 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2441 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2443 /* In BRE consecutive duplications are not allowed. */
2444 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2445 && (token
->type
== OP_DUP_ASTERISK
2446 || token
->type
== OP_OPEN_DUP_NUM
))
2456 /* This function build the following tree, from regular expression
2464 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2465 re_string_t
*regexp
;
2468 reg_syntax_t syntax
;
2472 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2475 cur_nsub
= preg
->re_nsub
++;
2477 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2479 /* The subexpression may be a null string. */
2480 if (token
->type
== OP_CLOSE_SUBEXP
)
2484 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2485 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2487 if (BE (*err
!= REG_NOERROR
, 0))
2490 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2492 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2493 if (BE (tree
== NULL
, 0))
2498 tree
->token
.opr
.idx
= cur_nsub
;
2502 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2505 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2507 re_string_t
*regexp
;
2510 reg_syntax_t syntax
;
2513 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2514 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2515 re_token_t start_token
= *token
;
2517 if (token
->type
== OP_OPEN_DUP_NUM
)
2520 start
= fetch_number (regexp
, token
, syntax
);
2523 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2524 start
= 0; /* We treat "{,m}" as "{0,m}". */
2527 *err
= REG_BADBR
; /* <re>{} is invalid. */
2531 if (BE (start
!= -2, 1))
2533 /* We treat "{n}" as "{n,n}". */
2534 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2535 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2536 ? fetch_number (regexp
, token
, syntax
) : -2));
2538 if (BE (start
== -2 || end
== -2, 0))
2540 /* Invalid sequence. */
2541 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2543 if (token
->type
== END_OF_RE
)
2551 /* If the syntax bit is set, rollback. */
2552 re_string_set_index (regexp
, start_idx
);
2553 *token
= start_token
;
2554 token
->type
= CHARACTER
;
2555 /* mb_partial and word_char bits should be already initialized by
2560 if (BE (end
!= -1 && start
> end
, 0))
2562 /* First number greater than second. */
2569 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2570 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2573 fetch_token (token
, regexp
, syntax
);
2575 if (BE (elem
== NULL
, 0))
2577 if (BE (start
== 0 && end
== 0, 0))
2579 postorder (elem
, free_tree
, NULL
);
2583 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2584 if (BE (start
> 0, 0))
2587 for (i
= 2; i
<= start
; ++i
)
2589 elem
= duplicate_tree (elem
, dfa
);
2590 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2591 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2592 goto parse_dup_op_espace
;
2598 /* Duplicate ELEM before it is marked optional. */
2599 elem
= duplicate_tree (elem
, dfa
);
2605 if (elem
->token
.type
== SUBEXP
)
2606 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2608 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2609 if (BE (tree
== NULL
, 0))
2610 goto parse_dup_op_espace
;
2612 /* This loop is actually executed only when end != -1,
2613 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2614 already created the start+1-th copy. */
2615 for (i
= start
+ 2; i
<= end
; ++i
)
2617 elem
= duplicate_tree (elem
, dfa
);
2618 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2619 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2620 goto parse_dup_op_espace
;
2622 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2623 if (BE (tree
== NULL
, 0))
2624 goto parse_dup_op_espace
;
2628 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2632 parse_dup_op_espace
:
2637 /* Size of the names for collating symbol/equivalence_class/character_class.
2638 I'm not sure, but maybe enough. */
2639 #define BRACKET_NAME_BUF_SIZE 32
2642 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2643 Build the range expression which starts from START_ELEM, and ends
2644 at END_ELEM. The result are written to MBCSET and SBCSET.
2645 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2646 mbcset->range_ends, is a pointer argument sinse we may
2649 static reg_errcode_t
2650 # ifdef RE_ENABLE_I18N
2651 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2652 re_charset_t
*mbcset
;
2654 # else /* not RE_ENABLE_I18N */
2655 build_range_exp (sbcset
, start_elem
, end_elem
)
2656 # endif /* not RE_ENABLE_I18N */
2657 re_bitset_ptr_t sbcset
;
2658 bracket_elem_t
*start_elem
, *end_elem
;
2660 unsigned int start_ch
, end_ch
;
2661 /* Equivalence Classes and Character Classes can't be a range start/end. */
2662 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2663 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2667 /* We can handle no multi character collating elements without libc
2669 if (BE ((start_elem
->type
== COLL_SYM
2670 && strlen ((char *) start_elem
->opr
.name
) > 1)
2671 || (end_elem
->type
== COLL_SYM
2672 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2673 return REG_ECOLLATE
;
2675 # ifdef RE_ENABLE_I18N
2677 wchar_t wc
, start_wc
, end_wc
;
2678 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2680 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2681 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2683 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2684 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2686 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2687 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2688 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2689 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2690 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2691 return REG_ECOLLATE
;
2692 cmp_buf
[0] = start_wc
;
2693 cmp_buf
[4] = end_wc
;
2694 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2697 /* Got valid collation sequence values, add them as a new entry.
2698 However, for !_LIBC we have no collation elements: if the
2699 character set is single byte, the single byte character set
2700 that we build below suffices. parse_bracket_exp passes
2701 no MBCSET if dfa->mb_cur_max == 1. */
2704 /* Check the space of the arrays. */
2705 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2707 /* There is not enough space, need realloc. */
2708 wchar_t *new_array_start
, *new_array_end
;
2711 /* +1 in case of mbcset->nranges is 0. */
2712 new_nranges
= 2 * mbcset
->nranges
+ 1;
2713 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2714 are NULL if *range_alloc == 0. */
2715 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2717 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2720 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2723 mbcset
->range_starts
= new_array_start
;
2724 mbcset
->range_ends
= new_array_end
;
2725 *range_alloc
= new_nranges
;
2728 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2729 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2732 /* Build the table for single byte characters. */
2733 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2736 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2737 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2738 bitset_set (sbcset
, wc
);
2741 # else /* not RE_ENABLE_I18N */
2744 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2745 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2747 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2748 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2750 if (start_ch
> end_ch
)
2752 /* Build the table for single byte characters. */
2753 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2754 if (start_ch
<= ch
&& ch
<= end_ch
)
2755 bitset_set (sbcset
, ch
);
2757 # endif /* not RE_ENABLE_I18N */
2760 #endif /* not _LIBC */
2763 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2764 Build the collating element which is represented by NAME.
2765 The result are written to MBCSET and SBCSET.
2766 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2767 pointer argument since we may update it. */
2769 static reg_errcode_t
2770 # ifdef RE_ENABLE_I18N
2771 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2772 re_charset_t
*mbcset
;
2773 int *coll_sym_alloc
;
2774 # else /* not RE_ENABLE_I18N */
2775 build_collating_symbol (sbcset
, name
)
2776 # endif /* not RE_ENABLE_I18N */
2777 re_bitset_ptr_t sbcset
;
2778 const unsigned char *name
;
2780 size_t name_len
= strlen ((const char *) name
);
2781 if (BE (name_len
!= 1, 0))
2782 return REG_ECOLLATE
;
2785 bitset_set (sbcset
, name
[0]);
2789 #endif /* not _LIBC */
2791 /* This function parse bracket expression like "[abc]", "[a-c]",
2795 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2796 re_string_t
*regexp
;
2799 reg_syntax_t syntax
;
2803 const unsigned char *collseqmb
;
2804 const char *collseqwc
;
2807 const int32_t *symb_table
;
2808 const unsigned char *extra
;
2810 /* Local function for parse_bracket_exp used in _LIBC environement.
2811 Seek the collating symbol entry correspondings to NAME.
2812 Return the index of the symbol in the SYMB_TABLE. */
2815 __attribute ((always_inline
))
2816 seek_collating_symbol_entry (name
, name_len
)
2817 const unsigned char *name
;
2820 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2821 int32_t elem
= hash
% table_size
;
2822 int32_t second
= hash
% (table_size
- 2);
2823 while (symb_table
[2 * elem
] != 0)
2825 /* First compare the hashing value. */
2826 if (symb_table
[2 * elem
] == hash
2827 /* Compare the length of the name. */
2828 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2829 /* Compare the name. */
2830 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2833 /* Yep, this is the entry. */
2843 /* Local function for parse_bracket_exp used in _LIBC environement.
2844 Look up the collation sequence value of BR_ELEM.
2845 Return the value if succeeded, UINT_MAX otherwise. */
2847 auto inline unsigned int
2848 __attribute ((always_inline
))
2849 lookup_collation_sequence_value (br_elem
)
2850 bracket_elem_t
*br_elem
;
2852 if (br_elem
->type
== SB_CHAR
)
2855 if (MB_CUR_MAX == 1)
2858 return collseqmb
[br_elem
->opr
.ch
];
2861 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2862 return __collseq_table_lookup (collseqwc
, wc
);
2865 else if (br_elem
->type
== MB_CHAR
)
2867 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2869 else if (br_elem
->type
== COLL_SYM
)
2871 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2875 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2877 if (symb_table
[2 * elem
] != 0)
2879 /* We found the entry. */
2880 idx
= symb_table
[2 * elem
+ 1];
2881 /* Skip the name of collating element name. */
2882 idx
+= 1 + extra
[idx
];
2883 /* Skip the byte sequence of the collating element. */
2884 idx
+= 1 + extra
[idx
];
2885 /* Adjust for the alignment. */
2886 idx
= (idx
+ 3) & ~3;
2887 /* Skip the multibyte collation sequence value. */
2888 idx
+= sizeof (unsigned int);
2889 /* Skip the wide char sequence of the collating element. */
2890 idx
+= sizeof (unsigned int) *
2891 (1 + *(unsigned int *) (extra
+ idx
));
2892 /* Return the collation sequence value. */
2893 return *(unsigned int *) (extra
+ idx
);
2895 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2897 /* No valid character. Match it as a single byte
2899 return collseqmb
[br_elem
->opr
.name
[0]];
2902 else if (sym_name_len
== 1)
2903 return collseqmb
[br_elem
->opr
.name
[0]];
2908 /* Local function for parse_bracket_exp used in _LIBC environement.
2909 Build the range expression which starts from START_ELEM, and ends
2910 at END_ELEM. The result are written to MBCSET and SBCSET.
2911 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2912 mbcset->range_ends, is a pointer argument sinse we may
2915 auto inline reg_errcode_t
2916 __attribute ((always_inline
))
2917 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2918 re_charset_t
*mbcset
;
2920 re_bitset_ptr_t sbcset
;
2921 bracket_elem_t
*start_elem
, *end_elem
;
2924 uint32_t start_collseq
;
2925 uint32_t end_collseq
;
2927 /* Equivalence Classes and Character Classes can't be a range
2929 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2930 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2934 start_collseq
= lookup_collation_sequence_value (start_elem
);
2935 end_collseq
= lookup_collation_sequence_value (end_elem
);
2936 /* Check start/end collation sequence values. */
2937 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2938 return REG_ECOLLATE
;
2939 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2942 /* Got valid collation sequence values, add them as a new entry.
2943 However, if we have no collation elements, and the character set
2944 is single byte, the single byte character set that we
2945 build below suffices. */
2946 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2948 /* Check the space of the arrays. */
2949 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2951 /* There is not enough space, need realloc. */
2952 uint32_t *new_array_start
;
2953 uint32_t *new_array_end
;
2956 /* +1 in case of mbcset->nranges is 0. */
2957 new_nranges
= 2 * mbcset
->nranges
+ 1;
2958 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2960 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2963 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2966 mbcset
->range_starts
= new_array_start
;
2967 mbcset
->range_ends
= new_array_end
;
2968 *range_alloc
= new_nranges
;
2971 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2972 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2975 /* Build the table for single byte characters. */
2976 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2978 uint32_t ch_collseq
;
2980 if (MB_CUR_MAX == 1)
2983 ch_collseq
= collseqmb
[ch
];
2985 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2986 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2987 bitset_set (sbcset
, ch
);
2992 /* Local function for parse_bracket_exp used in _LIBC environement.
2993 Build the collating element which is represented by NAME.
2994 The result are written to MBCSET and SBCSET.
2995 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2996 pointer argument sinse we may update it. */
2998 auto inline reg_errcode_t
2999 __attribute ((always_inline
))
3000 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3001 re_charset_t
*mbcset
;
3002 int *coll_sym_alloc
;
3003 re_bitset_ptr_t sbcset
;
3004 const unsigned char *name
;
3007 size_t name_len
= strlen ((const char *) name
);
3010 elem
= seek_collating_symbol_entry (name
, name_len
);
3011 if (symb_table
[2 * elem
] != 0)
3013 /* We found the entry. */
3014 idx
= symb_table
[2 * elem
+ 1];
3015 /* Skip the name of collating element name. */
3016 idx
+= 1 + extra
[idx
];
3018 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3020 /* No valid character, treat it as a normal
3022 bitset_set (sbcset
, name
[0]);
3026 return REG_ECOLLATE
;
3028 /* Got valid collation sequence, add it as a new entry. */
3029 /* Check the space of the arrays. */
3030 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3032 /* Not enough, realloc it. */
3033 /* +1 in case of mbcset->ncoll_syms is 0. */
3034 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3035 /* Use realloc since mbcset->coll_syms is NULL
3037 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3038 new_coll_sym_alloc
);
3039 if (BE (new_coll_syms
== NULL
, 0))
3041 mbcset
->coll_syms
= new_coll_syms
;
3042 *coll_sym_alloc
= new_coll_sym_alloc
;
3044 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3049 if (BE (name_len
!= 1, 0))
3050 return REG_ECOLLATE
;
3053 bitset_set (sbcset
, name
[0]);
3060 re_token_t br_token
;
3061 re_bitset_ptr_t sbcset
;
3062 #ifdef RE_ENABLE_I18N
3063 re_charset_t
*mbcset
;
3064 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3065 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3066 #endif /* not RE_ENABLE_I18N */
3068 bin_tree_t
*work_tree
;
3070 int first_round
= 1;
3072 collseqmb
= (const unsigned char *)
3073 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3074 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3080 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3081 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3082 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3083 _NL_COLLATE_SYMB_TABLEMB
);
3084 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3085 _NL_COLLATE_SYMB_EXTRAMB
);
3088 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3089 #ifdef RE_ENABLE_I18N
3090 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3091 #endif /* RE_ENABLE_I18N */
3092 #ifdef RE_ENABLE_I18N
3093 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3095 if (BE (sbcset
== NULL
, 0))
3096 #endif /* RE_ENABLE_I18N */
3102 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3103 if (BE (token
->type
== END_OF_RE
, 0))
3106 goto parse_bracket_exp_free_return
;
3108 if (token
->type
== OP_NON_MATCH_LIST
)
3110 #ifdef RE_ENABLE_I18N
3111 mbcset
->non_match
= 1;
3112 #endif /* not RE_ENABLE_I18N */
3114 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3115 bitset_set (sbcset
, '\0');
3116 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3117 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3118 if (BE (token
->type
== END_OF_RE
, 0))
3121 goto parse_bracket_exp_free_return
;
3125 /* We treat the first ']' as a normal character. */
3126 if (token
->type
== OP_CLOSE_BRACKET
)
3127 token
->type
= CHARACTER
;
3131 bracket_elem_t start_elem
, end_elem
;
3132 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3133 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3135 int token_len2
= 0, is_range_exp
= 0;
3138 start_elem
.opr
.name
= start_name_buf
;
3139 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3140 syntax
, first_round
);
3141 if (BE (ret
!= REG_NOERROR
, 0))
3144 goto parse_bracket_exp_free_return
;
3148 /* Get information about the next token. We need it in any case. */
3149 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3151 /* Do not check for ranges if we know they are not allowed. */
3152 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3154 if (BE (token
->type
== END_OF_RE
, 0))
3157 goto parse_bracket_exp_free_return
;
3159 if (token
->type
== OP_CHARSET_RANGE
)
3161 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3162 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3163 if (BE (token2
.type
== END_OF_RE
, 0))
3166 goto parse_bracket_exp_free_return
;
3168 if (token2
.type
== OP_CLOSE_BRACKET
)
3170 /* We treat the last '-' as a normal character. */
3171 re_string_skip_bytes (regexp
, -token_len
);
3172 token
->type
= CHARACTER
;
3179 if (is_range_exp
== 1)
3181 end_elem
.opr
.name
= end_name_buf
;
3182 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3184 if (BE (ret
!= REG_NOERROR
, 0))
3187 goto parse_bracket_exp_free_return
;
3190 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3193 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3194 &start_elem
, &end_elem
);
3196 # ifdef RE_ENABLE_I18N
3197 *err
= build_range_exp (sbcset
,
3198 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3199 &range_alloc
, &start_elem
, &end_elem
);
3201 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3203 #endif /* RE_ENABLE_I18N */
3204 if (BE (*err
!= REG_NOERROR
, 0))
3205 goto parse_bracket_exp_free_return
;
3209 switch (start_elem
.type
)
3212 bitset_set (sbcset
, start_elem
.opr
.ch
);
3214 #ifdef RE_ENABLE_I18N
3216 /* Check whether the array has enough space. */
3217 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3219 wchar_t *new_mbchars
;
3220 /* Not enough, realloc it. */
3221 /* +1 in case of mbcset->nmbchars is 0. */
3222 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3223 /* Use realloc since array is NULL if *alloc == 0. */
3224 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3226 if (BE (new_mbchars
== NULL
, 0))
3227 goto parse_bracket_exp_espace
;
3228 mbcset
->mbchars
= new_mbchars
;
3230 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3232 #endif /* RE_ENABLE_I18N */
3234 *err
= build_equiv_class (sbcset
,
3235 #ifdef RE_ENABLE_I18N
3236 mbcset
, &equiv_class_alloc
,
3237 #endif /* RE_ENABLE_I18N */
3238 start_elem
.opr
.name
);
3239 if (BE (*err
!= REG_NOERROR
, 0))
3240 goto parse_bracket_exp_free_return
;
3243 *err
= build_collating_symbol (sbcset
,
3244 #ifdef RE_ENABLE_I18N
3245 mbcset
, &coll_sym_alloc
,
3246 #endif /* RE_ENABLE_I18N */
3247 start_elem
.opr
.name
);
3248 if (BE (*err
!= REG_NOERROR
, 0))
3249 goto parse_bracket_exp_free_return
;
3252 *err
= build_charclass (regexp
->trans
, sbcset
,
3253 #ifdef RE_ENABLE_I18N
3254 mbcset
, &char_class_alloc
,
3255 #endif /* RE_ENABLE_I18N */
3256 start_elem
.opr
.name
, syntax
);
3257 if (BE (*err
!= REG_NOERROR
, 0))
3258 goto parse_bracket_exp_free_return
;
3265 if (BE (token
->type
== END_OF_RE
, 0))
3268 goto parse_bracket_exp_free_return
;
3270 if (token
->type
== OP_CLOSE_BRACKET
)
3274 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3276 /* If it is non-matching list. */
3278 bitset_not (sbcset
);
3280 #ifdef RE_ENABLE_I18N
3281 /* Ensure only single byte characters are set. */
3282 if (dfa
->mb_cur_max
> 1)
3283 bitset_mask (sbcset
, dfa
->sb_char
);
3285 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3286 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3287 || mbcset
->non_match
)))
3289 bin_tree_t
*mbc_tree
;
3291 /* Build a tree for complex bracket. */
3292 dfa
->has_mb_node
= 1;
3293 br_token
.type
= COMPLEX_BRACKET
;
3294 br_token
.opr
.mbcset
= mbcset
;
3295 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3296 if (BE (mbc_tree
== NULL
, 0))
3297 goto parse_bracket_exp_espace
;
3298 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3299 if (sbcset
[sbc_idx
])
3301 /* If there are no bits set in sbcset, there is no point
3302 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3303 if (sbc_idx
< BITSET_UINTS
)
3305 /* Build a tree for simple bracket. */
3306 br_token
.type
= SIMPLE_BRACKET
;
3307 br_token
.opr
.sbcset
= sbcset
;
3308 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3309 if (BE (work_tree
== NULL
, 0))
3310 goto parse_bracket_exp_espace
;
3312 /* Then join them by ALT node. */
3313 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3314 if (BE (work_tree
== NULL
, 0))
3315 goto parse_bracket_exp_espace
;
3320 work_tree
= mbc_tree
;
3324 #endif /* not RE_ENABLE_I18N */
3326 #ifdef RE_ENABLE_I18N
3327 free_charset (mbcset
);
3329 /* Build a tree for simple bracket. */
3330 br_token
.type
= SIMPLE_BRACKET
;
3331 br_token
.opr
.sbcset
= sbcset
;
3332 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3333 if (BE (work_tree
== NULL
, 0))
3334 goto parse_bracket_exp_espace
;
3338 parse_bracket_exp_espace
:
3340 parse_bracket_exp_free_return
:
3342 #ifdef RE_ENABLE_I18N
3343 free_charset (mbcset
);
3344 #endif /* RE_ENABLE_I18N */
3348 /* Parse an element in the bracket expression. */
3350 static reg_errcode_t
3351 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3353 bracket_elem_t
*elem
;
3354 re_string_t
*regexp
;
3358 reg_syntax_t syntax
;
3361 #ifdef RE_ENABLE_I18N
3363 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3364 if (cur_char_size
> 1)
3366 elem
->type
= MB_CHAR
;
3367 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3368 re_string_skip_bytes (regexp
, cur_char_size
);
3371 #endif /* RE_ENABLE_I18N */
3372 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3373 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3374 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3375 return parse_bracket_symbol (elem
, regexp
, token
);
3376 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3378 /* A '-' must only appear as anything but a range indicator before
3379 the closing bracket. Everything else is an error. */
3381 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3382 if (token2
.type
!= OP_CLOSE_BRACKET
)
3383 /* The actual error value is not standardized since this whole
3384 case is undefined. But ERANGE makes good sense. */
3387 elem
->type
= SB_CHAR
;
3388 elem
->opr
.ch
= token
->opr
.c
;
3392 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3393 such as [:<character_class>:], [.<collating_element>.], and
3394 [=<equivalent_class>=]. */
3396 static reg_errcode_t
3397 parse_bracket_symbol (elem
, regexp
, token
)
3398 bracket_elem_t
*elem
;
3399 re_string_t
*regexp
;
3402 unsigned char ch
, delim
= token
->opr
.c
;
3404 if (re_string_eoi(regexp
))
3408 if (i
>= BRACKET_NAME_BUF_SIZE
)
3410 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3411 ch
= re_string_fetch_byte_case (regexp
);
3413 ch
= re_string_fetch_byte (regexp
);
3414 if (re_string_eoi(regexp
))
3416 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3418 elem
->opr
.name
[i
] = ch
;
3420 re_string_skip_bytes (regexp
, 1);
3421 elem
->opr
.name
[i
] = '\0';
3422 switch (token
->type
)
3424 case OP_OPEN_COLL_ELEM
:
3425 elem
->type
= COLL_SYM
;
3427 case OP_OPEN_EQUIV_CLASS
:
3428 elem
->type
= EQUIV_CLASS
;
3430 case OP_OPEN_CHAR_CLASS
:
3431 elem
->type
= CHAR_CLASS
;
3439 /* Helper function for parse_bracket_exp.
3440 Build the equivalence class which is represented by NAME.
3441 The result are written to MBCSET and SBCSET.
3442 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3443 is a pointer argument sinse we may update it. */
3445 static reg_errcode_t
3446 #ifdef RE_ENABLE_I18N
3447 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3448 re_charset_t
*mbcset
;
3449 int *equiv_class_alloc
;
3450 #else /* not RE_ENABLE_I18N */
3451 build_equiv_class (sbcset
, name
)
3452 #endif /* not RE_ENABLE_I18N */
3453 re_bitset_ptr_t sbcset
;
3454 const unsigned char *name
;
3457 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3460 const int32_t *table
, *indirect
;
3461 const unsigned char *weights
, *extra
, *cp
;
3462 unsigned char char_buf
[2];
3466 /* This #include defines a local function! */
3467 # include <locale/weight.h>
3468 /* Calculate the index for equivalence class. */
3470 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3471 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3472 _NL_COLLATE_WEIGHTMB
);
3473 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3474 _NL_COLLATE_EXTRAMB
);
3475 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3476 _NL_COLLATE_INDIRECTMB
);
3477 idx1
= findidx (&cp
);
3478 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3479 /* This isn't a valid character. */
3480 return REG_ECOLLATE
;
3482 /* Build single byte matcing table for this equivalence class. */
3483 char_buf
[1] = (unsigned char) '\0';
3484 len
= weights
[idx1
];
3485 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3489 idx2
= findidx (&cp
);
3494 /* This isn't a valid character. */
3496 if (len
== weights
[idx2
])
3499 while (cnt
<= len
&&
3500 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3504 bitset_set (sbcset
, ch
);
3507 /* Check whether the array has enough space. */
3508 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3510 /* Not enough, realloc it. */
3511 /* +1 in case of mbcset->nequiv_classes is 0. */
3512 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3513 /* Use realloc since the array is NULL if *alloc == 0. */
3514 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3516 new_equiv_class_alloc
);
3517 if (BE (new_equiv_classes
== NULL
, 0))
3519 mbcset
->equiv_classes
= new_equiv_classes
;
3520 *equiv_class_alloc
= new_equiv_class_alloc
;
3522 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3527 if (BE (strlen ((const char *) name
) != 1, 0))
3528 return REG_ECOLLATE
;
3529 bitset_set (sbcset
, *name
);
3534 /* Helper function for parse_bracket_exp.
3535 Build the character class which is represented by NAME.
3536 The result are written to MBCSET and SBCSET.
3537 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3538 is a pointer argument sinse we may update it. */
3540 static reg_errcode_t
3541 #ifdef RE_ENABLE_I18N
3542 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3543 re_charset_t
*mbcset
;
3544 int *char_class_alloc
;
3545 #else /* not RE_ENABLE_I18N */
3546 build_charclass (trans
, sbcset
, class_name
, syntax
)
3547 #endif /* not RE_ENABLE_I18N */
3548 unsigned RE_TRANSLATE_TYPE trans
;
3549 re_bitset_ptr_t sbcset
;
3550 const unsigned char *class_name
;
3551 reg_syntax_t syntax
;
3554 const char *name
= (const char *) class_name
;
3556 /* In case of REG_ICASE "upper" and "lower" match the both of
3557 upper and lower cases. */
3558 if ((syntax
& RE_ICASE
)
3559 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3562 #ifdef RE_ENABLE_I18N
3563 /* Check the space of the arrays. */
3564 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3566 /* Not enough, realloc it. */
3567 /* +1 in case of mbcset->nchar_classes is 0. */
3568 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3569 /* Use realloc since array is NULL if *alloc == 0. */
3570 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3571 new_char_class_alloc
);
3572 if (BE (new_char_classes
== NULL
, 0))
3574 mbcset
->char_classes
= new_char_classes
;
3575 *char_class_alloc
= new_char_class_alloc
;
3577 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3578 #endif /* RE_ENABLE_I18N */
3580 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3581 for (i = 0; i < SBC_MAX; ++i) \
3583 if (ctype_func (i)) \
3585 int ch = trans ? trans[i] : i; \
3586 bitset_set (sbcset, ch); \
3590 if (strcmp (name
, "alnum") == 0)
3591 BUILD_CHARCLASS_LOOP (isalnum
)
3592 else if (strcmp (name
, "cntrl") == 0)
3593 BUILD_CHARCLASS_LOOP (iscntrl
)
3594 else if (strcmp (name
, "lower") == 0)
3595 BUILD_CHARCLASS_LOOP (islower
)
3596 else if (strcmp (name
, "space") == 0)
3597 BUILD_CHARCLASS_LOOP (isspace
)
3598 else if (strcmp (name
, "alpha") == 0)
3599 BUILD_CHARCLASS_LOOP (isalpha
)
3600 else if (strcmp (name
, "digit") == 0)
3601 BUILD_CHARCLASS_LOOP (isdigit
)
3602 else if (strcmp (name
, "print") == 0)
3603 BUILD_CHARCLASS_LOOP (isprint
)
3604 else if (strcmp (name
, "upper") == 0)
3605 BUILD_CHARCLASS_LOOP (isupper
)
3606 else if (strcmp (name
, "blank") == 0)
3607 BUILD_CHARCLASS_LOOP (isblank
)
3608 else if (strcmp (name
, "graph") == 0)
3609 BUILD_CHARCLASS_LOOP (isgraph
)
3610 else if (strcmp (name
, "punct") == 0)
3611 BUILD_CHARCLASS_LOOP (ispunct
)
3612 else if (strcmp (name
, "xdigit") == 0)
3613 BUILD_CHARCLASS_LOOP (isxdigit
)
3621 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3623 unsigned RE_TRANSLATE_TYPE trans
;
3624 const unsigned char *class_name
;
3625 const unsigned char *extra
;
3629 re_bitset_ptr_t sbcset
;
3630 #ifdef RE_ENABLE_I18N
3631 re_charset_t
*mbcset
;
3633 #endif /* not RE_ENABLE_I18N */
3635 re_token_t br_token
;
3638 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3639 #ifdef RE_ENABLE_I18N
3640 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3641 #endif /* RE_ENABLE_I18N */
3643 #ifdef RE_ENABLE_I18N
3644 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3645 #else /* not RE_ENABLE_I18N */
3646 if (BE (sbcset
== NULL
, 0))
3647 #endif /* not RE_ENABLE_I18N */
3655 #ifdef RE_ENABLE_I18N
3657 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3658 bitset_set(cset->sbcset, '\0');
3660 mbcset
->non_match
= 1;
3661 #endif /* not RE_ENABLE_I18N */
3664 /* We don't care the syntax in this case. */
3665 ret
= build_charclass (trans
, sbcset
,
3666 #ifdef RE_ENABLE_I18N
3668 #endif /* RE_ENABLE_I18N */
3671 if (BE (ret
!= REG_NOERROR
, 0))
3674 #ifdef RE_ENABLE_I18N
3675 free_charset (mbcset
);
3676 #endif /* RE_ENABLE_I18N */
3680 /* \w match '_' also. */
3681 for (; *extra
; extra
++)
3682 bitset_set (sbcset
, *extra
);
3684 /* If it is non-matching list. */
3686 bitset_not (sbcset
);
3688 #ifdef RE_ENABLE_I18N
3689 /* Ensure only single byte characters are set. */
3690 if (dfa
->mb_cur_max
> 1)
3691 bitset_mask (sbcset
, dfa
->sb_char
);
3694 /* Build a tree for simple bracket. */
3695 br_token
.type
= SIMPLE_BRACKET
;
3696 br_token
.opr
.sbcset
= sbcset
;
3697 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3698 if (BE (tree
== NULL
, 0))
3699 goto build_word_op_espace
;
3701 #ifdef RE_ENABLE_I18N
3702 if (dfa
->mb_cur_max
> 1)
3704 bin_tree_t
*mbc_tree
;
3705 /* Build a tree for complex bracket. */
3706 br_token
.type
= COMPLEX_BRACKET
;
3707 br_token
.opr
.mbcset
= mbcset
;
3708 dfa
->has_mb_node
= 1;
3709 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3710 if (BE (mbc_tree
== NULL
, 0))
3711 goto build_word_op_espace
;
3712 /* Then join them by ALT node. */
3713 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3714 if (BE (mbc_tree
!= NULL
, 1))
3719 free_charset (mbcset
);
3722 #else /* not RE_ENABLE_I18N */
3724 #endif /* not RE_ENABLE_I18N */
3726 build_word_op_espace
:
3728 #ifdef RE_ENABLE_I18N
3729 free_charset (mbcset
);
3730 #endif /* RE_ENABLE_I18N */
3735 /* This is intended for the expressions like "a{1,3}".
3736 Fetch a number from `input', and return the number.
3737 Return -1, if the number field is empty like "{,1}".
3738 Return -2, If an error is occured. */
3741 fetch_number (input
, token
, syntax
)
3744 reg_syntax_t syntax
;
3750 fetch_token (token
, input
, syntax
);
3752 if (BE (token
->type
== END_OF_RE
, 0))
3754 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3756 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3757 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3758 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3763 #ifdef RE_ENABLE_I18N
3765 free_charset (re_charset_t
*cset
)
3767 re_free (cset
->mbchars
);
3769 re_free (cset
->coll_syms
);
3770 re_free (cset
->equiv_classes
);
3771 re_free (cset
->range_starts
);
3772 re_free (cset
->range_ends
);
3774 re_free (cset
->char_classes
);
3777 #endif /* RE_ENABLE_I18N */
3779 /* Functions for binary tree operation. */
3781 /* Create a tree node. */
3784 create_tree (dfa
, left
, right
, type
)
3788 re_token_type_t type
;
3792 return create_token_tree (dfa
, left
, right
, &t
);
3796 create_token_tree (dfa
, left
, right
, token
)
3800 const re_token_t
*token
;
3803 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3805 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3807 if (storage
== NULL
)
3809 storage
->next
= dfa
->str_tree_storage
;
3810 dfa
->str_tree_storage
= storage
;
3811 dfa
->str_tree_storage_idx
= 0;
3813 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3815 tree
->parent
= NULL
;
3817 tree
->right
= right
;
3818 tree
->token
= *token
;
3819 tree
->token
.duplicated
= 0;
3820 tree
->token
.opt_subexp
= 0;
3823 tree
->node_idx
= -1;
3826 left
->parent
= tree
;
3828 right
->parent
= tree
;
3832 /* Mark the tree SRC as an optional subexpression.
3833 To be called from preorder or postorder. */
3835 static reg_errcode_t
3836 mark_opt_subexp (extra
, node
)
3840 int idx
= (int) (long) extra
;
3841 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3842 node
->token
.opt_subexp
= 1;
3847 /* Free the allocated memory inside NODE. */
3850 free_token (re_token_t
*node
)
3852 #ifdef RE_ENABLE_I18N
3853 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3854 free_charset (node
->opr
.mbcset
);
3856 #endif /* RE_ENABLE_I18N */
3857 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3858 re_free (node
->opr
.sbcset
);
3861 /* Worker function for tree walking. Free the allocated memory inside NODE
3862 and its children. */
3864 static reg_errcode_t
3865 free_tree (void *extra
, bin_tree_t
*node
)
3867 free_token (&node
->token
);
3872 /* Duplicate the node SRC, and return new node. This is a preorder
3873 visit similar to the one implemented by the generic visitor, but
3874 we need more infrastructure to maintain two parallel trees --- so,
3875 it's easier to duplicate. */
3878 duplicate_tree (root
, dfa
)
3879 const bin_tree_t
*root
;
3882 const bin_tree_t
*node
;
3883 bin_tree_t
*dup_root
;
3884 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3886 for (node
= root
; ; )
3888 /* Create a new tree and link it back to the current parent. */
3889 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3892 (*p_new
)->parent
= dup_node
;
3893 (*p_new
)->token
.duplicated
= 1;
3896 /* Go to the left node, or up and to the right. */
3900 p_new
= &dup_node
->left
;
3904 const bin_tree_t
*prev
= NULL
;
3905 while (node
->right
== prev
|| node
->right
== NULL
)
3908 node
= node
->parent
;
3909 dup_node
= dup_node
->parent
;
3914 p_new
= &dup_node
->right
;