1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 struct subexp_optimize
42 static bin_tree_t
*optimize_subexps (struct subexp_optimize
*so
,
43 bin_tree_t
*node
, int sidx
, int depth
);
44 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
45 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
46 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
47 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
48 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
49 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
50 int top_clone_node
, int root_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
53 unsigned int constraint
);
54 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
55 unsigned int constraint
);
56 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
57 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
59 static void calc_inveclosure (re_dfa_t
*dfa
);
60 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
62 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
64 static int peek_token (re_token_t
*token
, re_string_t
*input
,
66 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
68 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
69 reg_syntax_t syntax
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
80 re_token_t
*token
, reg_syntax_t syntax
,
81 int nest
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
83 re_dfa_t
*dfa
, re_token_t
*token
,
84 reg_syntax_t syntax
, reg_errcode_t
*err
);
85 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
86 re_token_t
*token
, reg_syntax_t syntax
,
88 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
90 re_token_t
*token
, int token_len
,
94 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
100 re_charset_t
*mbcset
, int *range_alloc
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 re_charset_t
*mbcset
,
106 const unsigned char *name
);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
109 bracket_elem_t
*start_elem
,
110 bracket_elem_t
*end_elem
);
111 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
112 const unsigned char *name
);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
117 re_charset_t
*mbcset
,
118 int *equiv_class_alloc
,
119 const unsigned char *name
);
120 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
121 re_bitset_ptr_t sbcset
,
122 re_charset_t
*mbcset
,
123 int *char_class_alloc
,
124 const unsigned char *class_name
,
125 reg_syntax_t syntax
);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
128 const unsigned char *name
);
129 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
130 re_bitset_ptr_t sbcset
,
131 const unsigned char *class_name
,
132 reg_syntax_t syntax
);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
135 unsigned RE_TRANSLATE_TYPE trans
,
136 const unsigned char *class_name
,
137 const unsigned char *extra
,
138 int non_match
, reg_errcode_t
*err
);
139 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
140 bin_tree_t
*left
, bin_tree_t
*right
,
141 re_token_type_t type
, int index
);
142 static bin_tree_t
*re_dfa_add_tree_node (re_dfa_t
*dfa
,
143 bin_tree_t
*left
, bin_tree_t
*right
,
144 const re_token_t
*token
)
145 __attribute ((noinline
));
146 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
147 static void mark_opt_subexp (const bin_tree_t
*src
, re_dfa_t
*dfa
);
148 static void mark_opt_subexp_iter (const bin_tree_t
*src
, re_dfa_t
*dfa
, int idx
);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid
[] attribute_hidden
=
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx
[] attribute_hidden
=
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (pattern
, length
, bufp
)
243 struct re_pattern_buffer
*bufp
;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
252 /* Match anchors at newline. */
253 bufp
->newline_anchor
= 1;
255 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
259 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
262 weak_alias (__re_compile_pattern
, re_compile_pattern
)
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options
;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
281 re_set_syntax (syntax
)
284 reg_syntax_t ret
= re_syntax_options
;
286 re_syntax_options
= syntax
;
290 weak_alias (__re_set_syntax
, re_set_syntax
)
294 re_compile_fastmap (bufp
)
295 struct re_pattern_buffer
*bufp
;
297 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
298 char *fastmap
= bufp
->fastmap
;
300 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
301 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
302 if (dfa
->init_state
!= dfa
->init_state_word
)
303 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_nl
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
308 bufp
->fastmap_accurate
= 1;
312 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
316 __attribute ((always_inline
))
317 re_set_fastmap (char *fastmap
, int icase
, int ch
)
321 fastmap
[tolower (ch
)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
328 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
330 const re_dfastate_t
*init_state
;
333 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
335 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
336 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
338 int node
= init_state
->nodes
.elems
[node_cnt
];
339 re_token_type_t type
= dfa
->nodes
[node
].type
;
341 if (type
== CHARACTER
)
343 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
347 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
352 *p
++ = dfa
->nodes
[node
].opr
.c
;
353 while (++node
< dfa
->nodes_len
354 && dfa
->nodes
[node
].type
== CHARACTER
355 && dfa
->nodes
[node
].mb_partial
)
356 *p
++ = dfa
->nodes
[node
].opr
.c
;
357 memset (&state
, 0, sizeof (state
));
358 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
360 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
361 re_set_fastmap (fastmap
, 0, buf
[0]);
365 else if (type
== SIMPLE_BRACKET
)
368 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
369 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
370 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
371 re_set_fastmap (fastmap
, icase
, ch
);
373 #ifdef RE_ENABLE_I18N
374 else if (type
== COMPLEX_BRACKET
)
377 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
378 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
379 || cset
->nranges
|| cset
->nchar_classes
)
382 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
391 const int32_t *table
= (const int32_t *)
392 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
393 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
394 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
396 re_set_fastmap (fastmap
, icase
, ch
);
399 if (dfa
->mb_cur_max
> 1)
400 for (i
= 0; i
< SBC_MAX
; ++i
)
401 if (__btowc (i
) == WEOF
)
402 re_set_fastmap (fastmap
, icase
, i
);
403 # endif /* not _LIBC */
405 for (i
= 0; i
< cset
->nmbchars
; ++i
)
409 memset (&state
, '\0', sizeof (state
));
410 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
411 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
412 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
414 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
415 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
419 #endif /* RE_ENABLE_I18N */
420 else if (type
== OP_PERIOD
421 #ifdef RE_ENABLE_I18N
422 || type
== OP_UTF8_PERIOD
423 #endif /* RE_ENABLE_I18N */
424 || type
== END_OF_RE
)
426 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
427 if (type
== END_OF_RE
)
428 bufp
->can_be_null
= 1;
434 /* Entry point for POSIX code. */
435 /* regcomp takes a regular expression as a string and compiles it.
437 PREG is a regex_t *. We do not expect any fields to be initialized,
438 since POSIX says we shouldn't. Thus, we set
440 `buffer' to the compiled pattern;
441 `used' to the length of the compiled pattern;
442 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
443 REG_EXTENDED bit in CFLAGS is set; otherwise, to
444 RE_SYNTAX_POSIX_BASIC;
445 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
446 `fastmap' to an allocated space for the fastmap;
447 `fastmap_accurate' to zero;
448 `re_nsub' to the number of subexpressions in PATTERN.
450 PATTERN is the address of the pattern string.
452 CFLAGS is a series of bits which affect compilation.
454 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
455 use POSIX basic syntax.
457 If REG_NEWLINE is set, then . and [^...] don't match newline.
458 Also, regexec will try a match beginning after every newline.
460 If REG_ICASE is set, then we considers upper- and lowercase
461 versions of letters to be equivalent when matching.
463 If REG_NOSUB is set, then when PREG is passed to regexec, that
464 routine will report only success or failure, and nothing about the
467 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
468 the return codes and their meanings.) */
471 regcomp (preg
, pattern
, cflags
)
472 regex_t
*__restrict preg
;
473 const char *__restrict pattern
;
477 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
478 : RE_SYNTAX_POSIX_BASIC
);
484 /* Try to allocate space for the fastmap. */
485 preg
->fastmap
= re_malloc (char, SBC_MAX
);
486 if (BE (preg
->fastmap
== NULL
, 0))
489 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
491 /* If REG_NEWLINE is set, newlines are treated differently. */
492 if (cflags
& REG_NEWLINE
)
493 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
494 syntax
&= ~RE_DOT_NEWLINE
;
495 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
496 /* It also changes the matching behavior. */
497 preg
->newline_anchor
= 1;
500 preg
->newline_anchor
= 0;
501 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
502 preg
->translate
= NULL
;
504 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
506 /* POSIX doesn't distinguish between an unmatched open-group and an
507 unmatched close-group: both are REG_EPAREN. */
508 if (ret
== REG_ERPAREN
)
511 /* We have already checked preg->fastmap != NULL. */
512 if (BE (ret
== REG_NOERROR
, 1))
513 /* Compute the fastmap now, since regexec cannot modify the pattern
514 buffer. This function never fails in this implementation. */
515 (void) re_compile_fastmap (preg
);
518 /* Some error occurred while compiling the expression. */
519 re_free (preg
->fastmap
);
520 preg
->fastmap
= NULL
;
526 weak_alias (__regcomp
, regcomp
)
529 /* Returns a message corresponding to an error code, ERRCODE, returned
530 from either regcomp or regexec. We don't use PREG here. */
533 regerror (errcode
, preg
, errbuf
, errbuf_size
)
543 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
544 / sizeof (__re_error_msgid_idx
[0])), 0))
545 /* Only error codes returned by the rest of the code should be passed
546 to this routine. If we are given anything else, or if other regex
547 code generates an invalid error code, then the program has a bug.
548 Dump core so we can fix it. */
551 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
553 msg_size
= strlen (msg
) + 1; /* Includes the null. */
555 if (BE (errbuf_size
!= 0, 1))
557 if (BE (msg_size
> errbuf_size
, 0))
559 #if defined HAVE_MEMPCPY || defined _LIBC
560 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
562 memcpy (errbuf
, msg
, errbuf_size
- 1);
563 errbuf
[errbuf_size
- 1] = 0;
567 memcpy (errbuf
, msg
, msg_size
);
573 weak_alias (__regerror
, regerror
)
577 #ifdef RE_ENABLE_I18N
578 /* This static array is used for the map to single-byte characters when
579 UTF-8 is used. Otherwise we would allocate memory just to initialize
580 it the same all the time. UTF-8 is the preferred encoding so this is
581 a worthwhile optimization. */
582 static const bitset utf8_sb_map
=
584 /* Set the first 128 bits. */
585 # if UINT_MAX == 0xffffffff
586 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
588 # error "Add case for new unsigned int size"
595 free_dfa_content (re_dfa_t
*dfa
)
600 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
602 re_token_t
*node
= dfa
->nodes
+ i
;
603 #ifdef RE_ENABLE_I18N
604 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
605 free_charset (node
->opr
.mbcset
);
607 #endif /* RE_ENABLE_I18N */
608 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
609 re_free (node
->opr
.sbcset
);
611 re_free (dfa
->nexts
);
612 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
614 if (dfa
->eclosures
!= NULL
)
615 re_node_set_free (dfa
->eclosures
+ i
);
616 if (dfa
->inveclosures
!= NULL
)
617 re_node_set_free (dfa
->inveclosures
+ i
);
618 if (dfa
->edests
!= NULL
)
619 re_node_set_free (dfa
->edests
+ i
);
621 re_free (dfa
->edests
);
622 re_free (dfa
->eclosures
);
623 re_free (dfa
->inveclosures
);
624 re_free (dfa
->nodes
);
626 if (dfa
->state_table
)
627 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
629 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
630 for (j
= 0; j
< entry
->num
; ++j
)
632 re_dfastate_t
*state
= entry
->array
[j
];
635 re_free (entry
->array
);
637 re_free (dfa
->state_table
);
638 #ifdef RE_ENABLE_I18N
639 if (dfa
->sb_char
!= utf8_sb_map
)
640 re_free (dfa
->sb_char
);
642 re_free (dfa
->subexp_map
);
644 re_free (dfa
->re_str
);
651 /* Free dynamically allocated space used by PREG. */
657 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
658 if (BE (dfa
!= NULL
, 1))
659 free_dfa_content (dfa
);
663 re_free (preg
->fastmap
);
664 preg
->fastmap
= NULL
;
666 re_free (preg
->translate
);
667 preg
->translate
= NULL
;
670 weak_alias (__regfree
, regfree
)
673 /* Entry points compatible with 4.2 BSD regex library. We don't define
674 them unless specifically requested. */
676 #if defined _REGEX_RE_COMP || defined _LIBC
678 /* BSD has one and only one pattern buffer. */
679 static struct re_pattern_buffer re_comp_buf
;
683 /* Make these definitions weak in libc, so POSIX programs can redefine
684 these names if they don't use our functions, and still use
685 regcomp/regexec above without link errors. */
696 if (!re_comp_buf
.buffer
)
697 return gettext ("No previous regular expression");
701 if (re_comp_buf
.buffer
)
703 fastmap
= re_comp_buf
.fastmap
;
704 re_comp_buf
.fastmap
= NULL
;
705 __regfree (&re_comp_buf
);
706 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
707 re_comp_buf
.fastmap
= fastmap
;
710 if (re_comp_buf
.fastmap
== NULL
)
712 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
713 if (re_comp_buf
.fastmap
== NULL
)
714 return (char *) gettext (__re_error_msgid
715 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
718 /* Since `re_exec' always passes NULL for the `regs' argument, we
719 don't need to initialize the pattern buffer fields which affect it. */
721 /* Match anchors at newlines. */
722 re_comp_buf
.newline_anchor
= 1;
724 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
729 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
730 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
734 libc_freeres_fn (free_mem
)
736 __regfree (&re_comp_buf
);
740 #endif /* _REGEX_RE_COMP */
742 /* Internal entry point.
743 Compile the regular expression PATTERN, whose length is LENGTH.
744 SYNTAX indicate regular expression's syntax. */
747 re_compile_internal (preg
, pattern
, length
, syntax
)
749 const char * pattern
;
753 reg_errcode_t err
= REG_NOERROR
;
757 /* Initialize the pattern buffer. */
758 preg
->fastmap_accurate
= 0;
759 preg
->syntax
= syntax
;
760 preg
->not_bol
= preg
->not_eol
= 0;
763 preg
->can_be_null
= 0;
764 preg
->regs_allocated
= REGS_UNALLOCATED
;
766 /* Initialize the dfa. */
767 dfa
= (re_dfa_t
*) preg
->buffer
;
768 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
777 preg
->allocated
= sizeof (re_dfa_t
);
778 preg
->buffer
= (unsigned char *) dfa
;
780 preg
->used
= sizeof (re_dfa_t
);
782 err
= init_dfa (dfa
, length
);
783 if (BE (err
!= REG_NOERROR
, 0))
785 free_dfa_content (dfa
);
791 dfa
->re_str
= re_malloc (char, length
+ 1);
792 strncpy (dfa
->re_str
, pattern
, length
+ 1);
795 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
796 syntax
& RE_ICASE
, dfa
);
797 if (BE (err
!= REG_NOERROR
, 0))
799 re_compile_internal_free_return
:
800 free_workarea_compile (preg
);
801 re_string_destruct (®exp
);
802 free_dfa_content (dfa
);
808 /* Parse the regular expression, and build a structure tree. */
810 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
811 if (BE (dfa
->str_tree
== NULL
, 0))
812 goto re_compile_internal_free_return
;
814 #ifdef RE_ENABLE_I18N
815 /* If possible, do searching in single byte encoding to speed things up. */
816 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
820 if (preg
->re_nsub
> 0)
822 struct subexp_optimize so
;
825 so
.nodes
= dfa
->nodes
;
826 so
.no_sub
= preg
->no_sub
;
827 so
.re_nsub
= preg
->re_nsub
;
828 dfa
->str_tree
= optimize_subexps (&so
, dfa
->str_tree
, -1, 0);
831 /* Analyze the tree and collect information which is necessary to
834 if (BE (err
!= REG_NOERROR
, 0))
835 goto re_compile_internal_free_return
;
837 /* Then create the initial state of the dfa. */
838 err
= create_initial_state (dfa
);
840 /* Release work areas. */
841 free_workarea_compile (preg
);
842 re_string_destruct (®exp
);
844 if (BE (err
!= REG_NOERROR
, 0))
846 free_dfa_content (dfa
);
854 /* Initialize DFA. We use the length of the regular expression PAT_LEN
855 as the initial length of some arrays. */
858 init_dfa (dfa
, pat_len
)
867 memset (dfa
, '\0', sizeof (re_dfa_t
));
869 /* Force allocation of str_tree_storage the first time. */
870 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
872 dfa
->nodes_alloc
= pat_len
+ 1;
873 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
875 dfa
->states_alloc
= pat_len
+ 1;
877 /* table_size = 2 ^ ceil(log pat_len) */
878 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
879 if (table_size
> pat_len
)
882 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
883 dfa
->state_hash_mask
= table_size
- 1;
885 dfa
->mb_cur_max
= MB_CUR_MAX
;
887 if (dfa
->mb_cur_max
== 6
888 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
890 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
893 # ifdef HAVE_LANGINFO_CODESET
894 codeset_name
= nl_langinfo (CODESET
);
896 codeset_name
= getenv ("LC_ALL");
897 if (codeset_name
== NULL
|| codeset
[0] == '\0')
898 codeset_name
= getenv ("LC_CTYPE");
899 if (codeset_name
== NULL
|| codeset
[0] == '\0')
900 codeset_name
= getenv ("LANG");
901 if (codeset_name
== NULL
)
903 else if (strchr (codeset_name
, '.') != NULL
)
904 codeset_name
= strchr (codeset_name
, '.') + 1;
907 if (strcasecmp (codeset_name
, "UTF-8") == 0
908 || strcasecmp (codeset_name
, "UTF8") == 0)
911 /* We check exhaustively in the loop below if this charset is a
912 superset of ASCII. */
913 dfa
->map_notascii
= 0;
916 #ifdef RE_ENABLE_I18N
917 if (dfa
->mb_cur_max
> 1)
920 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
925 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
926 if (BE (dfa
->sb_char
== NULL
, 0))
929 /* Clear all bits by, then set those corresponding to single
931 bitset_empty (dfa
->sb_char
);
933 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
934 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
936 wchar_t wch
= __btowc (ch
);
938 dfa
->sb_char
[i
] |= 1 << j
;
940 if (isascii (ch
) && wch
!= (wchar_t) ch
)
941 dfa
->map_notascii
= 1;
948 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
953 /* Initialize WORD_CHAR table, which indicate which character is
954 "word". In this case "word" means that it is the word construction
955 character used by some operators like "\<", "\>", etc. */
962 dfa
->word_ops_used
= 1;
963 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
964 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
965 if (isalnum (ch
) || ch
== '_')
966 dfa
->word_char
[i
] |= 1 << j
;
969 /* Free the work area which are only used while compiling. */
972 free_workarea_compile (preg
)
975 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
976 bin_tree_storage_t
*storage
, *next
;
977 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
979 next
= storage
->next
;
982 dfa
->str_tree_storage
= NULL
;
983 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
984 dfa
->str_tree
= NULL
;
985 re_free (dfa
->org_indices
);
986 dfa
->org_indices
= NULL
;
989 /* Create initial states for all contexts. */
992 create_initial_state (dfa
)
997 re_node_set init_nodes
;
999 /* Initial states have the epsilon closure of the node which is
1000 the first node of the regular expression. */
1001 first
= dfa
->str_tree
->first
;
1002 dfa
->init_node
= first
;
1003 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1004 if (BE (err
!= REG_NOERROR
, 0))
1007 /* The back-references which are in initial states can epsilon transit,
1008 since in this case all of the subexpressions can be null.
1009 Then we add epsilon closures of the nodes which are the next nodes of
1010 the back-references. */
1011 if (dfa
->nbackref
> 0)
1012 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1014 int node_idx
= init_nodes
.elems
[i
];
1015 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1018 if (type
!= OP_BACK_REF
)
1020 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1022 re_token_t
*clexp_node
;
1023 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1024 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1025 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1028 if (clexp_idx
== init_nodes
.nelem
)
1031 if (type
== OP_BACK_REF
)
1033 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1034 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1036 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1042 /* It must be the first time to invoke acquire_state. */
1043 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1044 /* We don't check ERR here, since the initial state must not be NULL. */
1045 if (BE (dfa
->init_state
== NULL
, 0))
1047 if (dfa
->init_state
->has_constraint
)
1049 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1051 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1053 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1057 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1058 || dfa
->init_state_begbuf
== NULL
, 0))
1062 dfa
->init_state_word
= dfa
->init_state_nl
1063 = dfa
->init_state_begbuf
= dfa
->init_state
;
1065 re_node_set_free (&init_nodes
);
1069 #ifdef RE_ENABLE_I18N
1070 /* If it is possible to do searching in single byte encoding instead of UTF-8
1071 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1072 DFA nodes where needed. */
1078 int node
, i
, mb_chars
= 0, has_period
= 0;
1080 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1081 switch (dfa
->nodes
[node
].type
)
1084 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1088 switch (dfa
->nodes
[node
].opr
.idx
)
1096 /* Word anchors etc. cannot be handled. */
1106 case OP_DUP_ASTERISK
:
1107 case OP_DUP_QUESTION
:
1108 case OP_OPEN_SUBEXP
:
1109 case OP_CLOSE_SUBEXP
:
1111 case SIMPLE_BRACKET
:
1112 /* Just double check. */
1113 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1114 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1121 if (mb_chars
|| has_period
)
1122 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1124 if (dfa
->nodes
[node
].type
== CHARACTER
1125 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1126 dfa
->nodes
[node
].mb_partial
= 0;
1127 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1128 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1131 /* The search can be in single byte locale. */
1132 dfa
->mb_cur_max
= 1;
1134 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1139 optimize_subexps (so
, node
, sidx
, depth
)
1140 struct subexp_optimize
*so
;
1144 int idx
, new_depth
, new_sidx
;
1151 if ((depth
& 1) && node
->type
== CONCAT
1152 && node
->right
&& node
->right
->type
== 0
1153 && so
->nodes
[idx
= node
->right
->node_idx
].type
== OP_CLOSE_SUBEXP
)
1155 new_depth
= depth
+ 1;
1157 || (so
->nodes
[idx
].opr
.idx
< 8 * sizeof (so
->dfa
->used_bkref_map
)
1158 && so
->dfa
->used_bkref_map
& (1 << so
->nodes
[idx
].opr
.idx
)))
1159 new_sidx
= so
->nodes
[idx
].opr
.idx
;
1161 node
->left
= optimize_subexps (so
, node
->left
, new_sidx
, new_depth
);
1162 new_depth
= (depth
& 1) == 0 && node
->type
== CONCAT
1163 && node
->left
&& node
->left
->type
== 0
1164 && so
->nodes
[node
->left
->node_idx
].type
== OP_OPEN_SUBEXP
1166 node
->right
= optimize_subexps (so
, node
->right
, sidx
, new_depth
);
1168 if (node
->type
!= CONCAT
)
1170 if ((depth
& 1) == 0
1172 && node
->left
->type
== 0
1173 && so
->nodes
[idx
= node
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
1175 else if ((depth
& 1)
1177 && node
->right
->type
== 0
1178 && so
->nodes
[idx
= node
->right
->node_idx
].type
== OP_CLOSE_SUBEXP
)
1183 if (so
->nodes
[idx
].opr
.idx
< 8 * sizeof (so
->dfa
->used_bkref_map
)
1184 && so
->dfa
->used_bkref_map
& (1 << so
->nodes
[idx
].opr
.idx
))
1194 if (so
->dfa
->subexp_map
== NULL
)
1196 so
->dfa
->subexp_map
= re_malloc (int, so
->re_nsub
);
1197 if (so
->dfa
->subexp_map
== NULL
)
1200 for (i
= 0; i
< so
->re_nsub
; i
++)
1201 so
->dfa
->subexp_map
[i
] = i
;
1204 i
= so
->nodes
[idx
].opr
.idx
;
1206 so
->dfa
->subexp_map
[i
] = sidx
;
1209 so
->nodes
[idx
].type
= OP_DELETED_SUBEXP
;
1210 ret
->parent
= node
->parent
;
1214 /* Analyze the structure tree, and calculate "first", "next", "edest",
1215 "eclosure", and "inveclosure". */
1217 static reg_errcode_t
1224 /* Allocate arrays. */
1225 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1226 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1227 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1228 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1229 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1230 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1231 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1233 /* Initialize them. */
1234 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1237 re_node_set_init_empty (dfa
->edests
+ i
);
1238 re_node_set_init_empty (dfa
->eclosures
+ i
);
1239 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1242 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1243 if (BE (ret
== REG_NOERROR
, 1))
1245 ret
= calc_eclosure (dfa
);
1246 if (ret
== REG_NOERROR
)
1247 calc_inveclosure (dfa
);
1252 /* Helper functions for analyze.
1253 This function calculate "first", "next", and "edest" for the subtree
1254 whose root is NODE. */
1256 static reg_errcode_t
1257 analyze_tree (dfa
, node
)
1262 if (node
->first
== -1)
1263 calc_first (dfa
, node
);
1264 if (node
->next
== -1)
1265 calc_next (dfa
, node
);
1266 calc_epsdest (dfa
, node
);
1268 /* Calculate "first" etc. for the left child. */
1269 if (node
->left
!= NULL
)
1271 ret
= analyze_tree (dfa
, node
->left
);
1272 if (BE (ret
!= REG_NOERROR
, 0))
1275 /* Calculate "first" etc. for the right child. */
1276 if (node
->right
!= NULL
)
1278 ret
= analyze_tree (dfa
, node
->right
);
1279 if (BE (ret
!= REG_NOERROR
, 0))
1285 /* Calculate "first" for the node NODE. */
1287 calc_first (dfa
, node
)
1292 idx
= node
->node_idx
;
1293 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1298 case OP_OPEN_BRACKET
:
1299 case OP_CLOSE_BRACKET
:
1300 case OP_OPEN_DUP_NUM
:
1301 case OP_CLOSE_DUP_NUM
:
1303 case OP_NON_MATCH_LIST
:
1304 case OP_OPEN_COLL_ELEM
:
1305 case OP_CLOSE_COLL_ELEM
:
1306 case OP_OPEN_EQUIV_CLASS
:
1307 case OP_CLOSE_EQUIV_CLASS
:
1308 case OP_OPEN_CHAR_CLASS
:
1309 case OP_CLOSE_CHAR_CLASS
:
1310 /* These must not appear here. */
1316 case OP_DUP_ASTERISK
:
1317 case OP_DUP_QUESTION
:
1318 #ifdef RE_ENABLE_I18N
1319 case OP_UTF8_PERIOD
:
1320 case COMPLEX_BRACKET
:
1321 #endif /* RE_ENABLE_I18N */
1322 case SIMPLE_BRACKET
:
1325 case OP_OPEN_SUBEXP
:
1326 case OP_CLOSE_SUBEXP
:
1332 /* else fall through */
1335 assert (node
->left
!= NULL
);
1337 if (node
->left
->first
== -1)
1338 calc_first (dfa
, node
->left
);
1339 node
->first
= node
->left
->first
;
1344 /* Calculate "next" for the node NODE. */
1347 calc_next (dfa
, node
)
1352 bin_tree_t
*parent
= node
->parent
;
1356 idx
= node
->node_idx
;
1357 if (node
->type
== 0)
1358 dfa
->nexts
[idx
] = node
->next
;
1362 idx
= parent
->node_idx
;
1363 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1367 case OP_DUP_ASTERISK
:
1371 if (parent
->left
== node
)
1373 if (parent
->right
->first
== -1)
1374 calc_first (dfa
, parent
->right
);
1375 node
->next
= parent
->right
->first
;
1378 /* else fall through */
1380 if (parent
->next
== -1)
1381 calc_next (dfa
, parent
);
1382 node
->next
= parent
->next
;
1385 idx
= node
->node_idx
;
1386 if (node
->type
== 0)
1387 dfa
->nexts
[idx
] = node
->next
;
1390 /* Calculate "edest" for the node NODE. */
1393 calc_epsdest (dfa
, node
)
1398 idx
= node
->node_idx
;
1399 if (node
->type
== 0)
1401 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1402 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1404 if (node
->left
->first
== -1)
1405 calc_first (dfa
, node
->left
);
1406 if (node
->next
== -1)
1407 calc_next (dfa
, node
);
1408 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1411 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1414 if (node
->left
!= NULL
)
1416 if (node
->left
->first
== -1)
1417 calc_first (dfa
, node
->left
);
1418 left
= node
->left
->first
;
1422 if (node
->next
== -1)
1423 calc_next (dfa
, node
);
1426 if (node
->right
!= NULL
)
1428 if (node
->right
->first
== -1)
1429 calc_first (dfa
, node
->right
);
1430 right
= node
->right
->first
;
1434 if (node
->next
== -1)
1435 calc_next (dfa
, node
);
1438 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1440 else if (dfa
->nodes
[idx
].type
== ANCHOR
1441 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1442 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1443 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1444 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1446 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1450 /* Duplicate the epsilon closure of the node ROOT_NODE.
1451 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1452 to their own constraint. */
1454 static reg_errcode_t
1455 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1458 int top_org_node
, top_clone_node
, root_node
;
1459 unsigned int init_constraint
;
1462 int org_node
, clone_node
, ret
;
1463 unsigned int constraint
= init_constraint
;
1464 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1466 int org_dest
, clone_dest
;
1467 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1469 /* If the back reference epsilon-transit, its destination must
1470 also have the constraint. Then duplicate the epsilon closure
1471 of the destination of the back reference, and store it in
1472 edests of the back reference. */
1473 org_dest
= dfa
->nexts
[org_node
];
1474 re_node_set_empty (dfa
->edests
+ clone_node
);
1475 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1476 if (BE (err
!= REG_NOERROR
, 0))
1478 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1479 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1480 if (BE (ret
< 0, 0))
1483 else if (dfa
->edests
[org_node
].nelem
== 0)
1485 /* In case of the node can't epsilon-transit, don't duplicate the
1486 destination and store the original destination as the
1487 destination of the node. */
1488 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1491 else if (dfa
->edests
[org_node
].nelem
== 1)
1493 /* In case of the node can epsilon-transit, and it has only one
1495 org_dest
= dfa
->edests
[org_node
].elems
[0];
1496 re_node_set_empty (dfa
->edests
+ clone_node
);
1497 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1499 /* In case of the node has another constraint, append it. */
1500 if (org_node
== root_node
&& clone_node
!= org_node
)
1502 /* ...but if the node is root_node itself, it means the
1503 epsilon closure have a loop, then tie it to the
1504 destination of the root_node. */
1505 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1507 if (BE (ret
< 0, 0))
1511 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1513 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1514 if (BE (err
!= REG_NOERROR
, 0))
1516 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1517 if (BE (ret
< 0, 0))
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. E.g. '|', '*', '+', '?'. */
1524 org_dest
= dfa
->edests
[org_node
].elems
[0];
1525 re_node_set_empty (dfa
->edests
+ clone_node
);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1528 if (clone_dest
== -1)
1530 /* There are no such a duplicated node, create a new one. */
1531 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1532 if (BE (err
!= REG_NOERROR
, 0))
1534 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1535 if (BE (ret
< 0, 0))
1537 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1538 root_node
, constraint
);
1539 if (BE (err
!= REG_NOERROR
, 0))
1544 /* There are a duplicated node which satisfy the constraint,
1545 use it to avoid infinite loop. */
1546 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1547 if (BE (ret
< 0, 0))
1551 org_dest
= dfa
->edests
[org_node
].elems
[1];
1552 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1553 if (BE (err
!= REG_NOERROR
, 0))
1555 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1556 if (BE (ret
< 0, 0))
1559 org_node
= org_dest
;
1560 clone_node
= clone_dest
;
1565 /* Search for a node which is duplicated from the node ORG_NODE, and
1566 satisfies the constraint CONSTRAINT. */
1569 search_duplicated_node (dfa
, org_node
, constraint
)
1572 unsigned int constraint
;
1575 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1577 if (org_node
== dfa
->org_indices
[idx
]
1578 && constraint
== dfa
->nodes
[idx
].constraint
)
1579 return idx
; /* Found. */
1581 return -1; /* Not found. */
1584 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1585 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1586 otherwise return the error code. */
1588 static reg_errcode_t
1589 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1591 int *new_idx
, org_idx
;
1592 unsigned int constraint
;
1594 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
], 1);
1595 if (BE (dup_idx
== -1, 0))
1597 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1598 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1599 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1600 dfa
->nodes
[dup_idx
].duplicated
= 1;
1601 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1602 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1603 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1605 /* Store the index of the original node. */
1606 dfa
->org_indices
[dup_idx
] = org_idx
;
1612 calc_inveclosure (dfa
)
1616 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1618 if (dfa
->nodes
[src
].type
== OP_DELETED_SUBEXP
)
1620 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1622 dest
= dfa
->eclosures
[src
].elems
[idx
];
1623 re_node_set_insert_last (dfa
->inveclosures
+ dest
, src
);
1628 /* Calculate "eclosure" for all the node in DFA. */
1630 static reg_errcode_t
1634 int node_idx
, incomplete
;
1636 assert (dfa
->nodes_len
> 0);
1639 /* For each nodes, calculate epsilon closure. */
1640 for (node_idx
= 0; ; ++node_idx
)
1643 re_node_set eclosure_elem
;
1644 if (node_idx
== dfa
->nodes_len
)
1653 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1655 if (dfa
->nodes
[node_idx
].type
== OP_DELETED_SUBEXP
)
1658 /* If we have already calculated, skip it. */
1659 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1661 /* Calculate epsilon closure of `node_idx'. */
1662 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1663 if (BE (err
!= REG_NOERROR
, 0))
1666 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1669 re_node_set_free (&eclosure_elem
);
1675 /* Calculate epsilon closure of NODE. */
1677 static reg_errcode_t
1678 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1679 re_node_set
*new_set
;
1684 unsigned int constraint
;
1686 re_node_set eclosure
;
1688 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1689 if (BE (err
!= REG_NOERROR
, 0))
1692 /* This indicates that we are calculating this node now.
1693 We reference this value to avoid infinite loop. */
1694 dfa
->eclosures
[node
].nelem
= -1;
1696 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1697 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1698 /* If the current node has constraints, duplicate all nodes.
1699 Since they must inherit the constraints. */
1701 && dfa
->edests
[node
].nelem
1702 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1704 int org_node
, cur_node
;
1705 org_node
= cur_node
= node
;
1706 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1707 if (BE (err
!= REG_NOERROR
, 0))
1711 /* Expand each epsilon destination nodes. */
1712 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1713 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1715 re_node_set eclosure_elem
;
1716 int edest
= dfa
->edests
[node
].elems
[i
];
1717 /* If calculating the epsilon closure of `edest' is in progress,
1718 return intermediate result. */
1719 if (dfa
->eclosures
[edest
].nelem
== -1)
1724 /* If we haven't calculated the epsilon closure of `edest' yet,
1725 calculate now. Otherwise use calculated epsilon closure. */
1726 if (dfa
->eclosures
[edest
].nelem
== 0)
1728 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1729 if (BE (err
!= REG_NOERROR
, 0))
1733 eclosure_elem
= dfa
->eclosures
[edest
];
1734 /* Merge the epsilon closure of `edest'. */
1735 re_node_set_merge (&eclosure
, &eclosure_elem
);
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa
->eclosures
[edest
].nelem
== 0)
1741 re_node_set_free (&eclosure_elem
);
1745 /* Epsilon closures include itself. */
1746 re_node_set_insert (&eclosure
, node
);
1747 if (incomplete
&& !root
)
1748 dfa
->eclosures
[node
].nelem
= 0;
1750 dfa
->eclosures
[node
] = eclosure
;
1751 *new_set
= eclosure
;
1755 /* Functions for token which are used in the parser. */
1757 /* Fetch a token from INPUT.
1758 We must not use this function inside bracket expressions. */
1761 fetch_token (result
, input
, syntax
)
1764 reg_syntax_t syntax
;
1766 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1773 peek_token (token
, input
, syntax
)
1776 reg_syntax_t syntax
;
1780 if (re_string_eoi (input
))
1782 token
->type
= END_OF_RE
;
1786 c
= re_string_peek_byte (input
, 0);
1789 token
->word_char
= 0;
1790 #ifdef RE_ENABLE_I18N
1791 token
->mb_partial
= 0;
1792 if (input
->mb_cur_max
> 1 &&
1793 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1795 token
->type
= CHARACTER
;
1796 token
->mb_partial
= 1;
1803 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1805 token
->type
= BACK_SLASH
;
1809 c2
= re_string_peek_byte_case (input
, 1);
1811 token
->type
= CHARACTER
;
1812 #ifdef RE_ENABLE_I18N
1813 if (input
->mb_cur_max
> 1)
1815 wint_t wc
= re_string_wchar_at (input
,
1816 re_string_cur_idx (input
) + 1);
1817 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1821 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1826 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1827 token
->type
= OP_ALT
;
1829 case '1': case '2': case '3': case '4': case '5':
1830 case '6': case '7': case '8': case '9':
1831 if (!(syntax
& RE_NO_BK_REFS
))
1833 token
->type
= OP_BACK_REF
;
1834 token
->opr
.idx
= c2
- '1';
1838 if (!(syntax
& RE_NO_GNU_OPS
))
1840 token
->type
= ANCHOR
;
1841 token
->opr
.ctx_type
= WORD_FIRST
;
1845 if (!(syntax
& RE_NO_GNU_OPS
))
1847 token
->type
= ANCHOR
;
1848 token
->opr
.ctx_type
= WORD_LAST
;
1852 if (!(syntax
& RE_NO_GNU_OPS
))
1854 token
->type
= ANCHOR
;
1855 token
->opr
.ctx_type
= WORD_DELIM
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1861 token
->type
= ANCHOR
;
1862 token
->opr
.ctx_type
= INSIDE_WORD
;
1866 if (!(syntax
& RE_NO_GNU_OPS
))
1867 token
->type
= OP_WORD
;
1870 if (!(syntax
& RE_NO_GNU_OPS
))
1871 token
->type
= OP_NOTWORD
;
1874 if (!(syntax
& RE_NO_GNU_OPS
))
1875 token
->type
= OP_SPACE
;
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1879 token
->type
= OP_NOTSPACE
;
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= ANCHOR
;
1885 token
->opr
.ctx_type
= BUF_FIRST
;
1889 if (!(syntax
& RE_NO_GNU_OPS
))
1891 token
->type
= ANCHOR
;
1892 token
->opr
.ctx_type
= BUF_LAST
;
1896 if (!(syntax
& RE_NO_BK_PARENS
))
1897 token
->type
= OP_OPEN_SUBEXP
;
1900 if (!(syntax
& RE_NO_BK_PARENS
))
1901 token
->type
= OP_CLOSE_SUBEXP
;
1904 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1905 token
->type
= OP_DUP_PLUS
;
1908 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1909 token
->type
= OP_DUP_QUESTION
;
1912 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1913 token
->type
= OP_OPEN_DUP_NUM
;
1916 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1917 token
->type
= OP_CLOSE_DUP_NUM
;
1925 token
->type
= CHARACTER
;
1926 #ifdef RE_ENABLE_I18N
1927 if (input
->mb_cur_max
> 1)
1929 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1930 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1934 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1939 if (syntax
& RE_NEWLINE_ALT
)
1940 token
->type
= OP_ALT
;
1943 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1944 token
->type
= OP_ALT
;
1947 token
->type
= OP_DUP_ASTERISK
;
1950 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1951 token
->type
= OP_DUP_PLUS
;
1954 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1955 token
->type
= OP_DUP_QUESTION
;
1958 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1959 token
->type
= OP_OPEN_DUP_NUM
;
1962 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1963 token
->type
= OP_CLOSE_DUP_NUM
;
1966 if (syntax
& RE_NO_BK_PARENS
)
1967 token
->type
= OP_OPEN_SUBEXP
;
1970 if (syntax
& RE_NO_BK_PARENS
)
1971 token
->type
= OP_CLOSE_SUBEXP
;
1974 token
->type
= OP_OPEN_BRACKET
;
1977 token
->type
= OP_PERIOD
;
1980 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1981 re_string_cur_idx (input
) != 0)
1983 char prev
= re_string_peek_byte (input
, -1);
1984 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1987 token
->type
= ANCHOR
;
1988 token
->opr
.ctx_type
= LINE_FIRST
;
1991 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1992 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1995 re_string_skip_bytes (input
, 1);
1996 peek_token (&next
, input
, syntax
);
1997 re_string_skip_bytes (input
, -1);
1998 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2001 token
->type
= ANCHOR
;
2002 token
->opr
.ctx_type
= LINE_LAST
;
2010 /* Peek a token from INPUT, and return the length of the token.
2011 We must not use this function out of bracket expressions. */
2014 peek_token_bracket (token
, input
, syntax
)
2017 reg_syntax_t syntax
;
2020 if (re_string_eoi (input
))
2022 token
->type
= END_OF_RE
;
2025 c
= re_string_peek_byte (input
, 0);
2028 #ifdef RE_ENABLE_I18N
2029 if (input
->mb_cur_max
> 1 &&
2030 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2032 token
->type
= CHARACTER
;
2035 #endif /* RE_ENABLE_I18N */
2037 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2038 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2040 /* In this case, '\' escape a character. */
2042 re_string_skip_bytes (input
, 1);
2043 c2
= re_string_peek_byte (input
, 0);
2045 token
->type
= CHARACTER
;
2048 if (c
== '[') /* '[' is a special char in a bracket exps. */
2052 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2053 c2
= re_string_peek_byte (input
, 1);
2061 token
->type
= OP_OPEN_COLL_ELEM
;
2064 token
->type
= OP_OPEN_EQUIV_CLASS
;
2067 if (syntax
& RE_CHAR_CLASSES
)
2069 token
->type
= OP_OPEN_CHAR_CLASS
;
2072 /* else fall through. */
2074 token
->type
= CHARACTER
;
2084 token
->type
= OP_CHARSET_RANGE
;
2087 token
->type
= OP_CLOSE_BRACKET
;
2090 token
->type
= OP_NON_MATCH_LIST
;
2093 token
->type
= CHARACTER
;
2098 /* Functions for parser. */
2100 /* Entry point of the parser.
2101 Parse the regular expression REGEXP and return the structure tree.
2102 If an error is occured, ERR is set by error code, and return NULL.
2103 This function build the following tree, from regular expression <reg_exp>:
2109 CAT means concatenation.
2110 EOR means end of regular expression. */
2113 parse (regexp
, preg
, syntax
, err
)
2114 re_string_t
*regexp
;
2116 reg_syntax_t syntax
;
2119 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2120 bin_tree_t
*tree
, *eor
, *root
;
2121 re_token_t current_token
;
2122 dfa
->syntax
= syntax
;
2123 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2124 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2125 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2127 eor
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, ¤t_token
);
2129 root
= create_tree (dfa
, tree
, eor
, CONCAT
, 0);
2132 if (BE (eor
== NULL
|| root
== NULL
, 0))
2140 /* This function build the following tree, from regular expression
2141 <branch1>|<branch2>:
2147 ALT means alternative, which represents the operator `|'. */
2150 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2151 re_string_t
*regexp
;
2154 reg_syntax_t syntax
;
2158 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2159 bin_tree_t
*tree
, *branch
= NULL
;
2160 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2161 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2164 while (token
->type
== OP_ALT
)
2166 re_token_t alt_token
= *token
;
2167 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2168 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2169 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2171 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2172 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2177 tree
= re_dfa_add_tree_node (dfa
, tree
, branch
, &alt_token
);
2178 if (BE (tree
== NULL
, 0))
2183 dfa
->has_plural_match
= 1;
2188 /* This function build the following tree, from regular expression
2195 CAT means concatenation. */
2198 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2199 re_string_t
*regexp
;
2202 reg_syntax_t syntax
;
2206 bin_tree_t
*tree
, *exp
;
2207 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2208 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2209 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2212 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2213 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2215 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2216 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2220 if (tree
!= NULL
&& exp
!= NULL
)
2222 tree
= create_tree (dfa
, tree
, exp
, CONCAT
, 0);
2229 else if (tree
== NULL
)
2231 /* Otherwise exp == NULL, we don't need to create new tree. */
2236 /* This function build the following tree, from regular expression a*:
2243 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2244 re_string_t
*regexp
;
2247 reg_syntax_t syntax
;
2251 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2253 switch (token
->type
)
2256 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2257 if (BE (tree
== NULL
, 0))
2262 #ifdef RE_ENABLE_I18N
2263 if (dfa
->mb_cur_max
> 1)
2265 while (!re_string_eoi (regexp
)
2266 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2268 bin_tree_t
*mbc_remain
;
2269 fetch_token (token
, regexp
, syntax
);
2270 mbc_remain
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2271 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
, 0);
2272 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2281 case OP_OPEN_SUBEXP
:
2282 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2283 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2286 case OP_OPEN_BRACKET
:
2287 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2288 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2292 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2297 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2298 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2299 if (BE (tree
== NULL
, 0))
2305 dfa
->has_mb_node
= 1;
2307 case OP_OPEN_DUP_NUM
:
2308 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2314 case OP_DUP_ASTERISK
:
2316 case OP_DUP_QUESTION
:
2317 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2322 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2324 fetch_token (token
, regexp
, syntax
);
2325 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2327 /* else fall through */
2328 case OP_CLOSE_SUBEXP
:
2329 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2330 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2335 /* else fall through */
2336 case OP_CLOSE_DUP_NUM
:
2337 /* We treat it as a normal character. */
2339 /* Then we can these characters as normal characters. */
2340 token
->type
= CHARACTER
;
2341 /* mb_partial and word_char bits should be initialized already
2343 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2344 if (BE (tree
== NULL
, 0))
2351 if ((token
->opr
.ctx_type
2352 & (WORD_DELIM
| INSIDE_WORD
| WORD_FIRST
| WORD_LAST
))
2353 && dfa
->word_ops_used
== 0)
2354 init_word_char (dfa
);
2355 if (token
->opr
.ctx_type
== WORD_DELIM
)
2357 bin_tree_t
*tree_first
, *tree_last
;
2358 token
->opr
.ctx_type
= WORD_FIRST
;
2359 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2360 token
->opr
.ctx_type
= WORD_LAST
;
2361 tree_last
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2362 token
->type
= OP_ALT
;
2363 tree
= re_dfa_add_tree_node (dfa
, tree_first
, tree_last
, token
);
2364 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2372 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2373 if (BE (tree
== NULL
, 0))
2379 /* We must return here, since ANCHORs can't be followed
2380 by repetition operators.
2381 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2382 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2383 fetch_token (token
, regexp
, syntax
);
2386 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2387 if (BE (tree
== NULL
, 0))
2392 if (dfa
->mb_cur_max
> 1)
2393 dfa
->has_mb_node
= 1;
2397 tree
= build_charclass_op (dfa
, regexp
->trans
,
2398 (const unsigned char *) "alnum",
2399 (const unsigned char *) "_",
2400 token
->type
== OP_NOTWORD
, err
);
2401 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2406 tree
= build_charclass_op (dfa
, regexp
->trans
,
2407 (const unsigned char *) "space",
2408 (const unsigned char *) "",
2409 token
->type
== OP_NOTSPACE
, err
);
2410 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2420 /* Must not happen? */
2426 fetch_token (token
, regexp
, syntax
);
2428 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2429 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2431 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2432 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2434 /* In BRE consecutive duplications are not allowed. */
2435 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2436 && (token
->type
== OP_DUP_ASTERISK
2437 || token
->type
== OP_OPEN_DUP_NUM
))
2442 dfa
->has_plural_match
= 1;
2448 /* This function build the following tree, from regular expression
2456 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2457 re_string_t
*regexp
;
2460 reg_syntax_t syntax
;
2464 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2465 bin_tree_t
*tree
, *left_par
, *right_par
;
2467 cur_nsub
= preg
->re_nsub
++;
2469 left_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2470 if (BE (left_par
== NULL
, 0))
2475 dfa
->nodes
[left_par
->node_idx
].opr
.idx
= cur_nsub
;
2476 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2478 /* The subexpression may be a null string. */
2479 if (token
->type
== OP_CLOSE_SUBEXP
)
2483 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2484 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2487 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2492 right_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2493 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2494 tree
= ((tree
== NULL
) ? right_par
2495 : create_tree (dfa
, tree
, right_par
, CONCAT
, 0));
2496 tree
= create_tree (dfa
, left_par
, tree
, CONCAT
, 0);
2497 if (BE (right_par
== NULL
|| tree
== NULL
, 0))
2502 dfa
->nodes
[right_par
->node_idx
].opr
.idx
= cur_nsub
;
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2510 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2512 re_string_t
*regexp
;
2515 reg_syntax_t syntax
;
2518 re_token_t dup_token
;
2519 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2520 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2521 re_token_t start_token
= *token
;
2523 if (token
->type
== OP_OPEN_DUP_NUM
)
2526 start
= fetch_number (regexp
, token
, syntax
);
2529 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2530 start
= 0; /* We treat "{,m}" as "{0,m}". */
2533 *err
= REG_BADBR
; /* <re>{} is invalid. */
2537 if (BE (start
!= -2, 1))
2539 /* We treat "{n}" as "{n,n}". */
2540 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2541 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2542 ? fetch_number (regexp
, token
, syntax
) : -2));
2544 if (BE (start
== -2 || end
== -2, 0))
2546 /* Invalid sequence. */
2547 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2549 if (token
->type
== END_OF_RE
)
2557 /* If the syntax bit is set, rollback. */
2558 re_string_set_index (regexp
, start_idx
);
2559 *token
= start_token
;
2560 token
->type
= CHARACTER
;
2561 /* mb_partial and word_char bits should be already initialized by
2566 if (BE (end
!= -1 && start
> end
, 0))
2568 /* First number greater than second. */
2575 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2576 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2579 fetch_token (token
, regexp
, syntax
);
2581 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2582 if (BE (elem
== NULL
|| (start
== 0 && end
== 0), 0))
2585 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2586 if (BE (start
> 0, 0))
2589 for (i
= 2; i
<= start
; ++i
)
2591 elem
= duplicate_tree (elem
, dfa
);
2592 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2593 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2594 goto parse_dup_op_espace
;
2600 /* Duplicate ELEM before it is marked optional. */
2601 elem
= duplicate_tree (elem
, dfa
);
2607 mark_opt_subexp (elem
, dfa
);
2608 dup_token
.type
= (end
== -1 ? OP_DUP_ASTERISK
: OP_DUP_QUESTION
);
2609 tree
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2610 if (BE (tree
== NULL
, 0))
2611 goto parse_dup_op_espace
;
2613 /* This loop is actually executed only when end != -1,
2614 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2615 already created the start+1-th copy. */
2616 for (i
= start
+ 2; i
<= end
; ++i
)
2618 elem
= duplicate_tree (elem
, dfa
);
2619 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2620 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2621 goto parse_dup_op_espace
;
2623 tree
= re_dfa_add_tree_node (dfa
, tree
, NULL
, &dup_token
);
2624 if (BE (tree
== NULL
, 0))
2625 goto parse_dup_op_espace
;
2629 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
, 0);
2633 parse_dup_op_espace
:
2638 /* Size of the names for collating symbol/equivalence_class/character_class.
2639 I'm not sure, but maybe enough. */
2640 #define BRACKET_NAME_BUF_SIZE 32
2643 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2644 Build the range expression which starts from START_ELEM, and ends
2645 at END_ELEM. The result are written to MBCSET and SBCSET.
2646 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2647 mbcset->range_ends, is a pointer argument sinse we may
2650 static reg_errcode_t
2651 # ifdef RE_ENABLE_I18N
2652 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2653 re_charset_t
*mbcset
;
2655 # else /* not RE_ENABLE_I18N */
2656 build_range_exp (sbcset
, start_elem
, end_elem
)
2657 # endif /* not RE_ENABLE_I18N */
2658 re_bitset_ptr_t sbcset
;
2659 bracket_elem_t
*start_elem
, *end_elem
;
2661 unsigned int start_ch
, end_ch
;
2662 /* Equivalence Classes and Character Classes can't be a range start/end. */
2663 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2664 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2668 /* We can handle no multi character collating elements without libc
2670 if (BE ((start_elem
->type
== COLL_SYM
2671 && strlen ((char *) start_elem
->opr
.name
) > 1)
2672 || (end_elem
->type
== COLL_SYM
2673 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2674 return REG_ECOLLATE
;
2676 # ifdef RE_ENABLE_I18N
2678 wchar_t wc
, start_wc
, end_wc
;
2679 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2681 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2682 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2684 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2685 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2687 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2688 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2689 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2690 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2691 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2692 return REG_ECOLLATE
;
2693 cmp_buf
[0] = start_wc
;
2694 cmp_buf
[4] = end_wc
;
2695 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2698 /* Got valid collation sequence values, add them as a new entry.
2699 However, for !_LIBC we have no collation elements: if the
2700 character set is single byte, the single byte character set
2701 that we build below suffices. parse_bracket_exp passes
2702 no MBCSET if dfa->mb_cur_max == 1. */
2705 /* Check the space of the arrays. */
2706 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2708 /* There is not enough space, need realloc. */
2709 wchar_t *new_array_start
, *new_array_end
;
2712 /* +1 in case of mbcset->nranges is 0. */
2713 new_nranges
= 2 * mbcset
->nranges
+ 1;
2714 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2715 are NULL if *range_alloc == 0. */
2716 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2718 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2721 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2724 mbcset
->range_starts
= new_array_start
;
2725 mbcset
->range_ends
= new_array_end
;
2726 *range_alloc
= new_nranges
;
2729 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2730 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2733 /* Build the table for single byte characters. */
2734 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2737 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2738 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2739 bitset_set (sbcset
, wc
);
2742 # else /* not RE_ENABLE_I18N */
2745 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2746 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2748 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2749 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2751 if (start_ch
> end_ch
)
2753 /* Build the table for single byte characters. */
2754 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2755 if (start_ch
<= ch
&& ch
<= end_ch
)
2756 bitset_set (sbcset
, ch
);
2758 # endif /* not RE_ENABLE_I18N */
2761 #endif /* not _LIBC */
2764 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2765 Build the collating element which is represented by NAME.
2766 The result are written to MBCSET and SBCSET.
2767 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2768 pointer argument since we may update it. */
2770 static reg_errcode_t
2771 # ifdef RE_ENABLE_I18N
2772 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2773 re_charset_t
*mbcset
;
2774 int *coll_sym_alloc
;
2775 # else /* not RE_ENABLE_I18N */
2776 build_collating_symbol (sbcset
, name
)
2777 # endif /* not RE_ENABLE_I18N */
2778 re_bitset_ptr_t sbcset
;
2779 const unsigned char *name
;
2781 size_t name_len
= strlen ((const char *) name
);
2782 if (BE (name_len
!= 1, 0))
2783 return REG_ECOLLATE
;
2786 bitset_set (sbcset
, name
[0]);
2790 #endif /* not _LIBC */
2792 /* This function parse bracket expression like "[abc]", "[a-c]",
2796 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2797 re_string_t
*regexp
;
2800 reg_syntax_t syntax
;
2804 const unsigned char *collseqmb
;
2805 const char *collseqwc
;
2808 const int32_t *symb_table
;
2809 const unsigned char *extra
;
2811 /* Local function for parse_bracket_exp used in _LIBC environement.
2812 Seek the collating symbol entry correspondings to NAME.
2813 Return the index of the symbol in the SYMB_TABLE. */
2816 __attribute ((always_inline
))
2817 seek_collating_symbol_entry (name
, name_len
)
2818 const unsigned char *name
;
2821 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2822 int32_t elem
= hash
% table_size
;
2823 int32_t second
= hash
% (table_size
- 2);
2824 while (symb_table
[2 * elem
] != 0)
2826 /* First compare the hashing value. */
2827 if (symb_table
[2 * elem
] == hash
2828 /* Compare the length of the name. */
2829 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2830 /* Compare the name. */
2831 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2834 /* Yep, this is the entry. */
2844 /* Local function for parse_bracket_exp used in _LIBC environement.
2845 Look up the collation sequence value of BR_ELEM.
2846 Return the value if succeeded, UINT_MAX otherwise. */
2848 auto inline unsigned int
2849 __attribute ((always_inline
))
2850 lookup_collation_sequence_value (br_elem
)
2851 bracket_elem_t
*br_elem
;
2853 if (br_elem
->type
== SB_CHAR
)
2856 if (MB_CUR_MAX == 1)
2859 return collseqmb
[br_elem
->opr
.ch
];
2862 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2863 return __collseq_table_lookup (collseqwc
, wc
);
2866 else if (br_elem
->type
== MB_CHAR
)
2868 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2870 else if (br_elem
->type
== COLL_SYM
)
2872 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2876 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2878 if (symb_table
[2 * elem
] != 0)
2880 /* We found the entry. */
2881 idx
= symb_table
[2 * elem
+ 1];
2882 /* Skip the name of collating element name. */
2883 idx
+= 1 + extra
[idx
];
2884 /* Skip the byte sequence of the collating element. */
2885 idx
+= 1 + extra
[idx
];
2886 /* Adjust for the alignment. */
2887 idx
= (idx
+ 3) & ~3;
2888 /* Skip the multibyte collation sequence value. */
2889 idx
+= sizeof (unsigned int);
2890 /* Skip the wide char sequence of the collating element. */
2891 idx
+= sizeof (unsigned int) *
2892 (1 + *(unsigned int *) (extra
+ idx
));
2893 /* Return the collation sequence value. */
2894 return *(unsigned int *) (extra
+ idx
);
2896 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2898 /* No valid character. Match it as a single byte
2900 return collseqmb
[br_elem
->opr
.name
[0]];
2903 else if (sym_name_len
== 1)
2904 return collseqmb
[br_elem
->opr
.name
[0]];
2909 /* Local function for parse_bracket_exp used in _LIBC environement.
2910 Build the range expression which starts from START_ELEM, and ends
2911 at END_ELEM. The result are written to MBCSET and SBCSET.
2912 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2913 mbcset->range_ends, is a pointer argument sinse we may
2916 auto inline reg_errcode_t
2917 __attribute ((always_inline
))
2918 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2919 re_charset_t
*mbcset
;
2921 re_bitset_ptr_t sbcset
;
2922 bracket_elem_t
*start_elem
, *end_elem
;
2925 uint32_t start_collseq
;
2926 uint32_t end_collseq
;
2928 /* Equivalence Classes and Character Classes can't be a range
2930 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2931 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2935 start_collseq
= lookup_collation_sequence_value (start_elem
);
2936 end_collseq
= lookup_collation_sequence_value (end_elem
);
2937 /* Check start/end collation sequence values. */
2938 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2939 return REG_ECOLLATE
;
2940 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2943 /* Got valid collation sequence values, add them as a new entry.
2944 However, if we have no collation elements, and the character set
2945 is single byte, the single byte character set that we
2946 build below suffices. */
2947 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2949 /* Check the space of the arrays. */
2950 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2952 /* There is not enough space, need realloc. */
2953 uint32_t *new_array_start
;
2954 uint32_t *new_array_end
;
2957 /* +1 in case of mbcset->nranges is 0. */
2958 new_nranges
= 2 * mbcset
->nranges
+ 1;
2959 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2961 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2964 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2967 mbcset
->range_starts
= new_array_start
;
2968 mbcset
->range_ends
= new_array_end
;
2969 *range_alloc
= new_nranges
;
2972 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2973 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2976 /* Build the table for single byte characters. */
2977 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2979 uint32_t ch_collseq
;
2981 if (MB_CUR_MAX == 1)
2984 ch_collseq
= collseqmb
[ch
];
2986 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2987 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2988 bitset_set (sbcset
, ch
);
2993 /* Local function for parse_bracket_exp used in _LIBC environement.
2994 Build the collating element which is represented by NAME.
2995 The result are written to MBCSET and SBCSET.
2996 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2997 pointer argument sinse we may update it. */
2999 auto inline reg_errcode_t
3000 __attribute ((always_inline
))
3001 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3002 re_charset_t
*mbcset
;
3003 int *coll_sym_alloc
;
3004 re_bitset_ptr_t sbcset
;
3005 const unsigned char *name
;
3008 size_t name_len
= strlen ((const char *) name
);
3011 elem
= seek_collating_symbol_entry (name
, name_len
);
3012 if (symb_table
[2 * elem
] != 0)
3014 /* We found the entry. */
3015 idx
= symb_table
[2 * elem
+ 1];
3016 /* Skip the name of collating element name. */
3017 idx
+= 1 + extra
[idx
];
3019 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3021 /* No valid character, treat it as a normal
3023 bitset_set (sbcset
, name
[0]);
3027 return REG_ECOLLATE
;
3029 /* Got valid collation sequence, add it as a new entry. */
3030 /* Check the space of the arrays. */
3031 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3033 /* Not enough, realloc it. */
3034 /* +1 in case of mbcset->ncoll_syms is 0. */
3035 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3036 /* Use realloc since mbcset->coll_syms is NULL
3038 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3039 new_coll_sym_alloc
);
3040 if (BE (new_coll_syms
== NULL
, 0))
3042 mbcset
->coll_syms
= new_coll_syms
;
3043 *coll_sym_alloc
= new_coll_sym_alloc
;
3045 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3050 if (BE (name_len
!= 1, 0))
3051 return REG_ECOLLATE
;
3054 bitset_set (sbcset
, name
[0]);
3061 re_token_t br_token
;
3062 re_bitset_ptr_t sbcset
;
3063 #ifdef RE_ENABLE_I18N
3064 re_charset_t
*mbcset
;
3065 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3066 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3067 #endif /* not RE_ENABLE_I18N */
3069 bin_tree_t
*work_tree
;
3071 int first_round
= 1;
3073 collseqmb
= (const unsigned char *)
3074 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3075 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3081 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3082 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3083 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3084 _NL_COLLATE_SYMB_TABLEMB
);
3085 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3086 _NL_COLLATE_SYMB_EXTRAMB
);
3089 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3090 #ifdef RE_ENABLE_I18N
3091 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3092 #endif /* RE_ENABLE_I18N */
3093 #ifdef RE_ENABLE_I18N
3094 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3096 if (BE (sbcset
== NULL
, 0))
3097 #endif /* RE_ENABLE_I18N */
3103 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3104 if (BE (token
->type
== END_OF_RE
, 0))
3107 goto parse_bracket_exp_free_return
;
3109 if (token
->type
== OP_NON_MATCH_LIST
)
3111 #ifdef RE_ENABLE_I18N
3112 mbcset
->non_match
= 1;
3113 #endif /* not RE_ENABLE_I18N */
3115 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3116 bitset_set (sbcset
, '\0');
3117 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3118 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3119 if (BE (token
->type
== END_OF_RE
, 0))
3122 goto parse_bracket_exp_free_return
;
3126 /* We treat the first ']' as a normal character. */
3127 if (token
->type
== OP_CLOSE_BRACKET
)
3128 token
->type
= CHARACTER
;
3132 bracket_elem_t start_elem
, end_elem
;
3133 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3134 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3136 int token_len2
= 0, is_range_exp
= 0;
3139 start_elem
.opr
.name
= start_name_buf
;
3140 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3141 syntax
, first_round
);
3142 if (BE (ret
!= REG_NOERROR
, 0))
3145 goto parse_bracket_exp_free_return
;
3149 /* Get information about the next token. We need it in any case. */
3150 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3152 /* Do not check for ranges if we know they are not allowed. */
3153 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3155 if (BE (token
->type
== END_OF_RE
, 0))
3158 goto parse_bracket_exp_free_return
;
3160 if (token
->type
== OP_CHARSET_RANGE
)
3162 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3163 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3164 if (BE (token2
.type
== END_OF_RE
, 0))
3167 goto parse_bracket_exp_free_return
;
3169 if (token2
.type
== OP_CLOSE_BRACKET
)
3171 /* We treat the last '-' as a normal character. */
3172 re_string_skip_bytes (regexp
, -token_len
);
3173 token
->type
= CHARACTER
;
3180 if (is_range_exp
== 1)
3182 end_elem
.opr
.name
= end_name_buf
;
3183 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3185 if (BE (ret
!= REG_NOERROR
, 0))
3188 goto parse_bracket_exp_free_return
;
3191 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3194 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3195 &start_elem
, &end_elem
);
3197 # ifdef RE_ENABLE_I18N
3198 *err
= build_range_exp (sbcset
,
3199 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3200 &range_alloc
, &start_elem
, &end_elem
);
3202 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3204 #endif /* RE_ENABLE_I18N */
3205 if (BE (*err
!= REG_NOERROR
, 0))
3206 goto parse_bracket_exp_free_return
;
3210 switch (start_elem
.type
)
3213 bitset_set (sbcset
, start_elem
.opr
.ch
);
3215 #ifdef RE_ENABLE_I18N
3217 /* Check whether the array has enough space. */
3218 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3220 wchar_t *new_mbchars
;
3221 /* Not enough, realloc it. */
3222 /* +1 in case of mbcset->nmbchars is 0. */
3223 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3224 /* Use realloc since array is NULL if *alloc == 0. */
3225 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3227 if (BE (new_mbchars
== NULL
, 0))
3228 goto parse_bracket_exp_espace
;
3229 mbcset
->mbchars
= new_mbchars
;
3231 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3233 #endif /* RE_ENABLE_I18N */
3235 *err
= build_equiv_class (sbcset
,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset
, &equiv_class_alloc
,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem
.opr
.name
);
3240 if (BE (*err
!= REG_NOERROR
, 0))
3241 goto parse_bracket_exp_free_return
;
3244 *err
= build_collating_symbol (sbcset
,
3245 #ifdef RE_ENABLE_I18N
3246 mbcset
, &coll_sym_alloc
,
3247 #endif /* RE_ENABLE_I18N */
3248 start_elem
.opr
.name
);
3249 if (BE (*err
!= REG_NOERROR
, 0))
3250 goto parse_bracket_exp_free_return
;
3253 *err
= build_charclass (regexp
->trans
, sbcset
,
3254 #ifdef RE_ENABLE_I18N
3255 mbcset
, &char_class_alloc
,
3256 #endif /* RE_ENABLE_I18N */
3257 start_elem
.opr
.name
, syntax
);
3258 if (BE (*err
!= REG_NOERROR
, 0))
3259 goto parse_bracket_exp_free_return
;
3266 if (BE (token
->type
== END_OF_RE
, 0))
3269 goto parse_bracket_exp_free_return
;
3271 if (token
->type
== OP_CLOSE_BRACKET
)
3275 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3277 /* If it is non-matching list. */
3279 bitset_not (sbcset
);
3281 #ifdef RE_ENABLE_I18N
3282 /* Ensure only single byte characters are set. */
3283 if (dfa
->mb_cur_max
> 1)
3284 bitset_mask (sbcset
, dfa
->sb_char
);
3285 #endif /* RE_ENABLE_I18N */
3287 /* Build a tree for simple bracket. */
3288 br_token
.type
= SIMPLE_BRACKET
;
3289 br_token
.opr
.sbcset
= sbcset
;
3290 work_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3291 if (BE (work_tree
== NULL
, 0))
3292 goto parse_bracket_exp_espace
;
3294 #ifdef RE_ENABLE_I18N
3295 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3296 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3297 || mbcset
->non_match
)))
3299 re_token_t alt_token
;
3300 bin_tree_t
*mbc_tree
;
3302 /* Build a tree for complex bracket. */
3303 dfa
->has_mb_node
= 1;
3304 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3305 if (sbcset
[sbc_idx
])
3307 /* If there are no bits set in sbcset, there is no point
3308 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3309 if (sbc_idx
== BITSET_UINTS
)
3312 dfa
->nodes
[work_tree
->node_idx
].type
= COMPLEX_BRACKET
;
3313 dfa
->nodes
[work_tree
->node_idx
].opr
.mbcset
= mbcset
;
3316 br_token
.type
= COMPLEX_BRACKET
;
3317 br_token
.opr
.mbcset
= mbcset
;
3318 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3319 if (BE (mbc_tree
== NULL
, 0))
3320 goto parse_bracket_exp_espace
;
3321 /* Then join them by ALT node. */
3322 alt_token
.type
= OP_ALT
;
3323 dfa
->has_plural_match
= 1;
3324 work_tree
= re_dfa_add_tree_node (dfa
, work_tree
, mbc_tree
, &alt_token
);
3325 if (BE (mbc_tree
!= NULL
, 1))
3330 free_charset (mbcset
);
3333 #else /* not RE_ENABLE_I18N */
3335 #endif /* not RE_ENABLE_I18N */
3337 parse_bracket_exp_espace
:
3339 parse_bracket_exp_free_return
:
3341 #ifdef RE_ENABLE_I18N
3342 free_charset (mbcset
);
3343 #endif /* RE_ENABLE_I18N */
3347 /* Parse an element in the bracket expression. */
3349 static reg_errcode_t
3350 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3352 bracket_elem_t
*elem
;
3353 re_string_t
*regexp
;
3357 reg_syntax_t syntax
;
3360 #ifdef RE_ENABLE_I18N
3362 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3363 if (cur_char_size
> 1)
3365 elem
->type
= MB_CHAR
;
3366 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3367 re_string_skip_bytes (regexp
, cur_char_size
);
3370 #endif /* RE_ENABLE_I18N */
3371 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3372 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3373 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3374 return parse_bracket_symbol (elem
, regexp
, token
);
3375 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3377 /* A '-' must only appear as anything but a range indicator before
3378 the closing bracket. Everything else is an error. */
3380 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3381 if (token2
.type
!= OP_CLOSE_BRACKET
)
3382 /* The actual error value is not standardized since this whole
3383 case is undefined. But ERANGE makes good sense. */
3386 elem
->type
= SB_CHAR
;
3387 elem
->opr
.ch
= token
->opr
.c
;
3391 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3392 such as [:<character_class>:], [.<collating_element>.], and
3393 [=<equivalent_class>=]. */
3395 static reg_errcode_t
3396 parse_bracket_symbol (elem
, regexp
, token
)
3397 bracket_elem_t
*elem
;
3398 re_string_t
*regexp
;
3401 unsigned char ch
, delim
= token
->opr
.c
;
3403 if (re_string_eoi(regexp
))
3407 if (i
>= BRACKET_NAME_BUF_SIZE
)
3409 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3410 ch
= re_string_fetch_byte_case (regexp
);
3412 ch
= re_string_fetch_byte (regexp
);
3413 if (re_string_eoi(regexp
))
3415 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3417 elem
->opr
.name
[i
] = ch
;
3419 re_string_skip_bytes (regexp
, 1);
3420 elem
->opr
.name
[i
] = '\0';
3421 switch (token
->type
)
3423 case OP_OPEN_COLL_ELEM
:
3424 elem
->type
= COLL_SYM
;
3426 case OP_OPEN_EQUIV_CLASS
:
3427 elem
->type
= EQUIV_CLASS
;
3429 case OP_OPEN_CHAR_CLASS
:
3430 elem
->type
= CHAR_CLASS
;
3438 /* Helper function for parse_bracket_exp.
3439 Build the equivalence class which is represented by NAME.
3440 The result are written to MBCSET and SBCSET.
3441 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3442 is a pointer argument sinse we may update it. */
3444 static reg_errcode_t
3445 #ifdef RE_ENABLE_I18N
3446 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3447 re_charset_t
*mbcset
;
3448 int *equiv_class_alloc
;
3449 #else /* not RE_ENABLE_I18N */
3450 build_equiv_class (sbcset
, name
)
3451 #endif /* not RE_ENABLE_I18N */
3452 re_bitset_ptr_t sbcset
;
3453 const unsigned char *name
;
3456 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3459 const int32_t *table
, *indirect
;
3460 const unsigned char *weights
, *extra
, *cp
;
3461 unsigned char char_buf
[2];
3465 /* This #include defines a local function! */
3466 # include <locale/weight.h>
3467 /* Calculate the index for equivalence class. */
3469 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3470 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3471 _NL_COLLATE_WEIGHTMB
);
3472 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3473 _NL_COLLATE_EXTRAMB
);
3474 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3475 _NL_COLLATE_INDIRECTMB
);
3476 idx1
= findidx (&cp
);
3477 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3478 /* This isn't a valid character. */
3479 return REG_ECOLLATE
;
3481 /* Build single byte matcing table for this equivalence class. */
3482 char_buf
[1] = (unsigned char) '\0';
3483 len
= weights
[idx1
];
3484 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3488 idx2
= findidx (&cp
);
3493 /* This isn't a valid character. */
3495 if (len
== weights
[idx2
])
3498 while (cnt
<= len
&&
3499 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3503 bitset_set (sbcset
, ch
);
3506 /* Check whether the array has enough space. */
3507 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3509 /* Not enough, realloc it. */
3510 /* +1 in case of mbcset->nequiv_classes is 0. */
3511 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3512 /* Use realloc since the array is NULL if *alloc == 0. */
3513 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3515 new_equiv_class_alloc
);
3516 if (BE (new_equiv_classes
== NULL
, 0))
3518 mbcset
->equiv_classes
= new_equiv_classes
;
3519 *equiv_class_alloc
= new_equiv_class_alloc
;
3521 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3526 if (BE (strlen ((const char *) name
) != 1, 0))
3527 return REG_ECOLLATE
;
3528 bitset_set (sbcset
, *name
);
3533 /* Helper function for parse_bracket_exp.
3534 Build the character class which is represented by NAME.
3535 The result are written to MBCSET and SBCSET.
3536 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3537 is a pointer argument sinse we may update it. */
3539 static reg_errcode_t
3540 #ifdef RE_ENABLE_I18N
3541 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3542 re_charset_t
*mbcset
;
3543 int *char_class_alloc
;
3544 #else /* not RE_ENABLE_I18N */
3545 build_charclass (trans
, sbcset
, class_name
, syntax
)
3546 #endif /* not RE_ENABLE_I18N */
3547 unsigned RE_TRANSLATE_TYPE trans
;
3548 re_bitset_ptr_t sbcset
;
3549 const unsigned char *class_name
;
3550 reg_syntax_t syntax
;
3553 const char *name
= (const char *) class_name
;
3555 /* In case of REG_ICASE "upper" and "lower" match the both of
3556 upper and lower cases. */
3557 if ((syntax
& RE_ICASE
)
3558 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3561 #ifdef RE_ENABLE_I18N
3562 /* Check the space of the arrays. */
3563 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3565 /* Not enough, realloc it. */
3566 /* +1 in case of mbcset->nchar_classes is 0. */
3567 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3568 /* Use realloc since array is NULL if *alloc == 0. */
3569 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3570 new_char_class_alloc
);
3571 if (BE (new_char_classes
== NULL
, 0))
3573 mbcset
->char_classes
= new_char_classes
;
3574 *char_class_alloc
= new_char_class_alloc
;
3576 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3577 #endif /* RE_ENABLE_I18N */
3579 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3580 for (i = 0; i < SBC_MAX; ++i) \
3582 if (ctype_func (i)) \
3584 int ch = trans ? trans[i] : i; \
3585 bitset_set (sbcset, ch); \
3589 if (strcmp (name
, "alnum") == 0)
3590 BUILD_CHARCLASS_LOOP (isalnum
)
3591 else if (strcmp (name
, "cntrl") == 0)
3592 BUILD_CHARCLASS_LOOP (iscntrl
)
3593 else if (strcmp (name
, "lower") == 0)
3594 BUILD_CHARCLASS_LOOP (islower
)
3595 else if (strcmp (name
, "space") == 0)
3596 BUILD_CHARCLASS_LOOP (isspace
)
3597 else if (strcmp (name
, "alpha") == 0)
3598 BUILD_CHARCLASS_LOOP (isalpha
)
3599 else if (strcmp (name
, "digit") == 0)
3600 BUILD_CHARCLASS_LOOP (isdigit
)
3601 else if (strcmp (name
, "print") == 0)
3602 BUILD_CHARCLASS_LOOP (isprint
)
3603 else if (strcmp (name
, "upper") == 0)
3604 BUILD_CHARCLASS_LOOP (isupper
)
3605 else if (strcmp (name
, "blank") == 0)
3606 BUILD_CHARCLASS_LOOP (isblank
)
3607 else if (strcmp (name
, "graph") == 0)
3608 BUILD_CHARCLASS_LOOP (isgraph
)
3609 else if (strcmp (name
, "punct") == 0)
3610 BUILD_CHARCLASS_LOOP (ispunct
)
3611 else if (strcmp (name
, "xdigit") == 0)
3612 BUILD_CHARCLASS_LOOP (isxdigit
)
3620 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3622 unsigned RE_TRANSLATE_TYPE trans
;
3623 const unsigned char *class_name
;
3624 const unsigned char *extra
;
3628 re_bitset_ptr_t sbcset
;
3629 #ifdef RE_ENABLE_I18N
3630 re_charset_t
*mbcset
;
3632 #endif /* not RE_ENABLE_I18N */
3634 re_token_t br_token
;
3637 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3638 #ifdef RE_ENABLE_I18N
3639 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3640 #endif /* RE_ENABLE_I18N */
3642 #ifdef RE_ENABLE_I18N
3643 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3644 #else /* not RE_ENABLE_I18N */
3645 if (BE (sbcset
== NULL
, 0))
3646 #endif /* not RE_ENABLE_I18N */
3654 #ifdef RE_ENABLE_I18N
3656 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3657 bitset_set(cset->sbcset, '\0');
3659 mbcset
->non_match
= 1;
3660 #endif /* not RE_ENABLE_I18N */
3663 /* We don't care the syntax in this case. */
3664 ret
= build_charclass (trans
, sbcset
,
3665 #ifdef RE_ENABLE_I18N
3667 #endif /* RE_ENABLE_I18N */
3670 if (BE (ret
!= REG_NOERROR
, 0))
3673 #ifdef RE_ENABLE_I18N
3674 free_charset (mbcset
);
3675 #endif /* RE_ENABLE_I18N */
3679 /* \w match '_' also. */
3680 for (; *extra
; extra
++)
3681 bitset_set (sbcset
, *extra
);
3683 /* If it is non-matching list. */
3685 bitset_not (sbcset
);
3687 #ifdef RE_ENABLE_I18N
3688 /* Ensure only single byte characters are set. */
3689 if (dfa
->mb_cur_max
> 1)
3690 bitset_mask (sbcset
, dfa
->sb_char
);
3693 /* Build a tree for simple bracket. */
3694 br_token
.type
= SIMPLE_BRACKET
;
3695 br_token
.opr
.sbcset
= sbcset
;
3696 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3697 if (BE (tree
== NULL
, 0))
3698 goto build_word_op_espace
;
3700 #ifdef RE_ENABLE_I18N
3701 if (dfa
->mb_cur_max
> 1)
3703 re_token_t alt_token
;
3704 bin_tree_t
*mbc_tree
;
3705 /* Build a tree for complex bracket. */
3706 br_token
.type
= COMPLEX_BRACKET
;
3707 br_token
.opr
.mbcset
= mbcset
;
3708 dfa
->has_mb_node
= 1;
3709 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3710 if (BE (mbc_tree
== NULL
, 0))
3711 goto build_word_op_espace
;
3712 /* Then join them by ALT node. */
3713 alt_token
.type
= OP_ALT
;
3714 dfa
->has_plural_match
= 1;
3715 tree
= re_dfa_add_tree_node (dfa
, tree
, mbc_tree
, &alt_token
);
3716 if (BE (mbc_tree
!= NULL
, 1))
3721 free_charset (mbcset
);
3724 #else /* not RE_ENABLE_I18N */
3726 #endif /* not RE_ENABLE_I18N */
3728 build_word_op_espace
:
3730 #ifdef RE_ENABLE_I18N
3731 free_charset (mbcset
);
3732 #endif /* RE_ENABLE_I18N */
3737 /* This is intended for the expressions like "a{1,3}".
3738 Fetch a number from `input', and return the number.
3739 Return -1, if the number field is empty like "{,1}".
3740 Return -2, If an error is occured. */
3743 fetch_number (input
, token
, syntax
)
3746 reg_syntax_t syntax
;
3752 fetch_token (token
, input
, syntax
);
3754 if (BE (token
->type
== END_OF_RE
, 0))
3756 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3758 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3759 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3760 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3765 #ifdef RE_ENABLE_I18N
3767 free_charset (re_charset_t
*cset
)
3769 re_free (cset
->mbchars
);
3771 re_free (cset
->coll_syms
);
3772 re_free (cset
->equiv_classes
);
3773 re_free (cset
->range_starts
);
3774 re_free (cset
->range_ends
);
3776 re_free (cset
->char_classes
);
3779 #endif /* RE_ENABLE_I18N */
3781 /* Functions for binary tree operation. */
3783 /* Create a tree node. */
3786 create_tree (dfa
, left
, right
, type
, index
)
3790 re_token_type_t type
;
3794 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3796 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3798 if (storage
== NULL
)
3800 storage
->next
= dfa
->str_tree_storage
;
3801 dfa
->str_tree_storage
= storage
;
3802 dfa
->str_tree_storage_idx
= 0;
3804 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3806 tree
->parent
= NULL
;
3808 tree
->right
= right
;
3810 tree
->node_idx
= index
;
3813 re_node_set_init_empty (&tree
->eclosure
);
3816 left
->parent
= tree
;
3818 right
->parent
= tree
;
3822 /* Create both a DFA node and a tree for it. */
3825 re_dfa_add_tree_node (dfa
, left
, right
, token
)
3829 const re_token_t
*token
;
3831 int new_idx
= re_dfa_add_node (dfa
, *token
, 0);
3836 return create_tree (dfa
, left
, right
, 0, new_idx
);
3839 /* Mark the tree SRC as an optional subexpression. */
3842 mark_opt_subexp (src
, dfa
)
3843 const bin_tree_t
*src
;
3846 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3848 if (src
->type
== CONCAT
3849 && src
->left
->type
== NON_TYPE
3850 && dfa
->nodes
[src
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
3851 mark_opt_subexp_iter (src
, dfa
, dfa
->nodes
[src
->left
->node_idx
].opr
.idx
);
3855 /* Recursive tree walker for mark_opt_subexp. */
3858 mark_opt_subexp_iter (src
, dfa
, idx
)
3859 const bin_tree_t
*src
;
3865 if (src
->type
== NON_TYPE
)
3867 node_idx
= src
->node_idx
;
3868 if ((dfa
->nodes
[node_idx
].type
== OP_OPEN_SUBEXP
3869 || dfa
->nodes
[node_idx
].type
== OP_CLOSE_SUBEXP
)
3870 && dfa
->nodes
[node_idx
].opr
.idx
== idx
)
3871 dfa
->nodes
[node_idx
].opt_subexp
= 1;
3874 if (src
->left
!= NULL
)
3875 mark_opt_subexp_iter (src
->left
, dfa
, idx
);
3877 if (src
->right
!= NULL
)
3878 mark_opt_subexp_iter (src
->right
, dfa
, idx
);
3882 /* Duplicate the node SRC, and return new node. */
3885 duplicate_tree (src
, dfa
)
3886 const bin_tree_t
*src
;
3889 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3891 /* Since node indies must be according to Post-order of the tree,
3892 we must duplicate the left at first. */
3893 if (src
->left
!= NULL
)
3895 left
= duplicate_tree (src
->left
, dfa
);
3900 /* Secondaly, duplicate the right. */
3901 if (src
->right
!= NULL
)
3903 right
= duplicate_tree (src
->right
, dfa
);
3908 /* At last, duplicate itself. */
3909 if (src
->type
== NON_TYPE
)
3911 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3912 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3913 if (BE (new_node_idx
== -1, 0))
3917 new_node_idx
= src
->type
;
3919 new_tree
= create_tree (dfa
, left
, right
, src
->type
, new_node_idx
);