1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 struct subexp_optimize
42 static bin_tree_t
*optimize_subexps (struct subexp_optimize
*so
,
43 bin_tree_t
*node
, int sidx
, int depth
);
44 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
45 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
46 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
47 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
48 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
49 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
50 int top_clone_node
, int root_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
53 unsigned int constraint
);
54 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
55 unsigned int constraint
);
56 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
57 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
59 static void calc_inveclosure (re_dfa_t
*dfa
);
60 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
62 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
64 static int peek_token (re_token_t
*token
, re_string_t
*input
,
66 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
68 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
69 reg_syntax_t syntax
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
80 re_token_t
*token
, reg_syntax_t syntax
,
81 int nest
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
83 re_dfa_t
*dfa
, re_token_t
*token
,
84 reg_syntax_t syntax
, reg_errcode_t
*err
);
85 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
86 re_token_t
*token
, reg_syntax_t syntax
,
88 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
90 re_token_t
*token
, int token_len
,
94 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
100 re_charset_t
*mbcset
, int *range_alloc
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 re_charset_t
*mbcset
,
106 const unsigned char *name
);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
109 bracket_elem_t
*start_elem
,
110 bracket_elem_t
*end_elem
);
111 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
112 const unsigned char *name
);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
117 re_charset_t
*mbcset
,
118 int *equiv_class_alloc
,
119 const unsigned char *name
);
120 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
121 re_bitset_ptr_t sbcset
,
122 re_charset_t
*mbcset
,
123 int *char_class_alloc
,
124 const unsigned char *class_name
,
125 reg_syntax_t syntax
);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
128 const unsigned char *name
);
129 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
130 re_bitset_ptr_t sbcset
,
131 const unsigned char *class_name
,
132 reg_syntax_t syntax
);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
135 unsigned RE_TRANSLATE_TYPE trans
,
136 const unsigned char *class_name
,
137 const unsigned char *extra
,
138 int non_match
, reg_errcode_t
*err
);
139 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
140 bin_tree_t
*left
, bin_tree_t
*right
,
141 re_token_type_t type
, int index
);
142 static bin_tree_t
*re_dfa_add_tree_node (re_dfa_t
*dfa
,
143 bin_tree_t
*left
, bin_tree_t
*right
,
144 const re_token_t
*token
)
145 __attribute ((noinline
));
146 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
147 static void mark_opt_subexp (const bin_tree_t
*src
, re_dfa_t
*dfa
);
148 static void mark_opt_subexp_iter (const bin_tree_t
*src
, re_dfa_t
*dfa
, int idx
);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid
[] attribute_hidden
=
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx
[] attribute_hidden
=
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (pattern
, length
, bufp
)
243 struct re_pattern_buffer
*bufp
;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
252 /* Match anchors at newline. */
253 bufp
->newline_anchor
= 1;
255 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
259 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
262 weak_alias (__re_compile_pattern
, re_compile_pattern
)
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options
;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
281 re_set_syntax (syntax
)
284 reg_syntax_t ret
= re_syntax_options
;
286 re_syntax_options
= syntax
;
290 weak_alias (__re_set_syntax
, re_set_syntax
)
294 re_compile_fastmap (bufp
)
295 struct re_pattern_buffer
*bufp
;
297 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
298 char *fastmap
= bufp
->fastmap
;
300 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
301 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
302 if (dfa
->init_state
!= dfa
->init_state_word
)
303 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_nl
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
308 bufp
->fastmap_accurate
= 1;
312 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
316 __attribute ((always_inline
))
317 re_set_fastmap (char *fastmap
, int icase
, int ch
)
321 fastmap
[tolower (ch
)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
328 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
330 const re_dfastate_t
*init_state
;
333 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
335 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
336 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
338 int node
= init_state
->nodes
.elems
[node_cnt
];
339 re_token_type_t type
= dfa
->nodes
[node
].type
;
341 if (type
== CHARACTER
)
343 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
347 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
352 *p
++ = dfa
->nodes
[node
].opr
.c
;
353 while (++node
< dfa
->nodes_len
354 && dfa
->nodes
[node
].type
== CHARACTER
355 && dfa
->nodes
[node
].mb_partial
)
356 *p
++ = dfa
->nodes
[node
].opr
.c
;
357 memset (&state
, 0, sizeof (state
));
358 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
360 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
361 re_set_fastmap (fastmap
, 0, buf
[0]);
365 else if (type
== SIMPLE_BRACKET
)
368 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
369 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
370 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
371 re_set_fastmap (fastmap
, icase
, ch
);
373 #ifdef RE_ENABLE_I18N
374 else if (type
== COMPLEX_BRACKET
)
377 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
378 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
379 || cset
->nranges
|| cset
->nchar_classes
)
382 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
391 const int32_t *table
= (const int32_t *)
392 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
393 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
394 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
396 re_set_fastmap (fastmap
, icase
, ch
);
399 if (dfa
->mb_cur_max
> 1)
400 for (i
= 0; i
< SBC_MAX
; ++i
)
401 if (__btowc (i
) == WEOF
)
402 re_set_fastmap (fastmap
, icase
, i
);
403 # endif /* not _LIBC */
405 for (i
= 0; i
< cset
->nmbchars
; ++i
)
409 memset (&state
, '\0', sizeof (state
));
410 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
411 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
412 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
414 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
415 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
419 #endif /* RE_ENABLE_I18N */
420 else if (type
== OP_PERIOD
421 #ifdef RE_ENABLE_I18N
422 || type
== OP_UTF8_PERIOD
423 #endif /* RE_ENABLE_I18N */
424 || type
== END_OF_RE
)
426 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
427 if (type
== END_OF_RE
)
428 bufp
->can_be_null
= 1;
434 /* Entry point for POSIX code. */
435 /* regcomp takes a regular expression as a string and compiles it.
437 PREG is a regex_t *. We do not expect any fields to be initialized,
438 since POSIX says we shouldn't. Thus, we set
440 `buffer' to the compiled pattern;
441 `used' to the length of the compiled pattern;
442 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
443 REG_EXTENDED bit in CFLAGS is set; otherwise, to
444 RE_SYNTAX_POSIX_BASIC;
445 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
446 `fastmap' to an allocated space for the fastmap;
447 `fastmap_accurate' to zero;
448 `re_nsub' to the number of subexpressions in PATTERN.
450 PATTERN is the address of the pattern string.
452 CFLAGS is a series of bits which affect compilation.
454 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
455 use POSIX basic syntax.
457 If REG_NEWLINE is set, then . and [^...] don't match newline.
458 Also, regexec will try a match beginning after every newline.
460 If REG_ICASE is set, then we considers upper- and lowercase
461 versions of letters to be equivalent when matching.
463 If REG_NOSUB is set, then when PREG is passed to regexec, that
464 routine will report only success or failure, and nothing about the
467 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
468 the return codes and their meanings.) */
471 regcomp (preg
, pattern
, cflags
)
472 regex_t
*__restrict preg
;
473 const char *__restrict pattern
;
477 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
478 : RE_SYNTAX_POSIX_BASIC
);
484 /* Try to allocate space for the fastmap. */
485 preg
->fastmap
= re_malloc (char, SBC_MAX
);
486 if (BE (preg
->fastmap
== NULL
, 0))
489 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
491 /* If REG_NEWLINE is set, newlines are treated differently. */
492 if (cflags
& REG_NEWLINE
)
493 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
494 syntax
&= ~RE_DOT_NEWLINE
;
495 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
496 /* It also changes the matching behavior. */
497 preg
->newline_anchor
= 1;
500 preg
->newline_anchor
= 0;
501 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
502 preg
->translate
= NULL
;
504 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
506 /* POSIX doesn't distinguish between an unmatched open-group and an
507 unmatched close-group: both are REG_EPAREN. */
508 if (ret
== REG_ERPAREN
)
511 /* We have already checked preg->fastmap != NULL. */
512 if (BE (ret
== REG_NOERROR
, 1))
513 /* Compute the fastmap now, since regexec cannot modify the pattern
514 buffer. This function never fails in this implementation. */
515 (void) re_compile_fastmap (preg
);
518 /* Some error occurred while compiling the expression. */
519 re_free (preg
->fastmap
);
520 preg
->fastmap
= NULL
;
526 weak_alias (__regcomp
, regcomp
)
529 /* Returns a message corresponding to an error code, ERRCODE, returned
530 from either regcomp or regexec. We don't use PREG here. */
533 regerror (errcode
, preg
, errbuf
, errbuf_size
)
543 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
544 / sizeof (__re_error_msgid_idx
[0])), 0))
545 /* Only error codes returned by the rest of the code should be passed
546 to this routine. If we are given anything else, or if other regex
547 code generates an invalid error code, then the program has a bug.
548 Dump core so we can fix it. */
551 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
553 msg_size
= strlen (msg
) + 1; /* Includes the null. */
555 if (BE (errbuf_size
!= 0, 1))
557 if (BE (msg_size
> errbuf_size
, 0))
559 #if defined HAVE_MEMPCPY || defined _LIBC
560 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
562 memcpy (errbuf
, msg
, errbuf_size
- 1);
563 errbuf
[errbuf_size
- 1] = 0;
567 memcpy (errbuf
, msg
, msg_size
);
573 weak_alias (__regerror
, regerror
)
577 #ifdef RE_ENABLE_I18N
578 /* This static array is used for the map to single-byte characters when
579 UTF-8 is used. Otherwise we would allocate memory just to initialize
580 it the same all the time. UTF-8 is the preferred encoding so this is
581 a worthwhile optimization. */
582 static const bitset utf8_sb_map
=
584 /* Set the first 128 bits. */
585 # if UINT_MAX == 0xffffffff
586 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
588 # error "Add case for new unsigned int size"
595 free_dfa_content (re_dfa_t
*dfa
)
600 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
602 re_token_t
*node
= dfa
->nodes
+ i
;
603 #ifdef RE_ENABLE_I18N
604 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
605 free_charset (node
->opr
.mbcset
);
607 #endif /* RE_ENABLE_I18N */
608 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
609 re_free (node
->opr
.sbcset
);
611 re_free (dfa
->nexts
);
612 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
614 if (dfa
->eclosures
!= NULL
)
615 re_node_set_free (dfa
->eclosures
+ i
);
616 if (dfa
->inveclosures
!= NULL
)
617 re_node_set_free (dfa
->inveclosures
+ i
);
618 if (dfa
->edests
!= NULL
)
619 re_node_set_free (dfa
->edests
+ i
);
621 re_free (dfa
->edests
);
622 re_free (dfa
->eclosures
);
623 re_free (dfa
->inveclosures
);
624 re_free (dfa
->nodes
);
626 if (dfa
->state_table
)
627 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
629 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
630 for (j
= 0; j
< entry
->num
; ++j
)
632 re_dfastate_t
*state
= entry
->array
[j
];
635 re_free (entry
->array
);
637 re_free (dfa
->state_table
);
638 #ifdef RE_ENABLE_I18N
639 if (dfa
->sb_char
!= utf8_sb_map
)
640 re_free (dfa
->sb_char
);
642 re_free (dfa
->subexp_map
);
644 re_free (dfa
->re_str
);
651 /* Free dynamically allocated space used by PREG. */
657 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
658 if (BE (dfa
!= NULL
, 1))
659 free_dfa_content (dfa
);
663 re_free (preg
->fastmap
);
664 preg
->fastmap
= NULL
;
666 re_free (preg
->translate
);
667 preg
->translate
= NULL
;
670 weak_alias (__regfree
, regfree
)
673 /* Entry points compatible with 4.2 BSD regex library. We don't define
674 them unless specifically requested. */
676 #if defined _REGEX_RE_COMP || defined _LIBC
678 /* BSD has one and only one pattern buffer. */
679 static struct re_pattern_buffer re_comp_buf
;
683 /* Make these definitions weak in libc, so POSIX programs can redefine
684 these names if they don't use our functions, and still use
685 regcomp/regexec above without link errors. */
696 if (!re_comp_buf
.buffer
)
697 return gettext ("No previous regular expression");
701 if (re_comp_buf
.buffer
)
703 fastmap
= re_comp_buf
.fastmap
;
704 re_comp_buf
.fastmap
= NULL
;
705 __regfree (&re_comp_buf
);
706 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
707 re_comp_buf
.fastmap
= fastmap
;
710 if (re_comp_buf
.fastmap
== NULL
)
712 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
713 if (re_comp_buf
.fastmap
== NULL
)
714 return (char *) gettext (__re_error_msgid
715 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
718 /* Since `re_exec' always passes NULL for the `regs' argument, we
719 don't need to initialize the pattern buffer fields which affect it. */
721 /* Match anchors at newlines. */
722 re_comp_buf
.newline_anchor
= 1;
724 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
729 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
730 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
734 libc_freeres_fn (free_mem
)
736 __regfree (&re_comp_buf
);
740 #endif /* _REGEX_RE_COMP */
742 /* Internal entry point.
743 Compile the regular expression PATTERN, whose length is LENGTH.
744 SYNTAX indicate regular expression's syntax. */
747 re_compile_internal (preg
, pattern
, length
, syntax
)
749 const char * pattern
;
753 reg_errcode_t err
= REG_NOERROR
;
757 /* Initialize the pattern buffer. */
758 preg
->fastmap_accurate
= 0;
759 preg
->syntax
= syntax
;
760 preg
->not_bol
= preg
->not_eol
= 0;
763 preg
->can_be_null
= 0;
764 preg
->regs_allocated
= REGS_UNALLOCATED
;
766 /* Initialize the dfa. */
767 dfa
= (re_dfa_t
*) preg
->buffer
;
768 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
777 preg
->allocated
= sizeof (re_dfa_t
);
778 preg
->buffer
= (unsigned char *) dfa
;
780 preg
->used
= sizeof (re_dfa_t
);
782 __libc_lock_init (dfa
->lock
);
784 err
= init_dfa (dfa
, length
);
785 if (BE (err
!= REG_NOERROR
, 0))
787 free_dfa_content (dfa
);
793 dfa
->re_str
= re_malloc (char, length
+ 1);
794 strncpy (dfa
->re_str
, pattern
, length
+ 1);
797 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
798 syntax
& RE_ICASE
, dfa
);
799 if (BE (err
!= REG_NOERROR
, 0))
801 re_compile_internal_free_return
:
802 free_workarea_compile (preg
);
803 re_string_destruct (®exp
);
804 free_dfa_content (dfa
);
810 /* Parse the regular expression, and build a structure tree. */
812 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
813 if (BE (dfa
->str_tree
== NULL
, 0))
814 goto re_compile_internal_free_return
;
816 #ifdef RE_ENABLE_I18N
817 /* If possible, do searching in single byte encoding to speed things up. */
818 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
822 if (preg
->re_nsub
> 0)
824 struct subexp_optimize so
;
827 so
.nodes
= dfa
->nodes
;
828 so
.no_sub
= preg
->no_sub
;
829 so
.re_nsub
= preg
->re_nsub
;
830 dfa
->str_tree
= optimize_subexps (&so
, dfa
->str_tree
, -1, 0);
833 /* Analyze the tree and collect information which is necessary to
836 if (BE (err
!= REG_NOERROR
, 0))
837 goto re_compile_internal_free_return
;
839 /* Then create the initial state of the dfa. */
840 err
= create_initial_state (dfa
);
842 /* Release work areas. */
843 free_workarea_compile (preg
);
844 re_string_destruct (®exp
);
846 if (BE (err
!= REG_NOERROR
, 0))
848 free_dfa_content (dfa
);
856 /* Initialize DFA. We use the length of the regular expression PAT_LEN
857 as the initial length of some arrays. */
860 init_dfa (dfa
, pat_len
)
869 memset (dfa
, '\0', sizeof (re_dfa_t
));
871 /* Force allocation of str_tree_storage the first time. */
872 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
874 dfa
->nodes_alloc
= pat_len
+ 1;
875 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
877 dfa
->states_alloc
= pat_len
+ 1;
879 /* table_size = 2 ^ ceil(log pat_len) */
880 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
881 if (table_size
> pat_len
)
884 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
885 dfa
->state_hash_mask
= table_size
- 1;
887 dfa
->mb_cur_max
= MB_CUR_MAX
;
889 if (dfa
->mb_cur_max
== 6
890 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
892 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
895 # ifdef HAVE_LANGINFO_CODESET
896 codeset_name
= nl_langinfo (CODESET
);
898 codeset_name
= getenv ("LC_ALL");
899 if (codeset_name
== NULL
|| codeset
[0] == '\0')
900 codeset_name
= getenv ("LC_CTYPE");
901 if (codeset_name
== NULL
|| codeset
[0] == '\0')
902 codeset_name
= getenv ("LANG");
903 if (codeset_name
== NULL
)
905 else if (strchr (codeset_name
, '.') != NULL
)
906 codeset_name
= strchr (codeset_name
, '.') + 1;
909 if (strcasecmp (codeset_name
, "UTF-8") == 0
910 || strcasecmp (codeset_name
, "UTF8") == 0)
913 /* We check exhaustively in the loop below if this charset is a
914 superset of ASCII. */
915 dfa
->map_notascii
= 0;
918 #ifdef RE_ENABLE_I18N
919 if (dfa
->mb_cur_max
> 1)
922 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
927 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
928 if (BE (dfa
->sb_char
== NULL
, 0))
931 /* Clear all bits by, then set those corresponding to single
933 bitset_empty (dfa
->sb_char
);
935 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
936 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
938 wchar_t wch
= __btowc (ch
);
940 dfa
->sb_char
[i
] |= 1 << j
;
942 if (isascii (ch
) && wch
!= (wchar_t) ch
)
943 dfa
->map_notascii
= 1;
950 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
955 /* Initialize WORD_CHAR table, which indicate which character is
956 "word". In this case "word" means that it is the word construction
957 character used by some operators like "\<", "\>", etc. */
964 dfa
->word_ops_used
= 1;
965 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
966 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
967 if (isalnum (ch
) || ch
== '_')
968 dfa
->word_char
[i
] |= 1 << j
;
971 /* Free the work area which are only used while compiling. */
974 free_workarea_compile (preg
)
977 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
978 bin_tree_storage_t
*storage
, *next
;
979 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
981 next
= storage
->next
;
984 dfa
->str_tree_storage
= NULL
;
985 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
986 dfa
->str_tree
= NULL
;
987 re_free (dfa
->org_indices
);
988 dfa
->org_indices
= NULL
;
991 /* Create initial states for all contexts. */
994 create_initial_state (dfa
)
999 re_node_set init_nodes
;
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first
= dfa
->str_tree
->first
;
1004 dfa
->init_node
= first
;
1005 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1006 if (BE (err
!= REG_NOERROR
, 0))
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa
->nbackref
> 0)
1014 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1016 int node_idx
= init_nodes
.elems
[i
];
1017 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1020 if (type
!= OP_BACK_REF
)
1022 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1024 re_token_t
*clexp_node
;
1025 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1026 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1027 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1030 if (clexp_idx
== init_nodes
.nelem
)
1033 if (type
== OP_BACK_REF
)
1035 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1036 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1038 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1044 /* It must be the first time to invoke acquire_state. */
1045 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1046 /* We don't check ERR here, since the initial state must not be NULL. */
1047 if (BE (dfa
->init_state
== NULL
, 0))
1049 if (dfa
->init_state
->has_constraint
)
1051 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1053 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1055 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1059 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1060 || dfa
->init_state_begbuf
== NULL
, 0))
1064 dfa
->init_state_word
= dfa
->init_state_nl
1065 = dfa
->init_state_begbuf
= dfa
->init_state
;
1067 re_node_set_free (&init_nodes
);
1071 #ifdef RE_ENABLE_I18N
1072 /* If it is possible to do searching in single byte encoding instead of UTF-8
1073 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1074 DFA nodes where needed. */
1080 int node
, i
, mb_chars
= 0, has_period
= 0;
1082 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1083 switch (dfa
->nodes
[node
].type
)
1086 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1090 switch (dfa
->nodes
[node
].opr
.idx
)
1098 /* Word anchors etc. cannot be handled. */
1108 case OP_DUP_ASTERISK
:
1109 case OP_DUP_QUESTION
:
1110 case OP_OPEN_SUBEXP
:
1111 case OP_CLOSE_SUBEXP
:
1113 case SIMPLE_BRACKET
:
1114 /* Just double check. */
1115 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1116 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1123 if (mb_chars
|| has_period
)
1124 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1126 if (dfa
->nodes
[node
].type
== CHARACTER
1127 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1128 dfa
->nodes
[node
].mb_partial
= 0;
1129 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1130 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1133 /* The search can be in single byte locale. */
1134 dfa
->mb_cur_max
= 1;
1136 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1141 optimize_subexps (so
, node
, sidx
, depth
)
1142 struct subexp_optimize
*so
;
1146 int idx
, new_depth
, new_sidx
;
1153 if ((depth
& 1) && node
->type
== CONCAT
1154 && node
->right
&& node
->right
->type
== 0
1155 && so
->nodes
[idx
= node
->right
->node_idx
].type
== OP_CLOSE_SUBEXP
)
1157 new_depth
= depth
+ 1;
1159 || (so
->nodes
[idx
].opr
.idx
< 8 * sizeof (so
->dfa
->used_bkref_map
)
1160 && so
->dfa
->used_bkref_map
& (1 << so
->nodes
[idx
].opr
.idx
)))
1161 new_sidx
= so
->nodes
[idx
].opr
.idx
;
1163 node
->left
= optimize_subexps (so
, node
->left
, new_sidx
, new_depth
);
1164 new_depth
= (depth
& 1) == 0 && node
->type
== CONCAT
1165 && node
->left
&& node
->left
->type
== 0
1166 && so
->nodes
[node
->left
->node_idx
].type
== OP_OPEN_SUBEXP
1168 node
->right
= optimize_subexps (so
, node
->right
, sidx
, new_depth
);
1170 if (node
->type
!= CONCAT
)
1172 if ((depth
& 1) == 0
1174 && node
->left
->type
== 0
1175 && so
->nodes
[idx
= node
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
1177 else if ((depth
& 1)
1179 && node
->right
->type
== 0
1180 && so
->nodes
[idx
= node
->right
->node_idx
].type
== OP_CLOSE_SUBEXP
)
1185 if (so
->nodes
[idx
].opr
.idx
< 8 * sizeof (so
->dfa
->used_bkref_map
)
1186 && so
->dfa
->used_bkref_map
& (1 << so
->nodes
[idx
].opr
.idx
))
1196 if (so
->dfa
->subexp_map
== NULL
)
1198 so
->dfa
->subexp_map
= re_malloc (int, so
->re_nsub
);
1199 if (so
->dfa
->subexp_map
== NULL
)
1202 for (i
= 0; i
< so
->re_nsub
; i
++)
1203 so
->dfa
->subexp_map
[i
] = i
;
1206 i
= so
->nodes
[idx
].opr
.idx
;
1208 so
->dfa
->subexp_map
[i
] = sidx
;
1211 so
->nodes
[idx
].type
= OP_DELETED_SUBEXP
;
1212 ret
->parent
= node
->parent
;
1216 /* Analyze the structure tree, and calculate "first", "next", "edest",
1217 "eclosure", and "inveclosure". */
1219 static reg_errcode_t
1226 /* Allocate arrays. */
1227 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1228 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1229 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1230 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1231 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1232 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1233 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1235 /* Initialize them. */
1236 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1239 re_node_set_init_empty (dfa
->edests
+ i
);
1240 re_node_set_init_empty (dfa
->eclosures
+ i
);
1241 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1244 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1245 if (BE (ret
== REG_NOERROR
, 1))
1247 ret
= calc_eclosure (dfa
);
1248 if (ret
== REG_NOERROR
)
1249 calc_inveclosure (dfa
);
1254 /* Helper functions for analyze.
1255 This function calculate "first", "next", and "edest" for the subtree
1256 whose root is NODE. */
1258 static reg_errcode_t
1259 analyze_tree (dfa
, node
)
1264 if (node
->first
== -1)
1265 calc_first (dfa
, node
);
1266 if (node
->next
== -1)
1267 calc_next (dfa
, node
);
1268 calc_epsdest (dfa
, node
);
1270 /* Calculate "first" etc. for the left child. */
1271 if (node
->left
!= NULL
)
1273 ret
= analyze_tree (dfa
, node
->left
);
1274 if (BE (ret
!= REG_NOERROR
, 0))
1277 /* Calculate "first" etc. for the right child. */
1278 if (node
->right
!= NULL
)
1280 ret
= analyze_tree (dfa
, node
->right
);
1281 if (BE (ret
!= REG_NOERROR
, 0))
1287 /* Calculate "first" for the node NODE. */
1289 calc_first (dfa
, node
)
1294 idx
= node
->node_idx
;
1295 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1300 case OP_OPEN_BRACKET
:
1301 case OP_CLOSE_BRACKET
:
1302 case OP_OPEN_DUP_NUM
:
1303 case OP_CLOSE_DUP_NUM
:
1305 case OP_NON_MATCH_LIST
:
1306 case OP_OPEN_COLL_ELEM
:
1307 case OP_CLOSE_COLL_ELEM
:
1308 case OP_OPEN_EQUIV_CLASS
:
1309 case OP_CLOSE_EQUIV_CLASS
:
1310 case OP_OPEN_CHAR_CLASS
:
1311 case OP_CLOSE_CHAR_CLASS
:
1312 /* These must not appear here. */
1318 case OP_DUP_ASTERISK
:
1319 case OP_DUP_QUESTION
:
1320 #ifdef RE_ENABLE_I18N
1321 case OP_UTF8_PERIOD
:
1322 case COMPLEX_BRACKET
:
1323 #endif /* RE_ENABLE_I18N */
1324 case SIMPLE_BRACKET
:
1327 case OP_OPEN_SUBEXP
:
1328 case OP_CLOSE_SUBEXP
:
1334 /* else fall through */
1337 assert (node
->left
!= NULL
);
1339 if (node
->left
->first
== -1)
1340 calc_first (dfa
, node
->left
);
1341 node
->first
= node
->left
->first
;
1346 /* Calculate "next" for the node NODE. */
1349 calc_next (dfa
, node
)
1354 bin_tree_t
*parent
= node
->parent
;
1358 idx
= node
->node_idx
;
1359 if (node
->type
== 0)
1360 dfa
->nexts
[idx
] = node
->next
;
1364 idx
= parent
->node_idx
;
1365 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1369 case OP_DUP_ASTERISK
:
1373 if (parent
->left
== node
)
1375 if (parent
->right
->first
== -1)
1376 calc_first (dfa
, parent
->right
);
1377 node
->next
= parent
->right
->first
;
1380 /* else fall through */
1382 if (parent
->next
== -1)
1383 calc_next (dfa
, parent
);
1384 node
->next
= parent
->next
;
1387 idx
= node
->node_idx
;
1388 if (node
->type
== 0)
1389 dfa
->nexts
[idx
] = node
->next
;
1392 /* Calculate "edest" for the node NODE. */
1395 calc_epsdest (dfa
, node
)
1400 idx
= node
->node_idx
;
1401 if (node
->type
== 0)
1403 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1404 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1406 if (node
->left
->first
== -1)
1407 calc_first (dfa
, node
->left
);
1408 if (node
->next
== -1)
1409 calc_next (dfa
, node
);
1410 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1413 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1416 if (node
->left
!= NULL
)
1418 if (node
->left
->first
== -1)
1419 calc_first (dfa
, node
->left
);
1420 left
= node
->left
->first
;
1424 if (node
->next
== -1)
1425 calc_next (dfa
, node
);
1428 if (node
->right
!= NULL
)
1430 if (node
->right
->first
== -1)
1431 calc_first (dfa
, node
->right
);
1432 right
= node
->right
->first
;
1436 if (node
->next
== -1)
1437 calc_next (dfa
, node
);
1440 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1442 else if (dfa
->nodes
[idx
].type
== ANCHOR
1443 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1444 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1445 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1446 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1448 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454 to their own constraint. */
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1460 int top_org_node
, top_clone_node
, root_node
;
1461 unsigned int init_constraint
;
1464 int org_node
, clone_node
, ret
;
1465 unsigned int constraint
= init_constraint
;
1466 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1468 int org_dest
, clone_dest
;
1469 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1471 /* If the back reference epsilon-transit, its destination must
1472 also have the constraint. Then duplicate the epsilon closure
1473 of the destination of the back reference, and store it in
1474 edests of the back reference. */
1475 org_dest
= dfa
->nexts
[org_node
];
1476 re_node_set_empty (dfa
->edests
+ clone_node
);
1477 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1478 if (BE (err
!= REG_NOERROR
, 0))
1480 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1481 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1482 if (BE (ret
< 0, 0))
1485 else if (dfa
->edests
[org_node
].nelem
== 0)
1487 /* In case of the node can't epsilon-transit, don't duplicate the
1488 destination and store the original destination as the
1489 destination of the node. */
1490 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1493 else if (dfa
->edests
[org_node
].nelem
== 1)
1495 /* In case of the node can epsilon-transit, and it has only one
1497 org_dest
= dfa
->edests
[org_node
].elems
[0];
1498 re_node_set_empty (dfa
->edests
+ clone_node
);
1499 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1501 /* In case of the node has another constraint, append it. */
1502 if (org_node
== root_node
&& clone_node
!= org_node
)
1504 /* ...but if the node is root_node itself, it means the
1505 epsilon closure have a loop, then tie it to the
1506 destination of the root_node. */
1507 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1509 if (BE (ret
< 0, 0))
1513 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1515 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1516 if (BE (err
!= REG_NOERROR
, 0))
1518 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1519 if (BE (ret
< 0, 0))
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. E.g. '|', '*', '+', '?'. */
1526 org_dest
= dfa
->edests
[org_node
].elems
[0];
1527 re_node_set_empty (dfa
->edests
+ clone_node
);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1530 if (clone_dest
== -1)
1532 /* There are no such a duplicated node, create a new one. */
1533 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1534 if (BE (err
!= REG_NOERROR
, 0))
1536 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1537 if (BE (ret
< 0, 0))
1539 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1540 root_node
, constraint
);
1541 if (BE (err
!= REG_NOERROR
, 0))
1546 /* There are a duplicated node which satisfy the constraint,
1547 use it to avoid infinite loop. */
1548 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1549 if (BE (ret
< 0, 0))
1553 org_dest
= dfa
->edests
[org_node
].elems
[1];
1554 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1555 if (BE (err
!= REG_NOERROR
, 0))
1557 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1558 if (BE (ret
< 0, 0))
1561 org_node
= org_dest
;
1562 clone_node
= clone_dest
;
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568 satisfies the constraint CONSTRAINT. */
1571 search_duplicated_node (dfa
, org_node
, constraint
)
1574 unsigned int constraint
;
1577 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1579 if (org_node
== dfa
->org_indices
[idx
]
1580 && constraint
== dfa
->nodes
[idx
].constraint
)
1581 return idx
; /* Found. */
1583 return -1; /* Not found. */
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588 otherwise return the error code. */
1590 static reg_errcode_t
1591 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1593 int *new_idx
, org_idx
;
1594 unsigned int constraint
;
1596 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
], 1);
1597 if (BE (dup_idx
== -1, 0))
1599 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1600 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1601 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1602 dfa
->nodes
[dup_idx
].duplicated
= 1;
1603 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1604 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1605 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1607 /* Store the index of the original node. */
1608 dfa
->org_indices
[dup_idx
] = org_idx
;
1614 calc_inveclosure (dfa
)
1618 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1620 if (dfa
->nodes
[src
].type
== OP_DELETED_SUBEXP
)
1622 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1624 dest
= dfa
->eclosures
[src
].elems
[idx
];
1625 re_node_set_insert_last (dfa
->inveclosures
+ dest
, src
);
1630 /* Calculate "eclosure" for all the node in DFA. */
1632 static reg_errcode_t
1636 int node_idx
, incomplete
;
1638 assert (dfa
->nodes_len
> 0);
1641 /* For each nodes, calculate epsilon closure. */
1642 for (node_idx
= 0; ; ++node_idx
)
1645 re_node_set eclosure_elem
;
1646 if (node_idx
== dfa
->nodes_len
)
1655 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1657 if (dfa
->nodes
[node_idx
].type
== OP_DELETED_SUBEXP
)
1660 /* If we have already calculated, skip it. */
1661 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1665 if (BE (err
!= REG_NOERROR
, 0))
1668 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1671 re_node_set_free (&eclosure_elem
);
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1681 re_node_set
*new_set
;
1686 unsigned int constraint
;
1688 re_node_set eclosure
;
1690 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1691 if (BE (err
!= REG_NOERROR
, 0))
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa
->eclosures
[node
].nelem
= -1;
1698 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1699 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1700 /* If the current node has constraints, duplicate all nodes.
1701 Since they must inherit the constraints. */
1703 && dfa
->edests
[node
].nelem
1704 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1706 int org_node
, cur_node
;
1707 org_node
= cur_node
= node
;
1708 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1709 if (BE (err
!= REG_NOERROR
, 0))
1713 /* Expand each epsilon destination nodes. */
1714 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1715 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1717 re_node_set eclosure_elem
;
1718 int edest
= dfa
->edests
[node
].elems
[i
];
1719 /* If calculating the epsilon closure of `edest' is in progress,
1720 return intermediate result. */
1721 if (dfa
->eclosures
[edest
].nelem
== -1)
1726 /* If we haven't calculated the epsilon closure of `edest' yet,
1727 calculate now. Otherwise use calculated epsilon closure. */
1728 if (dfa
->eclosures
[edest
].nelem
== 0)
1730 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1731 if (BE (err
!= REG_NOERROR
, 0))
1735 eclosure_elem
= dfa
->eclosures
[edest
];
1736 /* Merge the epsilon closure of `edest'. */
1737 re_node_set_merge (&eclosure
, &eclosure_elem
);
1738 /* If the epsilon closure of `edest' is incomplete,
1739 the epsilon closure of this node is also incomplete. */
1740 if (dfa
->eclosures
[edest
].nelem
== 0)
1743 re_node_set_free (&eclosure_elem
);
1747 /* Epsilon closures include itself. */
1748 re_node_set_insert (&eclosure
, node
);
1749 if (incomplete
&& !root
)
1750 dfa
->eclosures
[node
].nelem
= 0;
1752 dfa
->eclosures
[node
] = eclosure
;
1753 *new_set
= eclosure
;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1763 fetch_token (result
, input
, syntax
)
1766 reg_syntax_t syntax
;
1768 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1771 /* Peek a token from INPUT, and return the length of the token.
1772 We must not use this function inside bracket expressions. */
1775 peek_token (token
, input
, syntax
)
1778 reg_syntax_t syntax
;
1782 if (re_string_eoi (input
))
1784 token
->type
= END_OF_RE
;
1788 c
= re_string_peek_byte (input
, 0);
1791 token
->word_char
= 0;
1792 #ifdef RE_ENABLE_I18N
1793 token
->mb_partial
= 0;
1794 if (input
->mb_cur_max
> 1 &&
1795 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1797 token
->type
= CHARACTER
;
1798 token
->mb_partial
= 1;
1805 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1807 token
->type
= BACK_SLASH
;
1811 c2
= re_string_peek_byte_case (input
, 1);
1813 token
->type
= CHARACTER
;
1814 #ifdef RE_ENABLE_I18N
1815 if (input
->mb_cur_max
> 1)
1817 wint_t wc
= re_string_wchar_at (input
,
1818 re_string_cur_idx (input
) + 1);
1819 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1823 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1828 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1829 token
->type
= OP_ALT
;
1831 case '1': case '2': case '3': case '4': case '5':
1832 case '6': case '7': case '8': case '9':
1833 if (!(syntax
& RE_NO_BK_REFS
))
1835 token
->type
= OP_BACK_REF
;
1836 token
->opr
.idx
= c2
- '1';
1840 if (!(syntax
& RE_NO_GNU_OPS
))
1842 token
->type
= ANCHOR
;
1843 token
->opr
.ctx_type
= WORD_FIRST
;
1847 if (!(syntax
& RE_NO_GNU_OPS
))
1849 token
->type
= ANCHOR
;
1850 token
->opr
.ctx_type
= WORD_LAST
;
1854 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= ANCHOR
;
1857 token
->opr
.ctx_type
= WORD_DELIM
;
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= ANCHOR
;
1864 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1869 token
->type
= OP_WORD
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= OP_NOTWORD
;
1876 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= OP_SPACE
;
1880 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= OP_NOTSPACE
;
1884 if (!(syntax
& RE_NO_GNU_OPS
))
1886 token
->type
= ANCHOR
;
1887 token
->opr
.ctx_type
= BUF_FIRST
;
1891 if (!(syntax
& RE_NO_GNU_OPS
))
1893 token
->type
= ANCHOR
;
1894 token
->opr
.ctx_type
= BUF_LAST
;
1898 if (!(syntax
& RE_NO_BK_PARENS
))
1899 token
->type
= OP_OPEN_SUBEXP
;
1902 if (!(syntax
& RE_NO_BK_PARENS
))
1903 token
->type
= OP_CLOSE_SUBEXP
;
1906 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1907 token
->type
= OP_DUP_PLUS
;
1910 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1911 token
->type
= OP_DUP_QUESTION
;
1914 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1915 token
->type
= OP_OPEN_DUP_NUM
;
1918 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1919 token
->type
= OP_CLOSE_DUP_NUM
;
1927 token
->type
= CHARACTER
;
1928 #ifdef RE_ENABLE_I18N
1929 if (input
->mb_cur_max
> 1)
1931 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1932 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1936 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1941 if (syntax
& RE_NEWLINE_ALT
)
1942 token
->type
= OP_ALT
;
1945 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1946 token
->type
= OP_ALT
;
1949 token
->type
= OP_DUP_ASTERISK
;
1952 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1953 token
->type
= OP_DUP_PLUS
;
1956 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1957 token
->type
= OP_DUP_QUESTION
;
1960 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1961 token
->type
= OP_OPEN_DUP_NUM
;
1964 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1965 token
->type
= OP_CLOSE_DUP_NUM
;
1968 if (syntax
& RE_NO_BK_PARENS
)
1969 token
->type
= OP_OPEN_SUBEXP
;
1972 if (syntax
& RE_NO_BK_PARENS
)
1973 token
->type
= OP_CLOSE_SUBEXP
;
1976 token
->type
= OP_OPEN_BRACKET
;
1979 token
->type
= OP_PERIOD
;
1982 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1983 re_string_cur_idx (input
) != 0)
1985 char prev
= re_string_peek_byte (input
, -1);
1986 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1989 token
->type
= ANCHOR
;
1990 token
->opr
.ctx_type
= LINE_FIRST
;
1993 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1994 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1997 re_string_skip_bytes (input
, 1);
1998 peek_token (&next
, input
, syntax
);
1999 re_string_skip_bytes (input
, -1);
2000 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2003 token
->type
= ANCHOR
;
2004 token
->opr
.ctx_type
= LINE_LAST
;
2012 /* Peek a token from INPUT, and return the length of the token.
2013 We must not use this function out of bracket expressions. */
2016 peek_token_bracket (token
, input
, syntax
)
2019 reg_syntax_t syntax
;
2022 if (re_string_eoi (input
))
2024 token
->type
= END_OF_RE
;
2027 c
= re_string_peek_byte (input
, 0);
2030 #ifdef RE_ENABLE_I18N
2031 if (input
->mb_cur_max
> 1 &&
2032 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2034 token
->type
= CHARACTER
;
2037 #endif /* RE_ENABLE_I18N */
2039 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2040 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2042 /* In this case, '\' escape a character. */
2044 re_string_skip_bytes (input
, 1);
2045 c2
= re_string_peek_byte (input
, 0);
2047 token
->type
= CHARACTER
;
2050 if (c
== '[') /* '[' is a special char in a bracket exps. */
2054 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2055 c2
= re_string_peek_byte (input
, 1);
2063 token
->type
= OP_OPEN_COLL_ELEM
;
2066 token
->type
= OP_OPEN_EQUIV_CLASS
;
2069 if (syntax
& RE_CHAR_CLASSES
)
2071 token
->type
= OP_OPEN_CHAR_CLASS
;
2074 /* else fall through. */
2076 token
->type
= CHARACTER
;
2086 token
->type
= OP_CHARSET_RANGE
;
2089 token
->type
= OP_CLOSE_BRACKET
;
2092 token
->type
= OP_NON_MATCH_LIST
;
2095 token
->type
= CHARACTER
;
2100 /* Functions for parser. */
2102 /* Entry point of the parser.
2103 Parse the regular expression REGEXP and return the structure tree.
2104 If an error is occured, ERR is set by error code, and return NULL.
2105 This function build the following tree, from regular expression <reg_exp>:
2111 CAT means concatenation.
2112 EOR means end of regular expression. */
2115 parse (regexp
, preg
, syntax
, err
)
2116 re_string_t
*regexp
;
2118 reg_syntax_t syntax
;
2121 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2122 bin_tree_t
*tree
, *eor
, *root
;
2123 re_token_t current_token
;
2124 dfa
->syntax
= syntax
;
2125 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2126 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2127 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2129 eor
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, ¤t_token
);
2131 root
= create_tree (dfa
, tree
, eor
, CONCAT
, 0);
2134 if (BE (eor
== NULL
|| root
== NULL
, 0))
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2149 ALT means alternative, which represents the operator `|'. */
2152 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2153 re_string_t
*regexp
;
2156 reg_syntax_t syntax
;
2160 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2161 bin_tree_t
*tree
, *branch
= NULL
;
2162 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2163 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2166 while (token
->type
== OP_ALT
)
2168 re_token_t alt_token
= *token
;
2169 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2170 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2171 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2173 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2174 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2179 tree
= re_dfa_add_tree_node (dfa
, tree
, branch
, &alt_token
);
2180 if (BE (tree
== NULL
, 0))
2185 dfa
->has_plural_match
= 1;
2190 /* This function build the following tree, from regular expression
2197 CAT means concatenation. */
2200 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2201 re_string_t
*regexp
;
2204 reg_syntax_t syntax
;
2208 bin_tree_t
*tree
, *exp
;
2209 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2210 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2211 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2214 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2215 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2217 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2218 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2222 if (tree
!= NULL
&& exp
!= NULL
)
2224 tree
= create_tree (dfa
, tree
, exp
, CONCAT
, 0);
2231 else if (tree
== NULL
)
2233 /* Otherwise exp == NULL, we don't need to create new tree. */
2238 /* This function build the following tree, from regular expression a*:
2245 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2246 re_string_t
*regexp
;
2249 reg_syntax_t syntax
;
2253 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2255 switch (token
->type
)
2258 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2259 if (BE (tree
== NULL
, 0))
2264 #ifdef RE_ENABLE_I18N
2265 if (dfa
->mb_cur_max
> 1)
2267 while (!re_string_eoi (regexp
)
2268 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2270 bin_tree_t
*mbc_remain
;
2271 fetch_token (token
, regexp
, syntax
);
2272 mbc_remain
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2273 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
, 0);
2274 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2283 case OP_OPEN_SUBEXP
:
2284 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2285 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2288 case OP_OPEN_BRACKET
:
2289 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2290 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2294 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2299 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2300 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2301 if (BE (tree
== NULL
, 0))
2307 dfa
->has_mb_node
= 1;
2309 case OP_OPEN_DUP_NUM
:
2310 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2316 case OP_DUP_ASTERISK
:
2318 case OP_DUP_QUESTION
:
2319 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2324 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2326 fetch_token (token
, regexp
, syntax
);
2327 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2329 /* else fall through */
2330 case OP_CLOSE_SUBEXP
:
2331 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2332 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2337 /* else fall through */
2338 case OP_CLOSE_DUP_NUM
:
2339 /* We treat it as a normal character. */
2341 /* Then we can these characters as normal characters. */
2342 token
->type
= CHARACTER
;
2343 /* mb_partial and word_char bits should be initialized already
2345 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2346 if (BE (tree
== NULL
, 0))
2353 if ((token
->opr
.ctx_type
2354 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2355 && dfa
->word_ops_used
== 0)
2356 init_word_char (dfa
);
2357 if (token
->opr
.ctx_type
== WORD_DELIM
2358 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2360 bin_tree_t
*tree_first
, *tree_last
;
2361 if (token
->opr
.ctx_type
== WORD_DELIM
)
2363 token
->opr
.ctx_type
= WORD_FIRST
;
2364 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2365 token
->opr
.ctx_type
= WORD_LAST
;
2369 token
->opr
.ctx_type
= INSIDE_WORD
;
2370 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2371 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2373 tree_last
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2374 token
->type
= OP_ALT
;
2375 tree
= re_dfa_add_tree_node (dfa
, tree_first
, tree_last
, token
);
2376 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2384 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2385 if (BE (tree
== NULL
, 0))
2391 /* We must return here, since ANCHORs can't be followed
2392 by repetition operators.
2393 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2395 fetch_token (token
, regexp
, syntax
);
2398 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2399 if (BE (tree
== NULL
, 0))
2404 if (dfa
->mb_cur_max
> 1)
2405 dfa
->has_mb_node
= 1;
2409 tree
= build_charclass_op (dfa
, regexp
->trans
,
2410 (const unsigned char *) "alnum",
2411 (const unsigned char *) "_",
2412 token
->type
== OP_NOTWORD
, err
);
2413 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2418 tree
= build_charclass_op (dfa
, regexp
->trans
,
2419 (const unsigned char *) "space",
2420 (const unsigned char *) "",
2421 token
->type
== OP_NOTSPACE
, err
);
2422 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2432 /* Must not happen? */
2438 fetch_token (token
, regexp
, syntax
);
2440 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2441 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2443 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2444 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2446 /* In BRE consecutive duplications are not allowed. */
2447 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2448 && (token
->type
== OP_DUP_ASTERISK
2449 || token
->type
== OP_OPEN_DUP_NUM
))
2454 dfa
->has_plural_match
= 1;
2460 /* This function build the following tree, from regular expression
2468 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2469 re_string_t
*regexp
;
2472 reg_syntax_t syntax
;
2476 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2477 bin_tree_t
*tree
, *left_par
, *right_par
;
2479 cur_nsub
= preg
->re_nsub
++;
2481 left_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2482 if (BE (left_par
== NULL
, 0))
2487 dfa
->nodes
[left_par
->node_idx
].opr
.idx
= cur_nsub
;
2488 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2490 /* The subexpression may be a null string. */
2491 if (token
->type
== OP_CLOSE_SUBEXP
)
2495 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2496 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2499 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2504 right_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2505 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2506 tree
= ((tree
== NULL
) ? right_par
2507 : create_tree (dfa
, tree
, right_par
, CONCAT
, 0));
2508 tree
= create_tree (dfa
, left_par
, tree
, CONCAT
, 0);
2509 if (BE (right_par
== NULL
|| tree
== NULL
, 0))
2514 dfa
->nodes
[right_par
->node_idx
].opr
.idx
= cur_nsub
;
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2524 re_string_t
*regexp
;
2527 reg_syntax_t syntax
;
2530 re_token_t dup_token
;
2531 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2532 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2533 re_token_t start_token
= *token
;
2535 if (token
->type
== OP_OPEN_DUP_NUM
)
2538 start
= fetch_number (regexp
, token
, syntax
);
2541 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2542 start
= 0; /* We treat "{,m}" as "{0,m}". */
2545 *err
= REG_BADBR
; /* <re>{} is invalid. */
2549 if (BE (start
!= -2, 1))
2551 /* We treat "{n}" as "{n,n}". */
2552 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2553 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2554 ? fetch_number (regexp
, token
, syntax
) : -2));
2556 if (BE (start
== -2 || end
== -2, 0))
2558 /* Invalid sequence. */
2559 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2561 if (token
->type
== END_OF_RE
)
2569 /* If the syntax bit is set, rollback. */
2570 re_string_set_index (regexp
, start_idx
);
2571 *token
= start_token
;
2572 token
->type
= CHARACTER
;
2573 /* mb_partial and word_char bits should be already initialized by
2578 if (BE (end
!= -1 && start
> end
, 0))
2580 /* First number greater than second. */
2587 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2588 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2591 fetch_token (token
, regexp
, syntax
);
2593 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2594 if (BE (elem
== NULL
|| (start
== 0 && end
== 0), 0))
2597 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2598 if (BE (start
> 0, 0))
2601 for (i
= 2; i
<= start
; ++i
)
2603 elem
= duplicate_tree (elem
, dfa
);
2604 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2605 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2606 goto parse_dup_op_espace
;
2612 /* Duplicate ELEM before it is marked optional. */
2613 elem
= duplicate_tree (elem
, dfa
);
2619 mark_opt_subexp (elem
, dfa
);
2620 dup_token
.type
= (end
== -1 ? OP_DUP_ASTERISK
: OP_DUP_QUESTION
);
2621 tree
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2622 if (BE (tree
== NULL
, 0))
2623 goto parse_dup_op_espace
;
2625 /* This loop is actually executed only when end != -1,
2626 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2627 already created the start+1-th copy. */
2628 for (i
= start
+ 2; i
<= end
; ++i
)
2630 elem
= duplicate_tree (elem
, dfa
);
2631 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2632 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2633 goto parse_dup_op_espace
;
2635 tree
= re_dfa_add_tree_node (dfa
, tree
, NULL
, &dup_token
);
2636 if (BE (tree
== NULL
, 0))
2637 goto parse_dup_op_espace
;
2641 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
, 0);
2645 parse_dup_op_espace
:
2650 /* Size of the names for collating symbol/equivalence_class/character_class.
2651 I'm not sure, but maybe enough. */
2652 #define BRACKET_NAME_BUF_SIZE 32
2655 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2656 Build the range expression which starts from START_ELEM, and ends
2657 at END_ELEM. The result are written to MBCSET and SBCSET.
2658 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2659 mbcset->range_ends, is a pointer argument sinse we may
2662 static reg_errcode_t
2663 # ifdef RE_ENABLE_I18N
2664 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2665 re_charset_t
*mbcset
;
2667 # else /* not RE_ENABLE_I18N */
2668 build_range_exp (sbcset
, start_elem
, end_elem
)
2669 # endif /* not RE_ENABLE_I18N */
2670 re_bitset_ptr_t sbcset
;
2671 bracket_elem_t
*start_elem
, *end_elem
;
2673 unsigned int start_ch
, end_ch
;
2674 /* Equivalence Classes and Character Classes can't be a range start/end. */
2675 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2676 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2680 /* We can handle no multi character collating elements without libc
2682 if (BE ((start_elem
->type
== COLL_SYM
2683 && strlen ((char *) start_elem
->opr
.name
) > 1)
2684 || (end_elem
->type
== COLL_SYM
2685 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2686 return REG_ECOLLATE
;
2688 # ifdef RE_ENABLE_I18N
2690 wchar_t wc
, start_wc
, end_wc
;
2691 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2693 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2694 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2696 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2697 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2699 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2700 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2701 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2702 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2703 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2704 return REG_ECOLLATE
;
2705 cmp_buf
[0] = start_wc
;
2706 cmp_buf
[4] = end_wc
;
2707 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2710 /* Got valid collation sequence values, add them as a new entry.
2711 However, for !_LIBC we have no collation elements: if the
2712 character set is single byte, the single byte character set
2713 that we build below suffices. parse_bracket_exp passes
2714 no MBCSET if dfa->mb_cur_max == 1. */
2717 /* Check the space of the arrays. */
2718 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2720 /* There is not enough space, need realloc. */
2721 wchar_t *new_array_start
, *new_array_end
;
2724 /* +1 in case of mbcset->nranges is 0. */
2725 new_nranges
= 2 * mbcset
->nranges
+ 1;
2726 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2727 are NULL if *range_alloc == 0. */
2728 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2730 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2733 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2736 mbcset
->range_starts
= new_array_start
;
2737 mbcset
->range_ends
= new_array_end
;
2738 *range_alloc
= new_nranges
;
2741 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2742 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2745 /* Build the table for single byte characters. */
2746 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2749 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2750 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2751 bitset_set (sbcset
, wc
);
2754 # else /* not RE_ENABLE_I18N */
2757 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2758 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2760 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2761 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2763 if (start_ch
> end_ch
)
2765 /* Build the table for single byte characters. */
2766 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2767 if (start_ch
<= ch
&& ch
<= end_ch
)
2768 bitset_set (sbcset
, ch
);
2770 # endif /* not RE_ENABLE_I18N */
2773 #endif /* not _LIBC */
2776 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2777 Build the collating element which is represented by NAME.
2778 The result are written to MBCSET and SBCSET.
2779 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2780 pointer argument since we may update it. */
2782 static reg_errcode_t
2783 # ifdef RE_ENABLE_I18N
2784 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2785 re_charset_t
*mbcset
;
2786 int *coll_sym_alloc
;
2787 # else /* not RE_ENABLE_I18N */
2788 build_collating_symbol (sbcset
, name
)
2789 # endif /* not RE_ENABLE_I18N */
2790 re_bitset_ptr_t sbcset
;
2791 const unsigned char *name
;
2793 size_t name_len
= strlen ((const char *) name
);
2794 if (BE (name_len
!= 1, 0))
2795 return REG_ECOLLATE
;
2798 bitset_set (sbcset
, name
[0]);
2802 #endif /* not _LIBC */
2804 /* This function parse bracket expression like "[abc]", "[a-c]",
2808 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2809 re_string_t
*regexp
;
2812 reg_syntax_t syntax
;
2816 const unsigned char *collseqmb
;
2817 const char *collseqwc
;
2820 const int32_t *symb_table
;
2821 const unsigned char *extra
;
2823 /* Local function for parse_bracket_exp used in _LIBC environement.
2824 Seek the collating symbol entry correspondings to NAME.
2825 Return the index of the symbol in the SYMB_TABLE. */
2828 __attribute ((always_inline
))
2829 seek_collating_symbol_entry (name
, name_len
)
2830 const unsigned char *name
;
2833 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2834 int32_t elem
= hash
% table_size
;
2835 int32_t second
= hash
% (table_size
- 2);
2836 while (symb_table
[2 * elem
] != 0)
2838 /* First compare the hashing value. */
2839 if (symb_table
[2 * elem
] == hash
2840 /* Compare the length of the name. */
2841 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2842 /* Compare the name. */
2843 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2846 /* Yep, this is the entry. */
2856 /* Local function for parse_bracket_exp used in _LIBC environement.
2857 Look up the collation sequence value of BR_ELEM.
2858 Return the value if succeeded, UINT_MAX otherwise. */
2860 auto inline unsigned int
2861 __attribute ((always_inline
))
2862 lookup_collation_sequence_value (br_elem
)
2863 bracket_elem_t
*br_elem
;
2865 if (br_elem
->type
== SB_CHAR
)
2868 if (MB_CUR_MAX == 1)
2871 return collseqmb
[br_elem
->opr
.ch
];
2874 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2875 return __collseq_table_lookup (collseqwc
, wc
);
2878 else if (br_elem
->type
== MB_CHAR
)
2880 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2882 else if (br_elem
->type
== COLL_SYM
)
2884 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2888 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2890 if (symb_table
[2 * elem
] != 0)
2892 /* We found the entry. */
2893 idx
= symb_table
[2 * elem
+ 1];
2894 /* Skip the name of collating element name. */
2895 idx
+= 1 + extra
[idx
];
2896 /* Skip the byte sequence of the collating element. */
2897 idx
+= 1 + extra
[idx
];
2898 /* Adjust for the alignment. */
2899 idx
= (idx
+ 3) & ~3;
2900 /* Skip the multibyte collation sequence value. */
2901 idx
+= sizeof (unsigned int);
2902 /* Skip the wide char sequence of the collating element. */
2903 idx
+= sizeof (unsigned int) *
2904 (1 + *(unsigned int *) (extra
+ idx
));
2905 /* Return the collation sequence value. */
2906 return *(unsigned int *) (extra
+ idx
);
2908 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2910 /* No valid character. Match it as a single byte
2912 return collseqmb
[br_elem
->opr
.name
[0]];
2915 else if (sym_name_len
== 1)
2916 return collseqmb
[br_elem
->opr
.name
[0]];
2921 /* Local function for parse_bracket_exp used in _LIBC environement.
2922 Build the range expression which starts from START_ELEM, and ends
2923 at END_ELEM. The result are written to MBCSET and SBCSET.
2924 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2925 mbcset->range_ends, is a pointer argument sinse we may
2928 auto inline reg_errcode_t
2929 __attribute ((always_inline
))
2930 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2931 re_charset_t
*mbcset
;
2933 re_bitset_ptr_t sbcset
;
2934 bracket_elem_t
*start_elem
, *end_elem
;
2937 uint32_t start_collseq
;
2938 uint32_t end_collseq
;
2940 /* Equivalence Classes and Character Classes can't be a range
2942 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2943 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2947 start_collseq
= lookup_collation_sequence_value (start_elem
);
2948 end_collseq
= lookup_collation_sequence_value (end_elem
);
2949 /* Check start/end collation sequence values. */
2950 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2951 return REG_ECOLLATE
;
2952 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2955 /* Got valid collation sequence values, add them as a new entry.
2956 However, if we have no collation elements, and the character set
2957 is single byte, the single byte character set that we
2958 build below suffices. */
2959 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2961 /* Check the space of the arrays. */
2962 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2964 /* There is not enough space, need realloc. */
2965 uint32_t *new_array_start
;
2966 uint32_t *new_array_end
;
2969 /* +1 in case of mbcset->nranges is 0. */
2970 new_nranges
= 2 * mbcset
->nranges
+ 1;
2971 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2973 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2976 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2979 mbcset
->range_starts
= new_array_start
;
2980 mbcset
->range_ends
= new_array_end
;
2981 *range_alloc
= new_nranges
;
2984 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2985 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2988 /* Build the table for single byte characters. */
2989 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2991 uint32_t ch_collseq
;
2993 if (MB_CUR_MAX == 1)
2996 ch_collseq
= collseqmb
[ch
];
2998 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2999 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3000 bitset_set (sbcset
, ch
);
3005 /* Local function for parse_bracket_exp used in _LIBC environement.
3006 Build the collating element which is represented by NAME.
3007 The result are written to MBCSET and SBCSET.
3008 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3009 pointer argument sinse we may update it. */
3011 auto inline reg_errcode_t
3012 __attribute ((always_inline
))
3013 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3014 re_charset_t
*mbcset
;
3015 int *coll_sym_alloc
;
3016 re_bitset_ptr_t sbcset
;
3017 const unsigned char *name
;
3020 size_t name_len
= strlen ((const char *) name
);
3023 elem
= seek_collating_symbol_entry (name
, name_len
);
3024 if (symb_table
[2 * elem
] != 0)
3026 /* We found the entry. */
3027 idx
= symb_table
[2 * elem
+ 1];
3028 /* Skip the name of collating element name. */
3029 idx
+= 1 + extra
[idx
];
3031 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3033 /* No valid character, treat it as a normal
3035 bitset_set (sbcset
, name
[0]);
3039 return REG_ECOLLATE
;
3041 /* Got valid collation sequence, add it as a new entry. */
3042 /* Check the space of the arrays. */
3043 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3045 /* Not enough, realloc it. */
3046 /* +1 in case of mbcset->ncoll_syms is 0. */
3047 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3048 /* Use realloc since mbcset->coll_syms is NULL
3050 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3051 new_coll_sym_alloc
);
3052 if (BE (new_coll_syms
== NULL
, 0))
3054 mbcset
->coll_syms
= new_coll_syms
;
3055 *coll_sym_alloc
= new_coll_sym_alloc
;
3057 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3062 if (BE (name_len
!= 1, 0))
3063 return REG_ECOLLATE
;
3066 bitset_set (sbcset
, name
[0]);
3073 re_token_t br_token
;
3074 re_bitset_ptr_t sbcset
;
3075 #ifdef RE_ENABLE_I18N
3076 re_charset_t
*mbcset
;
3077 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3078 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3079 #endif /* not RE_ENABLE_I18N */
3081 bin_tree_t
*work_tree
;
3083 int first_round
= 1;
3085 collseqmb
= (const unsigned char *)
3086 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3087 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3093 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3094 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3095 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3096 _NL_COLLATE_SYMB_TABLEMB
);
3097 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3098 _NL_COLLATE_SYMB_EXTRAMB
);
3101 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3102 #ifdef RE_ENABLE_I18N
3103 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3104 #endif /* RE_ENABLE_I18N */
3105 #ifdef RE_ENABLE_I18N
3106 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3108 if (BE (sbcset
== NULL
, 0))
3109 #endif /* RE_ENABLE_I18N */
3115 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3116 if (BE (token
->type
== END_OF_RE
, 0))
3119 goto parse_bracket_exp_free_return
;
3121 if (token
->type
== OP_NON_MATCH_LIST
)
3123 #ifdef RE_ENABLE_I18N
3124 mbcset
->non_match
= 1;
3125 #endif /* not RE_ENABLE_I18N */
3127 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3128 bitset_set (sbcset
, '\0');
3129 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3130 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3131 if (BE (token
->type
== END_OF_RE
, 0))
3134 goto parse_bracket_exp_free_return
;
3138 /* We treat the first ']' as a normal character. */
3139 if (token
->type
== OP_CLOSE_BRACKET
)
3140 token
->type
= CHARACTER
;
3144 bracket_elem_t start_elem
, end_elem
;
3145 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3146 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3148 int token_len2
= 0, is_range_exp
= 0;
3151 start_elem
.opr
.name
= start_name_buf
;
3152 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3153 syntax
, first_round
);
3154 if (BE (ret
!= REG_NOERROR
, 0))
3157 goto parse_bracket_exp_free_return
;
3161 /* Get information about the next token. We need it in any case. */
3162 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3164 /* Do not check for ranges if we know they are not allowed. */
3165 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3167 if (BE (token
->type
== END_OF_RE
, 0))
3170 goto parse_bracket_exp_free_return
;
3172 if (token
->type
== OP_CHARSET_RANGE
)
3174 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3175 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3176 if (BE (token2
.type
== END_OF_RE
, 0))
3179 goto parse_bracket_exp_free_return
;
3181 if (token2
.type
== OP_CLOSE_BRACKET
)
3183 /* We treat the last '-' as a normal character. */
3184 re_string_skip_bytes (regexp
, -token_len
);
3185 token
->type
= CHARACTER
;
3192 if (is_range_exp
== 1)
3194 end_elem
.opr
.name
= end_name_buf
;
3195 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3197 if (BE (ret
!= REG_NOERROR
, 0))
3200 goto parse_bracket_exp_free_return
;
3203 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3206 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3207 &start_elem
, &end_elem
);
3209 # ifdef RE_ENABLE_I18N
3210 *err
= build_range_exp (sbcset
,
3211 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3212 &range_alloc
, &start_elem
, &end_elem
);
3214 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3216 #endif /* RE_ENABLE_I18N */
3217 if (BE (*err
!= REG_NOERROR
, 0))
3218 goto parse_bracket_exp_free_return
;
3222 switch (start_elem
.type
)
3225 bitset_set (sbcset
, start_elem
.opr
.ch
);
3227 #ifdef RE_ENABLE_I18N
3229 /* Check whether the array has enough space. */
3230 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3232 wchar_t *new_mbchars
;
3233 /* Not enough, realloc it. */
3234 /* +1 in case of mbcset->nmbchars is 0. */
3235 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3236 /* Use realloc since array is NULL if *alloc == 0. */
3237 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3239 if (BE (new_mbchars
== NULL
, 0))
3240 goto parse_bracket_exp_espace
;
3241 mbcset
->mbchars
= new_mbchars
;
3243 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3245 #endif /* RE_ENABLE_I18N */
3247 *err
= build_equiv_class (sbcset
,
3248 #ifdef RE_ENABLE_I18N
3249 mbcset
, &equiv_class_alloc
,
3250 #endif /* RE_ENABLE_I18N */
3251 start_elem
.opr
.name
);
3252 if (BE (*err
!= REG_NOERROR
, 0))
3253 goto parse_bracket_exp_free_return
;
3256 *err
= build_collating_symbol (sbcset
,
3257 #ifdef RE_ENABLE_I18N
3258 mbcset
, &coll_sym_alloc
,
3259 #endif /* RE_ENABLE_I18N */
3260 start_elem
.opr
.name
);
3261 if (BE (*err
!= REG_NOERROR
, 0))
3262 goto parse_bracket_exp_free_return
;
3265 *err
= build_charclass (regexp
->trans
, sbcset
,
3266 #ifdef RE_ENABLE_I18N
3267 mbcset
, &char_class_alloc
,
3268 #endif /* RE_ENABLE_I18N */
3269 start_elem
.opr
.name
, syntax
);
3270 if (BE (*err
!= REG_NOERROR
, 0))
3271 goto parse_bracket_exp_free_return
;
3278 if (BE (token
->type
== END_OF_RE
, 0))
3281 goto parse_bracket_exp_free_return
;
3283 if (token
->type
== OP_CLOSE_BRACKET
)
3287 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3289 /* If it is non-matching list. */
3291 bitset_not (sbcset
);
3293 #ifdef RE_ENABLE_I18N
3294 /* Ensure only single byte characters are set. */
3295 if (dfa
->mb_cur_max
> 1)
3296 bitset_mask (sbcset
, dfa
->sb_char
);
3297 #endif /* RE_ENABLE_I18N */
3299 /* Build a tree for simple bracket. */
3300 br_token
.type
= SIMPLE_BRACKET
;
3301 br_token
.opr
.sbcset
= sbcset
;
3302 work_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3303 if (BE (work_tree
== NULL
, 0))
3304 goto parse_bracket_exp_espace
;
3306 #ifdef RE_ENABLE_I18N
3307 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3308 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3309 || mbcset
->non_match
)))
3311 re_token_t alt_token
;
3312 bin_tree_t
*mbc_tree
;
3314 /* Build a tree for complex bracket. */
3315 dfa
->has_mb_node
= 1;
3316 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3317 if (sbcset
[sbc_idx
])
3319 /* If there are no bits set in sbcset, there is no point
3320 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3321 if (sbc_idx
== BITSET_UINTS
)
3324 dfa
->nodes
[work_tree
->node_idx
].type
= COMPLEX_BRACKET
;
3325 dfa
->nodes
[work_tree
->node_idx
].opr
.mbcset
= mbcset
;
3328 br_token
.type
= COMPLEX_BRACKET
;
3329 br_token
.opr
.mbcset
= mbcset
;
3330 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3331 if (BE (mbc_tree
== NULL
, 0))
3332 goto parse_bracket_exp_espace
;
3333 /* Then join them by ALT node. */
3334 alt_token
.type
= OP_ALT
;
3335 dfa
->has_plural_match
= 1;
3336 work_tree
= re_dfa_add_tree_node (dfa
, work_tree
, mbc_tree
, &alt_token
);
3337 if (BE (mbc_tree
!= NULL
, 1))
3342 free_charset (mbcset
);
3345 #else /* not RE_ENABLE_I18N */
3347 #endif /* not RE_ENABLE_I18N */
3349 parse_bracket_exp_espace
:
3351 parse_bracket_exp_free_return
:
3353 #ifdef RE_ENABLE_I18N
3354 free_charset (mbcset
);
3355 #endif /* RE_ENABLE_I18N */
3359 /* Parse an element in the bracket expression. */
3361 static reg_errcode_t
3362 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3364 bracket_elem_t
*elem
;
3365 re_string_t
*regexp
;
3369 reg_syntax_t syntax
;
3372 #ifdef RE_ENABLE_I18N
3374 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3375 if (cur_char_size
> 1)
3377 elem
->type
= MB_CHAR
;
3378 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3379 re_string_skip_bytes (regexp
, cur_char_size
);
3382 #endif /* RE_ENABLE_I18N */
3383 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3384 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3385 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3386 return parse_bracket_symbol (elem
, regexp
, token
);
3387 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3389 /* A '-' must only appear as anything but a range indicator before
3390 the closing bracket. Everything else is an error. */
3392 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3393 if (token2
.type
!= OP_CLOSE_BRACKET
)
3394 /* The actual error value is not standardized since this whole
3395 case is undefined. But ERANGE makes good sense. */
3398 elem
->type
= SB_CHAR
;
3399 elem
->opr
.ch
= token
->opr
.c
;
3403 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3404 such as [:<character_class>:], [.<collating_element>.], and
3405 [=<equivalent_class>=]. */
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem
, regexp
, token
)
3409 bracket_elem_t
*elem
;
3410 re_string_t
*regexp
;
3413 unsigned char ch
, delim
= token
->opr
.c
;
3415 if (re_string_eoi(regexp
))
3419 if (i
>= BRACKET_NAME_BUF_SIZE
)
3421 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3422 ch
= re_string_fetch_byte_case (regexp
);
3424 ch
= re_string_fetch_byte (regexp
);
3425 if (re_string_eoi(regexp
))
3427 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3429 elem
->opr
.name
[i
] = ch
;
3431 re_string_skip_bytes (regexp
, 1);
3432 elem
->opr
.name
[i
] = '\0';
3433 switch (token
->type
)
3435 case OP_OPEN_COLL_ELEM
:
3436 elem
->type
= COLL_SYM
;
3438 case OP_OPEN_EQUIV_CLASS
:
3439 elem
->type
= EQUIV_CLASS
;
3441 case OP_OPEN_CHAR_CLASS
:
3442 elem
->type
= CHAR_CLASS
;
3450 /* Helper function for parse_bracket_exp.
3451 Build the equivalence class which is represented by NAME.
3452 The result are written to MBCSET and SBCSET.
3453 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454 is a pointer argument sinse we may update it. */
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3459 re_charset_t
*mbcset
;
3460 int *equiv_class_alloc
;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset
, name
)
3463 #endif /* not RE_ENABLE_I18N */
3464 re_bitset_ptr_t sbcset
;
3465 const unsigned char *name
;
3468 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3471 const int32_t *table
, *indirect
;
3472 const unsigned char *weights
, *extra
, *cp
;
3473 unsigned char char_buf
[2];
3477 /* This #include defines a local function! */
3478 # include <locale/weight.h>
3479 /* Calculate the index for equivalence class. */
3481 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3482 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3483 _NL_COLLATE_WEIGHTMB
);
3484 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3485 _NL_COLLATE_EXTRAMB
);
3486 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3487 _NL_COLLATE_INDIRECTMB
);
3488 idx1
= findidx (&cp
);
3489 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3490 /* This isn't a valid character. */
3491 return REG_ECOLLATE
;
3493 /* Build single byte matcing table for this equivalence class. */
3494 char_buf
[1] = (unsigned char) '\0';
3495 len
= weights
[idx1
];
3496 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3500 idx2
= findidx (&cp
);
3505 /* This isn't a valid character. */
3507 if (len
== weights
[idx2
])
3510 while (cnt
<= len
&&
3511 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3515 bitset_set (sbcset
, ch
);
3518 /* Check whether the array has enough space. */
3519 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nequiv_classes is 0. */
3523 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3524 /* Use realloc since the array is NULL if *alloc == 0. */
3525 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3527 new_equiv_class_alloc
);
3528 if (BE (new_equiv_classes
== NULL
, 0))
3530 mbcset
->equiv_classes
= new_equiv_classes
;
3531 *equiv_class_alloc
= new_equiv_class_alloc
;
3533 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3538 if (BE (strlen ((const char *) name
) != 1, 0))
3539 return REG_ECOLLATE
;
3540 bitset_set (sbcset
, *name
);
3545 /* Helper function for parse_bracket_exp.
3546 Build the character class which is represented by NAME.
3547 The result are written to MBCSET and SBCSET.
3548 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549 is a pointer argument sinse we may update it. */
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3554 re_charset_t
*mbcset
;
3555 int *char_class_alloc
;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans
, sbcset
, class_name
, syntax
)
3558 #endif /* not RE_ENABLE_I18N */
3559 unsigned RE_TRANSLATE_TYPE trans
;
3560 re_bitset_ptr_t sbcset
;
3561 const unsigned char *class_name
;
3562 reg_syntax_t syntax
;
3565 const char *name
= (const char *) class_name
;
3567 /* In case of REG_ICASE "upper" and "lower" match the both of
3568 upper and lower cases. */
3569 if ((syntax
& RE_ICASE
)
3570 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3573 #ifdef RE_ENABLE_I18N
3574 /* Check the space of the arrays. */
3575 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3577 /* Not enough, realloc it. */
3578 /* +1 in case of mbcset->nchar_classes is 0. */
3579 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3580 /* Use realloc since array is NULL if *alloc == 0. */
3581 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3582 new_char_class_alloc
);
3583 if (BE (new_char_classes
== NULL
, 0))
3585 mbcset
->char_classes
= new_char_classes
;
3586 *char_class_alloc
= new_char_class_alloc
;
3588 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3589 #endif /* RE_ENABLE_I18N */
3591 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3592 for (i = 0; i < SBC_MAX; ++i) \
3594 if (ctype_func (i)) \
3596 int ch = trans ? trans[i] : i; \
3597 bitset_set (sbcset, ch); \
3601 if (strcmp (name
, "alnum") == 0)
3602 BUILD_CHARCLASS_LOOP (isalnum
)
3603 else if (strcmp (name
, "cntrl") == 0)
3604 BUILD_CHARCLASS_LOOP (iscntrl
)
3605 else if (strcmp (name
, "lower") == 0)
3606 BUILD_CHARCLASS_LOOP (islower
)
3607 else if (strcmp (name
, "space") == 0)
3608 BUILD_CHARCLASS_LOOP (isspace
)
3609 else if (strcmp (name
, "alpha") == 0)
3610 BUILD_CHARCLASS_LOOP (isalpha
)
3611 else if (strcmp (name
, "digit") == 0)
3612 BUILD_CHARCLASS_LOOP (isdigit
)
3613 else if (strcmp (name
, "print") == 0)
3614 BUILD_CHARCLASS_LOOP (isprint
)
3615 else if (strcmp (name
, "upper") == 0)
3616 BUILD_CHARCLASS_LOOP (isupper
)
3617 else if (strcmp (name
, "blank") == 0)
3618 BUILD_CHARCLASS_LOOP (isblank
)
3619 else if (strcmp (name
, "graph") == 0)
3620 BUILD_CHARCLASS_LOOP (isgraph
)
3621 else if (strcmp (name
, "punct") == 0)
3622 BUILD_CHARCLASS_LOOP (ispunct
)
3623 else if (strcmp (name
, "xdigit") == 0)
3624 BUILD_CHARCLASS_LOOP (isxdigit
)
3632 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3634 unsigned RE_TRANSLATE_TYPE trans
;
3635 const unsigned char *class_name
;
3636 const unsigned char *extra
;
3640 re_bitset_ptr_t sbcset
;
3641 #ifdef RE_ENABLE_I18N
3642 re_charset_t
*mbcset
;
3644 #endif /* not RE_ENABLE_I18N */
3646 re_token_t br_token
;
3649 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3650 #ifdef RE_ENABLE_I18N
3651 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3652 #endif /* RE_ENABLE_I18N */
3654 #ifdef RE_ENABLE_I18N
3655 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3656 #else /* not RE_ENABLE_I18N */
3657 if (BE (sbcset
== NULL
, 0))
3658 #endif /* not RE_ENABLE_I18N */
3666 #ifdef RE_ENABLE_I18N
3668 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669 bitset_set(cset->sbcset, '\0');
3671 mbcset
->non_match
= 1;
3672 #endif /* not RE_ENABLE_I18N */
3675 /* We don't care the syntax in this case. */
3676 ret
= build_charclass (trans
, sbcset
,
3677 #ifdef RE_ENABLE_I18N
3679 #endif /* RE_ENABLE_I18N */
3682 if (BE (ret
!= REG_NOERROR
, 0))
3685 #ifdef RE_ENABLE_I18N
3686 free_charset (mbcset
);
3687 #endif /* RE_ENABLE_I18N */
3691 /* \w match '_' also. */
3692 for (; *extra
; extra
++)
3693 bitset_set (sbcset
, *extra
);
3695 /* If it is non-matching list. */
3697 bitset_not (sbcset
);
3699 #ifdef RE_ENABLE_I18N
3700 /* Ensure only single byte characters are set. */
3701 if (dfa
->mb_cur_max
> 1)
3702 bitset_mask (sbcset
, dfa
->sb_char
);
3705 /* Build a tree for simple bracket. */
3706 br_token
.type
= SIMPLE_BRACKET
;
3707 br_token
.opr
.sbcset
= sbcset
;
3708 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3709 if (BE (tree
== NULL
, 0))
3710 goto build_word_op_espace
;
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa
->mb_cur_max
> 1)
3715 re_token_t alt_token
;
3716 bin_tree_t
*mbc_tree
;
3717 /* Build a tree for complex bracket. */
3718 br_token
.type
= COMPLEX_BRACKET
;
3719 br_token
.opr
.mbcset
= mbcset
;
3720 dfa
->has_mb_node
= 1;
3721 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3722 if (BE (mbc_tree
== NULL
, 0))
3723 goto build_word_op_espace
;
3724 /* Then join them by ALT node. */
3725 alt_token
.type
= OP_ALT
;
3726 dfa
->has_plural_match
= 1;
3727 tree
= re_dfa_add_tree_node (dfa
, tree
, mbc_tree
, &alt_token
);
3728 if (BE (mbc_tree
!= NULL
, 1))
3733 free_charset (mbcset
);
3736 #else /* not RE_ENABLE_I18N */
3738 #endif /* not RE_ENABLE_I18N */
3740 build_word_op_espace
:
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset
);
3744 #endif /* RE_ENABLE_I18N */
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from `input', and return the number.
3751 Return -1, if the number field is empty like "{,1}".
3752 Return -2, If an error is occured. */
3755 fetch_number (input
, token
, syntax
)
3758 reg_syntax_t syntax
;
3764 fetch_token (token
, input
, syntax
);
3766 if (BE (token
->type
== END_OF_RE
, 0))
3768 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3770 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3771 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3772 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3777 #ifdef RE_ENABLE_I18N
3779 free_charset (re_charset_t
*cset
)
3781 re_free (cset
->mbchars
);
3783 re_free (cset
->coll_syms
);
3784 re_free (cset
->equiv_classes
);
3785 re_free (cset
->range_starts
);
3786 re_free (cset
->range_ends
);
3788 re_free (cset
->char_classes
);
3791 #endif /* RE_ENABLE_I18N */
3793 /* Functions for binary tree operation. */
3795 /* Create a tree node. */
3798 create_tree (dfa
, left
, right
, type
, index
)
3802 re_token_type_t type
;
3806 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3808 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3810 if (storage
== NULL
)
3812 storage
->next
= dfa
->str_tree_storage
;
3813 dfa
->str_tree_storage
= storage
;
3814 dfa
->str_tree_storage_idx
= 0;
3816 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3818 tree
->parent
= NULL
;
3820 tree
->right
= right
;
3822 tree
->node_idx
= index
;
3825 re_node_set_init_empty (&tree
->eclosure
);
3828 left
->parent
= tree
;
3830 right
->parent
= tree
;
3834 /* Create both a DFA node and a tree for it. */
3837 re_dfa_add_tree_node (dfa
, left
, right
, token
)
3841 const re_token_t
*token
;
3843 int new_idx
= re_dfa_add_node (dfa
, *token
, 0);
3848 return create_tree (dfa
, left
, right
, 0, new_idx
);
3851 /* Mark the tree SRC as an optional subexpression. */
3854 mark_opt_subexp (src
, dfa
)
3855 const bin_tree_t
*src
;
3858 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3860 if (src
->type
== CONCAT
3861 && src
->left
->type
== NON_TYPE
3862 && dfa
->nodes
[src
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
3863 mark_opt_subexp_iter (src
, dfa
, dfa
->nodes
[src
->left
->node_idx
].opr
.idx
);
3867 /* Recursive tree walker for mark_opt_subexp. */
3870 mark_opt_subexp_iter (src
, dfa
, idx
)
3871 const bin_tree_t
*src
;
3877 if (src
->type
== NON_TYPE
)
3879 node_idx
= src
->node_idx
;
3880 if ((dfa
->nodes
[node_idx
].type
== OP_OPEN_SUBEXP
3881 || dfa
->nodes
[node_idx
].type
== OP_CLOSE_SUBEXP
)
3882 && dfa
->nodes
[node_idx
].opr
.idx
== idx
)
3883 dfa
->nodes
[node_idx
].opt_subexp
= 1;
3886 if (src
->left
!= NULL
)
3887 mark_opt_subexp_iter (src
->left
, dfa
, idx
);
3889 if (src
->right
!= NULL
)
3890 mark_opt_subexp_iter (src
->right
, dfa
, idx
);
3894 /* Duplicate the node SRC, and return new node. */
3897 duplicate_tree (src
, dfa
)
3898 const bin_tree_t
*src
;
3901 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3903 /* Since node indies must be according to Post-order of the tree,
3904 we must duplicate the left at first. */
3905 if (src
->left
!= NULL
)
3907 left
= duplicate_tree (src
->left
, dfa
);
3912 /* Secondaly, duplicate the right. */
3913 if (src
->right
!= NULL
)
3915 right
= duplicate_tree (src
->right
, dfa
);
3920 /* At last, duplicate itself. */
3921 if (src
->type
== NON_TYPE
)
3923 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3924 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3925 if (BE (new_node_idx
== -1, 0))
3929 new_node_idx
= src
->type
;
3931 new_tree
= create_tree (dfa
, left
, right
, src
->type
, new_node_idx
);