1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
37 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
38 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
39 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
40 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
41 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
42 int top_clone_node
, int root_node
,
43 unsigned int constraint
);
44 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
45 unsigned int constraint
);
46 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
47 unsigned int constraint
);
48 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
49 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
51 static void calc_inveclosure (re_dfa_t
*dfa
);
52 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
54 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
56 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
92 re_charset_t
*mbcset
, int *range_alloc
,
93 bracket_elem_t
*start_elem
,
94 bracket_elem_t
*end_elem
);
95 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
98 const unsigned char *name
);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 const unsigned char *name
);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
109 re_charset_t
*mbcset
,
110 int *equiv_class_alloc
,
111 const unsigned char *name
);
112 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
113 re_bitset_ptr_t sbcset
,
114 re_charset_t
*mbcset
,
115 int *char_class_alloc
,
116 const unsigned char *class_name
,
117 reg_syntax_t syntax
);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
120 const unsigned char *name
);
121 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
122 re_bitset_ptr_t sbcset
,
123 const unsigned char *class_name
,
124 reg_syntax_t syntax
);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
127 unsigned RE_TRANSLATE_TYPE trans
,
128 const unsigned char *class_name
,
129 const unsigned char *extra
,
130 int non_match
, reg_errcode_t
*err
);
131 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
132 bin_tree_t
*left
, bin_tree_t
*right
,
133 re_token_type_t type
, int index
);
134 static bin_tree_t
*re_dfa_add_tree_node (re_dfa_t
*dfa
,
135 bin_tree_t
*left
, bin_tree_t
*right
,
136 const re_token_t
*token
)
137 __attribute ((noinline
));
138 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
139 static void mark_opt_subexp (const bin_tree_t
*src
, re_dfa_t
*dfa
);
140 static void mark_opt_subexp_iter (const bin_tree_t
*src
, re_dfa_t
*dfa
, int idx
);
142 /* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
147 const char __re_error_msgid
[] attribute_hidden
=
149 #define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
155 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
161 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
170 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
173 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
176 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
179 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
182 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
185 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
188 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
191 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
194 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
201 const size_t __re_error_msgid_idx
[] attribute_hidden
=
222 /* Entry points for GNU code. */
224 /* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
232 re_compile_pattern (pattern
, length
, bufp
)
235 struct re_pattern_buffer
*bufp
;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
244 /* Match anchors at newline. */
245 bufp
->newline_anchor
= 1;
247 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
251 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
254 weak_alias (__re_compile_pattern
, re_compile_pattern
)
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options
;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
273 re_set_syntax (syntax
)
276 reg_syntax_t ret
= re_syntax_options
;
278 re_syntax_options
= syntax
;
282 weak_alias (__re_set_syntax
, re_set_syntax
)
286 re_compile_fastmap (bufp
)
287 struct re_pattern_buffer
*bufp
;
289 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
290 char *fastmap
= bufp
->fastmap
;
292 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
293 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
294 if (dfa
->init_state
!= dfa
->init_state_word
)
295 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
296 if (dfa
->init_state
!= dfa
->init_state_nl
)
297 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
298 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
299 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
300 bufp
->fastmap_accurate
= 1;
304 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
308 __attribute ((always_inline
))
309 re_set_fastmap (char *fastmap
, int icase
, int ch
)
313 fastmap
[tolower (ch
)] = 1;
316 /* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
320 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
322 const re_dfastate_t
*init_state
;
325 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
327 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
328 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
330 int node
= init_state
->nodes
.elems
[node_cnt
];
331 re_token_type_t type
= dfa
->nodes
[node
].type
;
333 if (type
== CHARACTER
)
335 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
339 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
344 *p
++ = dfa
->nodes
[node
].opr
.c
;
345 while (++node
< dfa
->nodes_len
346 && dfa
->nodes
[node
].type
== CHARACTER
347 && dfa
->nodes
[node
].mb_partial
)
348 *p
++ = dfa
->nodes
[node
].opr
.c
;
349 memset (&state
, 0, sizeof (state
));
350 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
352 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
353 re_set_fastmap (fastmap
, 0, buf
[0]);
357 else if (type
== SIMPLE_BRACKET
)
360 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
361 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
362 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
363 re_set_fastmap (fastmap
, icase
, ch
);
365 #ifdef RE_ENABLE_I18N
366 else if (type
== COMPLEX_BRACKET
)
369 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
370 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
371 || cset
->nranges
|| cset
->nchar_classes
)
374 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
383 const int32_t *table
= (const int32_t *)
384 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
385 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
386 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
388 re_set_fastmap (fastmap
, icase
, ch
);
391 if (dfa
->mb_cur_max
> 1)
392 for (i
= 0; i
< SBC_MAX
; ++i
)
393 if (__btowc (i
) == WEOF
)
394 re_set_fastmap (fastmap
, icase
, i
);
395 # endif /* not _LIBC */
397 for (i
= 0; i
< cset
->nmbchars
; ++i
)
401 memset (&state
, '\0', sizeof (state
));
402 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
403 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
404 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
406 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
407 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
411 #endif /* RE_ENABLE_I18N */
412 else if (type
== OP_PERIOD
413 #ifdef RE_ENABLE_I18N
414 || type
== OP_UTF8_PERIOD
415 #endif /* RE_ENABLE_I18N */
416 || type
== END_OF_RE
)
418 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
419 if (type
== END_OF_RE
)
420 bufp
->can_be_null
= 1;
426 /* Entry point for POSIX code. */
427 /* regcomp takes a regular expression as a string and compiles it.
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
442 PATTERN is the address of the pattern string.
444 CFLAGS is a series of bits which affect compilation.
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
463 regcomp (preg
, pattern
, cflags
)
464 regex_t
*__restrict preg
;
465 const char *__restrict pattern
;
469 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC
);
476 /* Try to allocate space for the fastmap. */
477 preg
->fastmap
= re_malloc (char, SBC_MAX
);
478 if (BE (preg
->fastmap
== NULL
, 0))
481 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags
& REG_NEWLINE
)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax
&= ~RE_DOT_NEWLINE
;
487 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
488 /* It also changes the matching behavior. */
489 preg
->newline_anchor
= 1;
492 preg
->newline_anchor
= 0;
493 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
494 preg
->translate
= NULL
;
496 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret
== REG_ERPAREN
)
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret
== REG_NOERROR
, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg
);
510 /* Some error occurred while compiling the expression. */
511 re_free (preg
->fastmap
);
512 preg
->fastmap
= NULL
;
518 weak_alias (__regcomp
, regcomp
)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (errcode
, preg
, errbuf
, errbuf_size
)
535 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
536 / sizeof (__re_error_msgid_idx
[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
545 msg_size
= strlen (msg
) + 1; /* Includes the null. */
547 if (BE (errbuf_size
!= 0, 1))
549 if (BE (msg_size
> errbuf_size
, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
554 memcpy (errbuf
, msg
, errbuf_size
- 1);
555 errbuf
[errbuf_size
- 1] = 0;
559 memcpy (errbuf
, msg
, msg_size
);
565 weak_alias (__regerror
, regerror
)
569 #ifdef RE_ENABLE_I18N
570 /* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574 static const bitset utf8_sb_map
=
576 /* Set the first 128 bits. */
577 # if UINT_MAX == 0xffffffff
578 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
580 # error "Add case for new unsigned int size"
587 free_dfa_content (re_dfa_t
*dfa
)
591 re_free (dfa
->subexps
);
594 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
596 re_token_t
*node
= dfa
->nodes
+ i
;
597 #ifdef RE_ENABLE_I18N
598 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
599 free_charset (node
->opr
.mbcset
);
601 #endif /* RE_ENABLE_I18N */
602 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
603 re_free (node
->opr
.sbcset
);
605 re_free (dfa
->nexts
);
606 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
608 if (dfa
->eclosures
!= NULL
)
609 re_node_set_free (dfa
->eclosures
+ i
);
610 if (dfa
->inveclosures
!= NULL
)
611 re_node_set_free (dfa
->inveclosures
+ i
);
612 if (dfa
->edests
!= NULL
)
613 re_node_set_free (dfa
->edests
+ i
);
615 re_free (dfa
->edests
);
616 re_free (dfa
->eclosures
);
617 re_free (dfa
->inveclosures
);
618 re_free (dfa
->nodes
);
620 if (dfa
->state_table
)
621 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
623 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
624 for (j
= 0; j
< entry
->num
; ++j
)
626 re_dfastate_t
*state
= entry
->array
[j
];
629 re_free (entry
->array
);
631 re_free (dfa
->state_table
);
632 #ifdef RE_ENABLE_I18N
633 if (dfa
->sb_char
!= utf8_sb_map
)
634 re_free (dfa
->sb_char
);
637 re_free (dfa
->re_str
);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
651 if (BE (dfa
!= NULL
, 1))
652 free_dfa_content (dfa
);
656 re_free (preg
->fastmap
);
657 preg
->fastmap
= NULL
;
659 re_free (preg
->translate
);
660 preg
->translate
= NULL
;
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf
.buffer
)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf
.buffer
)
696 fastmap
= re_comp_buf
.fastmap
;
697 re_comp_buf
.fastmap
= NULL
;
698 __regfree (&re_comp_buf
);
699 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
700 re_comp_buf
.fastmap
= fastmap
;
703 if (re_comp_buf
.fastmap
== NULL
)
705 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
706 if (re_comp_buf
.fastmap
== NULL
)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf
.newline_anchor
= 1;
717 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
727 libc_freeres_fn (free_mem
)
729 __regfree (&re_comp_buf
);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg
, pattern
, length
, syntax
)
742 const char * pattern
;
746 reg_errcode_t err
= REG_NOERROR
;
750 /* Initialize the pattern buffer. */
751 preg
->fastmap_accurate
= 0;
752 preg
->syntax
= syntax
;
753 preg
->not_bol
= preg
->not_eol
= 0;
756 preg
->can_be_null
= 0;
757 preg
->regs_allocated
= REGS_UNALLOCATED
;
759 /* Initialize the dfa. */
760 dfa
= (re_dfa_t
*) preg
->buffer
;
761 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
770 preg
->allocated
= sizeof (re_dfa_t
);
771 preg
->buffer
= (unsigned char *) dfa
;
773 preg
->used
= sizeof (re_dfa_t
);
775 err
= init_dfa (dfa
, length
);
776 if (BE (err
!= REG_NOERROR
, 0))
778 free_dfa_content (dfa
);
784 dfa
->re_str
= re_malloc (char, length
+ 1);
785 strncpy (dfa
->re_str
, pattern
, length
+ 1);
788 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
789 syntax
& RE_ICASE
, dfa
);
790 if (BE (err
!= REG_NOERROR
, 0))
792 re_compile_internal_free_return
:
793 free_workarea_compile (preg
);
794 re_string_destruct (®exp
);
795 free_dfa_content (dfa
);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
804 if (BE (dfa
->str_tree
== NULL
, 0))
805 goto re_compile_internal_free_return
;
807 #ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
809 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
813 /* Analyze the tree and collect information which is necessary to
816 if (BE (err
!= REG_NOERROR
, 0))
817 goto re_compile_internal_free_return
;
819 /* Then create the initial state of the dfa. */
820 err
= create_initial_state (dfa
);
822 /* Release work areas. */
823 free_workarea_compile (preg
);
824 re_string_destruct (®exp
);
826 if (BE (err
!= REG_NOERROR
, 0))
828 free_dfa_content (dfa
);
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
840 init_dfa (dfa
, pat_len
)
849 memset (dfa
, '\0', sizeof (re_dfa_t
));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
854 dfa
->nodes_alloc
= pat_len
+ 1;
855 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
857 dfa
->states_alloc
= pat_len
+ 1;
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
861 if (table_size
> pat_len
)
864 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
865 dfa
->state_hash_mask
= table_size
- 1;
867 dfa
->subexps_alloc
= 1;
868 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
870 dfa
->mb_cur_max
= MB_CUR_MAX
;
872 if (dfa
->mb_cur_max
== 6
873 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
875 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
878 # ifdef HAVE_LANGINFO_CODESET
879 codeset_name
= nl_langinfo (CODESET
);
881 codeset_name
= getenv ("LC_ALL");
882 if (codeset_name
== NULL
|| codeset
[0] == '\0')
883 codeset_name
= getenv ("LC_CTYPE");
884 if (codeset_name
== NULL
|| codeset
[0] == '\0')
885 codeset_name
= getenv ("LANG");
886 if (codeset_name
== NULL
)
888 else if (strchr (codeset_name
, '.') != NULL
)
889 codeset_name
= strchr (codeset_name
, '.') + 1;
892 if (strcasecmp (codeset_name
, "UTF-8") == 0
893 || strcasecmp (codeset_name
, "UTF8") == 0)
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa
->map_notascii
= 0;
901 #ifdef RE_ENABLE_I18N
902 if (dfa
->mb_cur_max
> 1)
905 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
910 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
911 if (BE (dfa
->sb_char
== NULL
, 0))
914 /* Clear all bits by, then set those corresponding to single
916 bitset_empty (dfa
->sb_char
);
918 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
919 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
921 wchar_t wch
= __btowc (ch
);
923 dfa
->sb_char
[i
] |= 1 << j
;
925 if (isascii (ch
) && wch
!= (wchar_t) ch
)
926 dfa
->map_notascii
= 1;
933 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
934 || dfa
->subexps
== NULL
, 0))
939 /* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
948 dfa
->word_ops_used
= 1;
949 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
950 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
951 if (isalnum (ch
) || ch
== '_')
952 dfa
->word_char
[i
] |= 1 << j
;
955 /* Free the work area which are only used while compiling. */
958 free_workarea_compile (preg
)
961 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
962 bin_tree_storage_t
*storage
, *next
;
963 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
965 next
= storage
->next
;
968 dfa
->str_tree_storage
= NULL
;
969 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
970 dfa
->str_tree
= NULL
;
971 re_free (dfa
->org_indices
);
972 dfa
->org_indices
= NULL
;
975 /* Create initial states for all contexts. */
978 create_initial_state (dfa
)
983 re_node_set init_nodes
;
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first
= dfa
->str_tree
->first
;
988 dfa
->init_node
= first
;
989 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
990 if (BE (err
!= REG_NOERROR
, 0))
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa
->nbackref
> 0)
998 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1000 int node_idx
= init_nodes
.elems
[i
];
1001 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1004 if (type
!= OP_BACK_REF
)
1006 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1008 re_token_t
*clexp_node
;
1009 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1010 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1011 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
1014 if (clexp_idx
== init_nodes
.nelem
)
1017 if (type
== OP_BACK_REF
)
1019 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1020 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1022 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1028 /* It must be the first time to invoke acquire_state. */
1029 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
1031 if (BE (dfa
->init_state
== NULL
, 0))
1033 if (dfa
->init_state
->has_constraint
)
1035 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1037 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1039 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1043 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1044 || dfa
->init_state_begbuf
== NULL
, 0))
1048 dfa
->init_state_word
= dfa
->init_state_nl
1049 = dfa
->init_state_begbuf
= dfa
->init_state
;
1051 re_node_set_free (&init_nodes
);
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1064 int node
, i
, mb_chars
= 0, has_period
= 0;
1066 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1067 switch (dfa
->nodes
[node
].type
)
1070 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1074 switch (dfa
->nodes
[node
].opr
.idx
)
1082 /* Word anchors etc. cannot be handled. */
1092 case OP_DUP_ASTERISK
:
1093 case OP_DUP_QUESTION
:
1094 case OP_OPEN_SUBEXP
:
1095 case OP_CLOSE_SUBEXP
:
1097 case SIMPLE_BRACKET
:
1098 /* Just double check. */
1099 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1100 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1107 if (mb_chars
|| has_period
)
1108 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1110 if (dfa
->nodes
[node
].type
== CHARACTER
1111 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1112 dfa
->nodes
[node
].mb_partial
= 0;
1113 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1114 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1117 /* The search can be in single byte locale. */
1118 dfa
->mb_cur_max
= 1;
1120 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125 "eclosure", and "inveclosure". */
1127 static reg_errcode_t
1134 /* Allocate arrays. */
1135 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1136 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1137 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1138 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1139 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1140 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1141 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1143 /* Initialize them. */
1144 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1147 re_node_set_init_empty (dfa
->edests
+ i
);
1148 re_node_set_init_empty (dfa
->eclosures
+ i
);
1149 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1152 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1153 if (BE (ret
== REG_NOERROR
, 1))
1155 ret
= calc_eclosure (dfa
);
1156 if (ret
== REG_NOERROR
)
1157 calc_inveclosure (dfa
);
1162 /* Helper functions for analyze.
1163 This function calculate "first", "next", and "edest" for the subtree
1164 whose root is NODE. */
1166 static reg_errcode_t
1167 analyze_tree (dfa
, node
)
1172 if (node
->first
== -1)
1173 calc_first (dfa
, node
);
1174 if (node
->next
== -1)
1175 calc_next (dfa
, node
);
1176 if (node
->eclosure
.nelem
== 0)
1177 calc_epsdest (dfa
, node
);
1178 /* Calculate "first" etc. for the left child. */
1179 if (node
->left
!= NULL
)
1181 ret
= analyze_tree (dfa
, node
->left
);
1182 if (BE (ret
!= REG_NOERROR
, 0))
1185 /* Calculate "first" etc. for the right child. */
1186 if (node
->right
!= NULL
)
1188 ret
= analyze_tree (dfa
, node
->right
);
1189 if (BE (ret
!= REG_NOERROR
, 0))
1195 /* Calculate "first" for the node NODE. */
1197 calc_first (dfa
, node
)
1202 idx
= node
->node_idx
;
1203 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1208 case OP_OPEN_BRACKET
:
1209 case OP_CLOSE_BRACKET
:
1210 case OP_OPEN_DUP_NUM
:
1211 case OP_CLOSE_DUP_NUM
:
1213 case OP_NON_MATCH_LIST
:
1214 case OP_OPEN_COLL_ELEM
:
1215 case OP_CLOSE_COLL_ELEM
:
1216 case OP_OPEN_EQUIV_CLASS
:
1217 case OP_CLOSE_EQUIV_CLASS
:
1218 case OP_OPEN_CHAR_CLASS
:
1219 case OP_CLOSE_CHAR_CLASS
:
1220 /* These must not appear here. */
1226 case OP_DUP_ASTERISK
:
1227 case OP_DUP_QUESTION
:
1228 #ifdef RE_ENABLE_I18N
1229 case OP_UTF8_PERIOD
:
1230 case COMPLEX_BRACKET
:
1231 #endif /* RE_ENABLE_I18N */
1232 case SIMPLE_BRACKET
:
1235 case OP_OPEN_SUBEXP
:
1236 case OP_CLOSE_SUBEXP
:
1242 /* else fall through */
1245 assert (node
->left
!= NULL
);
1247 if (node
->left
->first
== -1)
1248 calc_first (dfa
, node
->left
);
1249 node
->first
= node
->left
->first
;
1254 /* Calculate "next" for the node NODE. */
1257 calc_next (dfa
, node
)
1262 bin_tree_t
*parent
= node
->parent
;
1266 idx
= node
->node_idx
;
1267 if (node
->type
== 0)
1268 dfa
->nexts
[idx
] = node
->next
;
1272 idx
= parent
->node_idx
;
1273 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1277 case OP_DUP_ASTERISK
:
1281 if (parent
->left
== node
)
1283 if (parent
->right
->first
== -1)
1284 calc_first (dfa
, parent
->right
);
1285 node
->next
= parent
->right
->first
;
1288 /* else fall through */
1290 if (parent
->next
== -1)
1291 calc_next (dfa
, parent
);
1292 node
->next
= parent
->next
;
1295 idx
= node
->node_idx
;
1296 if (node
->type
== 0)
1297 dfa
->nexts
[idx
] = node
->next
;
1300 /* Calculate "edest" for the node NODE. */
1303 calc_epsdest (dfa
, node
)
1308 idx
= node
->node_idx
;
1309 if (node
->type
== 0)
1311 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1312 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1314 if (node
->left
->first
== -1)
1315 calc_first (dfa
, node
->left
);
1316 if (node
->next
== -1)
1317 calc_next (dfa
, node
);
1318 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1321 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1324 if (node
->left
!= NULL
)
1326 if (node
->left
->first
== -1)
1327 calc_first (dfa
, node
->left
);
1328 left
= node
->left
->first
;
1332 if (node
->next
== -1)
1333 calc_next (dfa
, node
);
1336 if (node
->right
!= NULL
)
1338 if (node
->right
->first
== -1)
1339 calc_first (dfa
, node
->right
);
1340 right
= node
->right
->first
;
1344 if (node
->next
== -1)
1345 calc_next (dfa
, node
);
1348 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1350 else if (dfa
->nodes
[idx
].type
== ANCHOR
1351 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1352 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1353 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1354 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1356 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1360 /* Duplicate the epsilon closure of the node ROOT_NODE.
1361 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362 to their own constraint. */
1364 static reg_errcode_t
1365 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1368 int top_org_node
, top_clone_node
, root_node
;
1369 unsigned int init_constraint
;
1372 int org_node
, clone_node
, ret
;
1373 unsigned int constraint
= init_constraint
;
1374 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1376 int org_dest
, clone_dest
;
1377 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1379 /* If the back reference epsilon-transit, its destination must
1380 also have the constraint. Then duplicate the epsilon closure
1381 of the destination of the back reference, and store it in
1382 edests of the back reference. */
1383 org_dest
= dfa
->nexts
[org_node
];
1384 re_node_set_empty (dfa
->edests
+ clone_node
);
1385 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1386 if (BE (err
!= REG_NOERROR
, 0))
1388 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1389 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1390 if (BE (ret
< 0, 0))
1393 else if (dfa
->edests
[org_node
].nelem
== 0)
1395 /* In case of the node can't epsilon-transit, don't duplicate the
1396 destination and store the original destination as the
1397 destination of the node. */
1398 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1401 else if (dfa
->edests
[org_node
].nelem
== 1)
1403 /* In case of the node can epsilon-transit, and it has only one
1405 org_dest
= dfa
->edests
[org_node
].elems
[0];
1406 re_node_set_empty (dfa
->edests
+ clone_node
);
1407 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1409 /* In case of the node has another constraint, append it. */
1410 if (org_node
== root_node
&& clone_node
!= org_node
)
1412 /* ...but if the node is root_node itself, it means the
1413 epsilon closure have a loop, then tie it to the
1414 destination of the root_node. */
1415 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1417 if (BE (ret
< 0, 0))
1421 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1423 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1424 if (BE (err
!= REG_NOERROR
, 0))
1426 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1427 if (BE (ret
< 0, 0))
1430 else /* dfa->edests[org_node].nelem == 2 */
1432 /* In case of the node can epsilon-transit, and it has two
1433 destinations. E.g. '|', '*', '+', '?'. */
1434 org_dest
= dfa
->edests
[org_node
].elems
[0];
1435 re_node_set_empty (dfa
->edests
+ clone_node
);
1436 /* Search for a duplicated node which satisfies the constraint. */
1437 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1438 if (clone_dest
== -1)
1440 /* There are no such a duplicated node, create a new one. */
1441 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1442 if (BE (err
!= REG_NOERROR
, 0))
1444 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1445 if (BE (ret
< 0, 0))
1447 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1448 root_node
, constraint
);
1449 if (BE (err
!= REG_NOERROR
, 0))
1454 /* There are a duplicated node which satisfy the constraint,
1455 use it to avoid infinite loop. */
1456 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1457 if (BE (ret
< 0, 0))
1461 org_dest
= dfa
->edests
[org_node
].elems
[1];
1462 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1463 if (BE (err
!= REG_NOERROR
, 0))
1465 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1466 if (BE (ret
< 0, 0))
1469 org_node
= org_dest
;
1470 clone_node
= clone_dest
;
1475 /* Search for a node which is duplicated from the node ORG_NODE, and
1476 satisfies the constraint CONSTRAINT. */
1479 search_duplicated_node (dfa
, org_node
, constraint
)
1482 unsigned int constraint
;
1485 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1487 if (org_node
== dfa
->org_indices
[idx
]
1488 && constraint
== dfa
->nodes
[idx
].constraint
)
1489 return idx
; /* Found. */
1491 return -1; /* Not found. */
1494 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496 otherwise return the error code. */
1498 static reg_errcode_t
1499 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1501 int *new_idx
, org_idx
;
1502 unsigned int constraint
;
1504 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
], 1);
1505 if (BE (dup_idx
== -1, 0))
1507 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1508 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1509 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1510 dfa
->nodes
[dup_idx
].duplicated
= 1;
1511 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1512 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1513 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1515 /* Store the index of the original node. */
1516 dfa
->org_indices
[dup_idx
] = org_idx
;
1522 calc_inveclosure (dfa
)
1526 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1528 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1530 dest
= dfa
->eclosures
[src
].elems
[idx
];
1531 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1536 /* Calculate "eclosure" for all the node in DFA. */
1538 static reg_errcode_t
1542 int node_idx
, incomplete
;
1544 assert (dfa
->nodes_len
> 0);
1547 /* For each nodes, calculate epsilon closure. */
1548 for (node_idx
= 0; ; ++node_idx
)
1551 re_node_set eclosure_elem
;
1552 if (node_idx
== dfa
->nodes_len
)
1561 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1563 /* If we have already calculated, skip it. */
1564 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1566 /* Calculate epsilon closure of `node_idx'. */
1567 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1568 if (BE (err
!= REG_NOERROR
, 0))
1571 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1574 re_node_set_free (&eclosure_elem
);
1580 /* Calculate epsilon closure of NODE. */
1582 static reg_errcode_t
1583 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1584 re_node_set
*new_set
;
1589 unsigned int constraint
;
1591 re_node_set eclosure
;
1593 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1594 if (BE (err
!= REG_NOERROR
, 0))
1597 /* This indicates that we are calculating this node now.
1598 We reference this value to avoid infinite loop. */
1599 dfa
->eclosures
[node
].nelem
= -1;
1601 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1602 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1603 /* If the current node has constraints, duplicate all nodes.
1604 Since they must inherit the constraints. */
1606 && dfa
->edests
[node
].nelem
1607 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1609 int org_node
, cur_node
;
1610 org_node
= cur_node
= node
;
1611 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1612 if (BE (err
!= REG_NOERROR
, 0))
1616 /* Expand each epsilon destination nodes. */
1617 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1618 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1620 re_node_set eclosure_elem
;
1621 int edest
= dfa
->edests
[node
].elems
[i
];
1622 /* If calculating the epsilon closure of `edest' is in progress,
1623 return intermediate result. */
1624 if (dfa
->eclosures
[edest
].nelem
== -1)
1629 /* If we haven't calculated the epsilon closure of `edest' yet,
1630 calculate now. Otherwise use calculated epsilon closure. */
1631 if (dfa
->eclosures
[edest
].nelem
== 0)
1633 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1634 if (BE (err
!= REG_NOERROR
, 0))
1638 eclosure_elem
= dfa
->eclosures
[edest
];
1639 /* Merge the epsilon closure of `edest'. */
1640 re_node_set_merge (&eclosure
, &eclosure_elem
);
1641 /* If the epsilon closure of `edest' is incomplete,
1642 the epsilon closure of this node is also incomplete. */
1643 if (dfa
->eclosures
[edest
].nelem
== 0)
1646 re_node_set_free (&eclosure_elem
);
1650 /* Epsilon closures include itself. */
1651 re_node_set_insert (&eclosure
, node
);
1652 if (incomplete
&& !root
)
1653 dfa
->eclosures
[node
].nelem
= 0;
1655 dfa
->eclosures
[node
] = eclosure
;
1656 *new_set
= eclosure
;
1660 /* Functions for token which are used in the parser. */
1662 /* Fetch a token from INPUT.
1663 We must not use this function inside bracket expressions. */
1666 fetch_token (result
, input
, syntax
)
1669 reg_syntax_t syntax
;
1671 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1674 /* Peek a token from INPUT, and return the length of the token.
1675 We must not use this function inside bracket expressions. */
1678 peek_token (token
, input
, syntax
)
1681 reg_syntax_t syntax
;
1685 if (re_string_eoi (input
))
1687 token
->type
= END_OF_RE
;
1691 c
= re_string_peek_byte (input
, 0);
1694 token
->word_char
= 0;
1695 #ifdef RE_ENABLE_I18N
1696 token
->mb_partial
= 0;
1697 if (input
->mb_cur_max
> 1 &&
1698 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1700 token
->type
= CHARACTER
;
1701 token
->mb_partial
= 1;
1708 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1710 token
->type
= BACK_SLASH
;
1714 c2
= re_string_peek_byte_case (input
, 1);
1716 token
->type
= CHARACTER
;
1717 #ifdef RE_ENABLE_I18N
1718 if (input
->mb_cur_max
> 1)
1720 wint_t wc
= re_string_wchar_at (input
,
1721 re_string_cur_idx (input
) + 1);
1722 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1726 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1731 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1732 token
->type
= OP_ALT
;
1734 case '1': case '2': case '3': case '4': case '5':
1735 case '6': case '7': case '8': case '9':
1736 if (!(syntax
& RE_NO_BK_REFS
))
1738 token
->type
= OP_BACK_REF
;
1739 token
->opr
.idx
= c2
- '0';
1743 if (!(syntax
& RE_NO_GNU_OPS
))
1745 token
->type
= ANCHOR
;
1746 token
->opr
.ctx_type
= WORD_FIRST
;
1750 if (!(syntax
& RE_NO_GNU_OPS
))
1752 token
->type
= ANCHOR
;
1753 token
->opr
.ctx_type
= WORD_LAST
;
1757 if (!(syntax
& RE_NO_GNU_OPS
))
1759 token
->type
= ANCHOR
;
1760 token
->opr
.ctx_type
= WORD_DELIM
;
1764 if (!(syntax
& RE_NO_GNU_OPS
))
1766 token
->type
= ANCHOR
;
1767 token
->opr
.ctx_type
= INSIDE_WORD
;
1771 if (!(syntax
& RE_NO_GNU_OPS
))
1772 token
->type
= OP_WORD
;
1775 if (!(syntax
& RE_NO_GNU_OPS
))
1776 token
->type
= OP_NOTWORD
;
1779 if (!(syntax
& RE_NO_GNU_OPS
))
1780 token
->type
= OP_SPACE
;
1783 if (!(syntax
& RE_NO_GNU_OPS
))
1784 token
->type
= OP_NOTSPACE
;
1787 if (!(syntax
& RE_NO_GNU_OPS
))
1789 token
->type
= ANCHOR
;
1790 token
->opr
.ctx_type
= BUF_FIRST
;
1794 if (!(syntax
& RE_NO_GNU_OPS
))
1796 token
->type
= ANCHOR
;
1797 token
->opr
.ctx_type
= BUF_LAST
;
1801 if (!(syntax
& RE_NO_BK_PARENS
))
1802 token
->type
= OP_OPEN_SUBEXP
;
1805 if (!(syntax
& RE_NO_BK_PARENS
))
1806 token
->type
= OP_CLOSE_SUBEXP
;
1809 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1810 token
->type
= OP_DUP_PLUS
;
1813 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1814 token
->type
= OP_DUP_QUESTION
;
1817 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1818 token
->type
= OP_OPEN_DUP_NUM
;
1821 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1822 token
->type
= OP_CLOSE_DUP_NUM
;
1830 token
->type
= CHARACTER
;
1831 #ifdef RE_ENABLE_I18N
1832 if (input
->mb_cur_max
> 1)
1834 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1835 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1839 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1844 if (syntax
& RE_NEWLINE_ALT
)
1845 token
->type
= OP_ALT
;
1848 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1849 token
->type
= OP_ALT
;
1852 token
->type
= OP_DUP_ASTERISK
;
1855 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1856 token
->type
= OP_DUP_PLUS
;
1859 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1860 token
->type
= OP_DUP_QUESTION
;
1863 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1864 token
->type
= OP_OPEN_DUP_NUM
;
1867 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1868 token
->type
= OP_CLOSE_DUP_NUM
;
1871 if (syntax
& RE_NO_BK_PARENS
)
1872 token
->type
= OP_OPEN_SUBEXP
;
1875 if (syntax
& RE_NO_BK_PARENS
)
1876 token
->type
= OP_CLOSE_SUBEXP
;
1879 token
->type
= OP_OPEN_BRACKET
;
1882 token
->type
= OP_PERIOD
;
1885 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1886 re_string_cur_idx (input
) != 0)
1888 char prev
= re_string_peek_byte (input
, -1);
1889 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1892 token
->type
= ANCHOR
;
1893 token
->opr
.ctx_type
= LINE_FIRST
;
1896 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1897 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1900 re_string_skip_bytes (input
, 1);
1901 peek_token (&next
, input
, syntax
);
1902 re_string_skip_bytes (input
, -1);
1903 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1906 token
->type
= ANCHOR
;
1907 token
->opr
.ctx_type
= LINE_LAST
;
1915 /* Peek a token from INPUT, and return the length of the token.
1916 We must not use this function out of bracket expressions. */
1919 peek_token_bracket (token
, input
, syntax
)
1922 reg_syntax_t syntax
;
1925 if (re_string_eoi (input
))
1927 token
->type
= END_OF_RE
;
1930 c
= re_string_peek_byte (input
, 0);
1933 #ifdef RE_ENABLE_I18N
1934 if (input
->mb_cur_max
> 1 &&
1935 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1937 token
->type
= CHARACTER
;
1940 #endif /* RE_ENABLE_I18N */
1942 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1943 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1945 /* In this case, '\' escape a character. */
1947 re_string_skip_bytes (input
, 1);
1948 c2
= re_string_peek_byte (input
, 0);
1950 token
->type
= CHARACTER
;
1953 if (c
== '[') /* '[' is a special char in a bracket exps. */
1957 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
1958 c2
= re_string_peek_byte (input
, 1);
1966 token
->type
= OP_OPEN_COLL_ELEM
;
1969 token
->type
= OP_OPEN_EQUIV_CLASS
;
1972 if (syntax
& RE_CHAR_CLASSES
)
1974 token
->type
= OP_OPEN_CHAR_CLASS
;
1977 /* else fall through. */
1979 token
->type
= CHARACTER
;
1989 token
->type
= OP_CHARSET_RANGE
;
1992 token
->type
= OP_CLOSE_BRACKET
;
1995 token
->type
= OP_NON_MATCH_LIST
;
1998 token
->type
= CHARACTER
;
2003 /* Functions for parser. */
2005 /* Entry point of the parser.
2006 Parse the regular expression REGEXP and return the structure tree.
2007 If an error is occured, ERR is set by error code, and return NULL.
2008 This function build the following tree, from regular expression <reg_exp>:
2014 CAT means concatenation.
2015 EOR means end of regular expression. */
2018 parse (regexp
, preg
, syntax
, err
)
2019 re_string_t
*regexp
;
2021 reg_syntax_t syntax
;
2024 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2025 bin_tree_t
*tree
, *eor
, *root
;
2026 re_token_t current_token
;
2027 dfa
->syntax
= syntax
;
2028 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2029 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2030 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2032 eor
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, ¤t_token
);
2034 root
= create_tree (dfa
, tree
, eor
, CONCAT
, 0);
2037 if (BE (eor
== NULL
|| root
== NULL
, 0))
2045 /* This function build the following tree, from regular expression
2046 <branch1>|<branch2>:
2052 ALT means alternative, which represents the operator `|'. */
2055 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2056 re_string_t
*regexp
;
2059 reg_syntax_t syntax
;
2063 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2064 bin_tree_t
*tree
, *branch
= NULL
;
2065 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2066 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2069 while (token
->type
== OP_ALT
)
2071 re_token_t alt_token
= *token
;
2072 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2073 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2074 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2076 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2077 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2082 tree
= re_dfa_add_tree_node (dfa
, tree
, branch
, &alt_token
);
2083 if (BE (tree
== NULL
, 0))
2088 dfa
->has_plural_match
= 1;
2093 /* This function build the following tree, from regular expression
2100 CAT means concatenation. */
2103 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2104 re_string_t
*regexp
;
2107 reg_syntax_t syntax
;
2111 bin_tree_t
*tree
, *exp
;
2112 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2113 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2114 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2117 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2118 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2120 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2121 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2125 if (tree
!= NULL
&& exp
!= NULL
)
2127 tree
= create_tree (dfa
, tree
, exp
, CONCAT
, 0);
2134 else if (tree
== NULL
)
2136 /* Otherwise exp == NULL, we don't need to create new tree. */
2141 /* This function build the following tree, from regular expression a*:
2148 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2149 re_string_t
*regexp
;
2152 reg_syntax_t syntax
;
2156 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2158 switch (token
->type
)
2161 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2162 if (BE (tree
== NULL
, 0))
2167 #ifdef RE_ENABLE_I18N
2168 if (dfa
->mb_cur_max
> 1)
2170 while (!re_string_eoi (regexp
)
2171 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2173 bin_tree_t
*mbc_remain
;
2174 fetch_token (token
, regexp
, syntax
);
2175 mbc_remain
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2176 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
, 0);
2177 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2186 case OP_OPEN_SUBEXP
:
2187 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2188 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2191 case OP_OPEN_BRACKET
:
2192 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2193 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2197 if (BE (preg
->re_nsub
< token
->opr
.idx
2198 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
2203 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
2204 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2205 if (BE (tree
== NULL
, 0))
2211 dfa
->has_mb_node
= 1;
2213 case OP_OPEN_DUP_NUM
:
2214 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2220 case OP_DUP_ASTERISK
:
2222 case OP_DUP_QUESTION
:
2223 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2228 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2230 fetch_token (token
, regexp
, syntax
);
2231 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2233 /* else fall through */
2234 case OP_CLOSE_SUBEXP
:
2235 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2236 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2241 /* else fall through */
2242 case OP_CLOSE_DUP_NUM
:
2243 /* We treat it as a normal character. */
2245 /* Then we can these characters as normal characters. */
2246 token
->type
= CHARACTER
;
2247 /* mb_partial and word_char bits should be initialized already
2249 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2250 if (BE (tree
== NULL
, 0))
2257 if ((token
->opr
.ctx_type
2258 & (WORD_DELIM
| INSIDE_WORD
| WORD_FIRST
| WORD_LAST
))
2259 && dfa
->word_ops_used
== 0)
2260 init_word_char (dfa
);
2261 if (token
->opr
.ctx_type
== WORD_DELIM
)
2263 bin_tree_t
*tree_first
, *tree_last
;
2264 token
->opr
.ctx_type
= WORD_FIRST
;
2265 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2266 token
->opr
.ctx_type
= WORD_LAST
;
2267 tree_last
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2268 token
->type
= OP_ALT
;
2269 tree
= re_dfa_add_tree_node (dfa
, tree_first
, tree_last
, token
);
2270 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2278 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2279 if (BE (tree
== NULL
, 0))
2285 /* We must return here, since ANCHORs can't be followed
2286 by repetition operators.
2287 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2288 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2289 fetch_token (token
, regexp
, syntax
);
2292 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2293 if (BE (tree
== NULL
, 0))
2298 if (dfa
->mb_cur_max
> 1)
2299 dfa
->has_mb_node
= 1;
2303 tree
= build_charclass_op (dfa
, regexp
->trans
,
2304 (const unsigned char *) "alnum",
2305 (const unsigned char *) "_",
2306 token
->type
== OP_NOTWORD
, err
);
2307 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2312 tree
= build_charclass_op (dfa
, regexp
->trans
,
2313 (const unsigned char *) "space",
2314 (const unsigned char *) "",
2315 token
->type
== OP_NOTSPACE
, err
);
2316 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2326 /* Must not happen? */
2332 fetch_token (token
, regexp
, syntax
);
2334 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2335 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2337 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2338 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2340 /* In BRE consecutive duplications are not allowed. */
2341 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2342 && (token
->type
== OP_DUP_ASTERISK
2343 || token
->type
== OP_OPEN_DUP_NUM
))
2348 dfa
->has_plural_match
= 1;
2354 /* This function build the following tree, from regular expression
2362 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2363 re_string_t
*regexp
;
2366 reg_syntax_t syntax
;
2370 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2371 bin_tree_t
*tree
, *left_par
, *right_par
;
2373 cur_nsub
= preg
->re_nsub
++;
2374 if (BE (dfa
->subexps_alloc
< preg
->re_nsub
, 0))
2376 re_subexp_t
*new_array
;
2377 dfa
->subexps_alloc
*= 2;
2378 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2379 if (BE (new_array
== NULL
, 0))
2381 dfa
->subexps_alloc
/= 2;
2385 dfa
->subexps
= new_array
;
2387 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2388 dfa
->subexps
[cur_nsub
].end
= -1;
2390 left_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2391 if (BE (left_par
== NULL
, 0))
2396 dfa
->nodes
[left_par
->node_idx
].opr
.idx
= cur_nsub
;
2397 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2399 /* The subexpression may be a null string. */
2400 if (token
->type
== OP_CLOSE_SUBEXP
)
2404 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2405 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2408 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2413 right_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2414 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2415 tree
= ((tree
== NULL
) ? right_par
2416 : create_tree (dfa
, tree
, right_par
, CONCAT
, 0));
2417 tree
= create_tree (dfa
, left_par
, tree
, CONCAT
, 0);
2418 if (BE (right_par
== NULL
|| tree
== NULL
, 0))
2423 dfa
->nodes
[right_par
->node_idx
].opr
.idx
= cur_nsub
;
2428 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2431 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2433 re_string_t
*regexp
;
2436 reg_syntax_t syntax
;
2439 re_token_t dup_token
;
2440 bin_tree_t
*tree
= NULL
;
2441 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2442 re_token_t start_token
= *token
;
2444 if (token
->type
== OP_OPEN_DUP_NUM
)
2447 start
= fetch_number (regexp
, token
, syntax
);
2450 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2451 start
= 0; /* We treat "{,m}" as "{0,m}". */
2454 *err
= REG_BADBR
; /* <re>{} is invalid. */
2458 if (BE (start
!= -2, 1))
2460 /* We treat "{n}" as "{n,n}". */
2461 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2462 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2463 ? fetch_number (regexp
, token
, syntax
) : -2));
2465 if (BE (start
== -2 || end
== -2, 0))
2467 /* Invalid sequence. */
2468 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2470 if (token
->type
== END_OF_RE
)
2478 /* If the syntax bit is set, rollback. */
2479 re_string_set_index (regexp
, start_idx
);
2480 *token
= start_token
;
2481 token
->type
= CHARACTER
;
2482 /* mb_partial and word_char bits should be already initialized by
2487 if (BE (end
!= -1 && start
> end
, 0))
2489 /* First number greater than second. */
2496 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2497 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2500 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2501 if (BE (elem
== NULL
, 0))
2504 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2505 else if (BE (start
> 0, 0))
2508 for (i
= 2; i
<= start
; ++i
)
2510 elem
= duplicate_tree (elem
, dfa
);
2511 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2512 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2513 goto parse_dup_op_espace
;
2517 if (BE (end
!= start
, 1))
2519 dup_token
.type
= (end
== -1 ? OP_DUP_ASTERISK
: OP_DUP_QUESTION
);
2520 if (BE (start
> 0, 0))
2522 elem
= duplicate_tree (elem
, dfa
);
2523 if (BE (elem
== NULL
, 0))
2524 goto parse_dup_op_espace
;
2526 /* This subexpression will be marked as optional, so that
2527 empty matches do not touch the registers. */
2528 mark_opt_subexp (elem
, dfa
);
2530 /* Prepare the tree with the modifier. */
2531 elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2532 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2536 /* We do not need to duplicate the tree because we have not
2538 mark_opt_subexp (elem
, dfa
);
2539 tree
= elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2542 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2543 goto parse_dup_op_espace
;
2545 /* This loop is actually executed only when end != -1,
2546 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2547 already created the start+1-th copy. */
2548 for (i
= start
+ 2; i
<= end
; ++i
)
2550 elem
= duplicate_tree (elem
, dfa
);
2551 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2552 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2560 fetch_token (token
, regexp
, syntax
);
2563 parse_dup_op_espace
:
2568 /* Size of the names for collating symbol/equivalence_class/character_class.
2569 I'm not sure, but maybe enough. */
2570 #define BRACKET_NAME_BUF_SIZE 32
2573 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2574 Build the range expression which starts from START_ELEM, and ends
2575 at END_ELEM. The result are written to MBCSET and SBCSET.
2576 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2577 mbcset->range_ends, is a pointer argument sinse we may
2580 static reg_errcode_t
2581 # ifdef RE_ENABLE_I18N
2582 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2583 re_charset_t
*mbcset
;
2585 # else /* not RE_ENABLE_I18N */
2586 build_range_exp (sbcset
, start_elem
, end_elem
)
2587 # endif /* not RE_ENABLE_I18N */
2588 re_bitset_ptr_t sbcset
;
2589 bracket_elem_t
*start_elem
, *end_elem
;
2591 unsigned int start_ch
, end_ch
;
2592 /* Equivalence Classes and Character Classes can't be a range start/end. */
2593 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2594 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2598 /* We can handle no multi character collating elements without libc
2600 if (BE ((start_elem
->type
== COLL_SYM
2601 && strlen ((char *) start_elem
->opr
.name
) > 1)
2602 || (end_elem
->type
== COLL_SYM
2603 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2604 return REG_ECOLLATE
;
2606 # ifdef RE_ENABLE_I18N
2608 wchar_t wc
, start_wc
, end_wc
;
2609 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2611 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2612 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2614 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2615 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2617 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2618 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2619 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2620 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2621 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2622 return REG_ECOLLATE
;
2623 cmp_buf
[0] = start_wc
;
2624 cmp_buf
[4] = end_wc
;
2625 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2628 /* Got valid collation sequence values, add them as a new entry.
2629 However, for !_LIBC we have no collation elements: if the
2630 character set is single byte, the single byte character set
2631 that we build below suffices. parse_bracket_exp passes
2632 no MBCSET if dfa->mb_cur_max == 1. */
2635 /* Check the space of the arrays. */
2636 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2638 /* There is not enough space, need realloc. */
2639 wchar_t *new_array_start
, *new_array_end
;
2642 /* +1 in case of mbcset->nranges is 0. */
2643 new_nranges
= 2 * mbcset
->nranges
+ 1;
2644 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2645 are NULL if *range_alloc == 0. */
2646 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2648 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2651 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2654 mbcset
->range_starts
= new_array_start
;
2655 mbcset
->range_ends
= new_array_end
;
2656 *range_alloc
= new_nranges
;
2659 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2660 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2663 /* Build the table for single byte characters. */
2664 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2667 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2668 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2669 bitset_set (sbcset
, wc
);
2672 # else /* not RE_ENABLE_I18N */
2675 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2676 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2678 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2679 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2681 if (start_ch
> end_ch
)
2683 /* Build the table for single byte characters. */
2684 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2685 if (start_ch
<= ch
&& ch
<= end_ch
)
2686 bitset_set (sbcset
, ch
);
2688 # endif /* not RE_ENABLE_I18N */
2691 #endif /* not _LIBC */
2694 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2695 Build the collating element which is represented by NAME.
2696 The result are written to MBCSET and SBCSET.
2697 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698 pointer argument since we may update it. */
2700 static reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2703 re_charset_t
*mbcset
;
2704 int *coll_sym_alloc
;
2705 # else /* not RE_ENABLE_I18N */
2706 build_collating_symbol (sbcset
, name
)
2707 # endif /* not RE_ENABLE_I18N */
2708 re_bitset_ptr_t sbcset
;
2709 const unsigned char *name
;
2711 size_t name_len
= strlen ((const char *) name
);
2712 if (BE (name_len
!= 1, 0))
2713 return REG_ECOLLATE
;
2716 bitset_set (sbcset
, name
[0]);
2720 #endif /* not _LIBC */
2722 /* This function parse bracket expression like "[abc]", "[a-c]",
2726 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2727 re_string_t
*regexp
;
2730 reg_syntax_t syntax
;
2734 const unsigned char *collseqmb
;
2735 const char *collseqwc
;
2738 const int32_t *symb_table
;
2739 const unsigned char *extra
;
2741 /* Local function for parse_bracket_exp used in _LIBC environement.
2742 Seek the collating symbol entry correspondings to NAME.
2743 Return the index of the symbol in the SYMB_TABLE. */
2746 __attribute ((always_inline
))
2747 seek_collating_symbol_entry (name
, name_len
)
2748 const unsigned char *name
;
2751 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2752 int32_t elem
= hash
% table_size
;
2753 int32_t second
= hash
% (table_size
- 2);
2754 while (symb_table
[2 * elem
] != 0)
2756 /* First compare the hashing value. */
2757 if (symb_table
[2 * elem
] == hash
2758 /* Compare the length of the name. */
2759 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2760 /* Compare the name. */
2761 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2764 /* Yep, this is the entry. */
2774 /* Local function for parse_bracket_exp used in _LIBC environement.
2775 Look up the collation sequence value of BR_ELEM.
2776 Return the value if succeeded, UINT_MAX otherwise. */
2778 auto inline unsigned int
2779 __attribute ((always_inline
))
2780 lookup_collation_sequence_value (br_elem
)
2781 bracket_elem_t
*br_elem
;
2783 if (br_elem
->type
== SB_CHAR
)
2786 if (MB_CUR_MAX == 1)
2789 return collseqmb
[br_elem
->opr
.ch
];
2792 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2793 return __collseq_table_lookup (collseqwc
, wc
);
2796 else if (br_elem
->type
== MB_CHAR
)
2798 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2800 else if (br_elem
->type
== COLL_SYM
)
2802 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2806 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2808 if (symb_table
[2 * elem
] != 0)
2810 /* We found the entry. */
2811 idx
= symb_table
[2 * elem
+ 1];
2812 /* Skip the name of collating element name. */
2813 idx
+= 1 + extra
[idx
];
2814 /* Skip the byte sequence of the collating element. */
2815 idx
+= 1 + extra
[idx
];
2816 /* Adjust for the alignment. */
2817 idx
= (idx
+ 3) & ~3;
2818 /* Skip the multibyte collation sequence value. */
2819 idx
+= sizeof (unsigned int);
2820 /* Skip the wide char sequence of the collating element. */
2821 idx
+= sizeof (unsigned int) *
2822 (1 + *(unsigned int *) (extra
+ idx
));
2823 /* Return the collation sequence value. */
2824 return *(unsigned int *) (extra
+ idx
);
2826 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2828 /* No valid character. Match it as a single byte
2830 return collseqmb
[br_elem
->opr
.name
[0]];
2833 else if (sym_name_len
== 1)
2834 return collseqmb
[br_elem
->opr
.name
[0]];
2839 /* Local function for parse_bracket_exp used in _LIBC environement.
2840 Build the range expression which starts from START_ELEM, and ends
2841 at END_ELEM. The result are written to MBCSET and SBCSET.
2842 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2843 mbcset->range_ends, is a pointer argument sinse we may
2846 auto inline reg_errcode_t
2847 __attribute ((always_inline
))
2848 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2849 re_charset_t
*mbcset
;
2851 re_bitset_ptr_t sbcset
;
2852 bracket_elem_t
*start_elem
, *end_elem
;
2855 uint32_t start_collseq
;
2856 uint32_t end_collseq
;
2858 /* Equivalence Classes and Character Classes can't be a range
2860 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2861 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2865 start_collseq
= lookup_collation_sequence_value (start_elem
);
2866 end_collseq
= lookup_collation_sequence_value (end_elem
);
2867 /* Check start/end collation sequence values. */
2868 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2869 return REG_ECOLLATE
;
2870 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2873 /* Got valid collation sequence values, add them as a new entry.
2874 However, if we have no collation elements, and the character set
2875 is single byte, the single byte character set that we
2876 build below suffices. */
2877 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2879 /* Check the space of the arrays. */
2880 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2882 /* There is not enough space, need realloc. */
2883 uint32_t *new_array_start
;
2884 uint32_t *new_array_end
;
2887 /* +1 in case of mbcset->nranges is 0. */
2888 new_nranges
= 2 * mbcset
->nranges
+ 1;
2889 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2891 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2894 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2897 mbcset
->range_starts
= new_array_start
;
2898 mbcset
->range_ends
= new_array_end
;
2899 *range_alloc
= new_nranges
;
2902 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2903 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2906 /* Build the table for single byte characters. */
2907 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2909 uint32_t ch_collseq
;
2911 if (MB_CUR_MAX == 1)
2914 ch_collseq
= collseqmb
[ch
];
2916 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2917 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2918 bitset_set (sbcset
, ch
);
2923 /* Local function for parse_bracket_exp used in _LIBC environement.
2924 Build the collating element which is represented by NAME.
2925 The result are written to MBCSET and SBCSET.
2926 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2927 pointer argument sinse we may update it. */
2929 auto inline reg_errcode_t
2930 __attribute ((always_inline
))
2931 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2932 re_charset_t
*mbcset
;
2933 int *coll_sym_alloc
;
2934 re_bitset_ptr_t sbcset
;
2935 const unsigned char *name
;
2938 size_t name_len
= strlen ((const char *) name
);
2941 elem
= seek_collating_symbol_entry (name
, name_len
);
2942 if (symb_table
[2 * elem
] != 0)
2944 /* We found the entry. */
2945 idx
= symb_table
[2 * elem
+ 1];
2946 /* Skip the name of collating element name. */
2947 idx
+= 1 + extra
[idx
];
2949 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2951 /* No valid character, treat it as a normal
2953 bitset_set (sbcset
, name
[0]);
2957 return REG_ECOLLATE
;
2959 /* Got valid collation sequence, add it as a new entry. */
2960 /* Check the space of the arrays. */
2961 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2963 /* Not enough, realloc it. */
2964 /* +1 in case of mbcset->ncoll_syms is 0. */
2965 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2966 /* Use realloc since mbcset->coll_syms is NULL
2968 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2969 new_coll_sym_alloc
);
2970 if (BE (new_coll_syms
== NULL
, 0))
2972 mbcset
->coll_syms
= new_coll_syms
;
2973 *coll_sym_alloc
= new_coll_sym_alloc
;
2975 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2980 if (BE (name_len
!= 1, 0))
2981 return REG_ECOLLATE
;
2984 bitset_set (sbcset
, name
[0]);
2991 re_token_t br_token
;
2992 re_bitset_ptr_t sbcset
;
2993 #ifdef RE_ENABLE_I18N
2994 re_charset_t
*mbcset
;
2995 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2996 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2997 #endif /* not RE_ENABLE_I18N */
2999 bin_tree_t
*work_tree
;
3001 int first_round
= 1;
3003 collseqmb
= (const unsigned char *)
3004 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3005 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3011 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3012 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3013 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3014 _NL_COLLATE_SYMB_TABLEMB
);
3015 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3016 _NL_COLLATE_SYMB_EXTRAMB
);
3019 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3020 #ifdef RE_ENABLE_I18N
3021 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3022 #endif /* RE_ENABLE_I18N */
3023 #ifdef RE_ENABLE_I18N
3024 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3026 if (BE (sbcset
== NULL
, 0))
3027 #endif /* RE_ENABLE_I18N */
3033 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3034 if (BE (token
->type
== END_OF_RE
, 0))
3037 goto parse_bracket_exp_free_return
;
3039 if (token
->type
== OP_NON_MATCH_LIST
)
3041 #ifdef RE_ENABLE_I18N
3042 mbcset
->non_match
= 1;
3043 #endif /* not RE_ENABLE_I18N */
3045 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3046 bitset_set (sbcset
, '\0');
3047 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3048 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3049 if (BE (token
->type
== END_OF_RE
, 0))
3052 goto parse_bracket_exp_free_return
;
3056 /* We treat the first ']' as a normal character. */
3057 if (token
->type
== OP_CLOSE_BRACKET
)
3058 token
->type
= CHARACTER
;
3062 bracket_elem_t start_elem
, end_elem
;
3063 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3064 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3066 int token_len2
= 0, is_range_exp
= 0;
3069 start_elem
.opr
.name
= start_name_buf
;
3070 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3071 syntax
, first_round
);
3072 if (BE (ret
!= REG_NOERROR
, 0))
3075 goto parse_bracket_exp_free_return
;
3079 /* Get information about the next token. We need it in any case. */
3080 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3082 /* Do not check for ranges if we know they are not allowed. */
3083 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3085 if (BE (token
->type
== END_OF_RE
, 0))
3088 goto parse_bracket_exp_free_return
;
3090 if (token
->type
== OP_CHARSET_RANGE
)
3092 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3093 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3094 if (BE (token2
.type
== END_OF_RE
, 0))
3097 goto parse_bracket_exp_free_return
;
3099 if (token2
.type
== OP_CLOSE_BRACKET
)
3101 /* We treat the last '-' as a normal character. */
3102 re_string_skip_bytes (regexp
, -token_len
);
3103 token
->type
= CHARACTER
;
3110 if (is_range_exp
== 1)
3112 end_elem
.opr
.name
= end_name_buf
;
3113 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3115 if (BE (ret
!= REG_NOERROR
, 0))
3118 goto parse_bracket_exp_free_return
;
3121 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3124 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3125 &start_elem
, &end_elem
);
3127 # ifdef RE_ENABLE_I18N
3128 *err
= build_range_exp (sbcset
,
3129 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3130 &range_alloc
, &start_elem
, &end_elem
);
3132 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3134 #endif /* RE_ENABLE_I18N */
3135 if (BE (*err
!= REG_NOERROR
, 0))
3136 goto parse_bracket_exp_free_return
;
3140 switch (start_elem
.type
)
3143 bitset_set (sbcset
, start_elem
.opr
.ch
);
3145 #ifdef RE_ENABLE_I18N
3147 /* Check whether the array has enough space. */
3148 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3150 wchar_t *new_mbchars
;
3151 /* Not enough, realloc it. */
3152 /* +1 in case of mbcset->nmbchars is 0. */
3153 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3154 /* Use realloc since array is NULL if *alloc == 0. */
3155 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3157 if (BE (new_mbchars
== NULL
, 0))
3158 goto parse_bracket_exp_espace
;
3159 mbcset
->mbchars
= new_mbchars
;
3161 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3163 #endif /* RE_ENABLE_I18N */
3165 *err
= build_equiv_class (sbcset
,
3166 #ifdef RE_ENABLE_I18N
3167 mbcset
, &equiv_class_alloc
,
3168 #endif /* RE_ENABLE_I18N */
3169 start_elem
.opr
.name
);
3170 if (BE (*err
!= REG_NOERROR
, 0))
3171 goto parse_bracket_exp_free_return
;
3174 *err
= build_collating_symbol (sbcset
,
3175 #ifdef RE_ENABLE_I18N
3176 mbcset
, &coll_sym_alloc
,
3177 #endif /* RE_ENABLE_I18N */
3178 start_elem
.opr
.name
);
3179 if (BE (*err
!= REG_NOERROR
, 0))
3180 goto parse_bracket_exp_free_return
;
3183 *err
= build_charclass (regexp
->trans
, sbcset
,
3184 #ifdef RE_ENABLE_I18N
3185 mbcset
, &char_class_alloc
,
3186 #endif /* RE_ENABLE_I18N */
3187 start_elem
.opr
.name
, syntax
);
3188 if (BE (*err
!= REG_NOERROR
, 0))
3189 goto parse_bracket_exp_free_return
;
3196 if (BE (token
->type
== END_OF_RE
, 0))
3199 goto parse_bracket_exp_free_return
;
3201 if (token
->type
== OP_CLOSE_BRACKET
)
3205 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3207 /* If it is non-matching list. */
3209 bitset_not (sbcset
);
3211 #ifdef RE_ENABLE_I18N
3212 /* Ensure only single byte characters are set. */
3213 if (dfa
->mb_cur_max
> 1)
3214 bitset_mask (sbcset
, dfa
->sb_char
);
3215 #endif /* RE_ENABLE_I18N */
3217 /* Build a tree for simple bracket. */
3218 br_token
.type
= SIMPLE_BRACKET
;
3219 br_token
.opr
.sbcset
= sbcset
;
3220 work_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3221 if (BE (work_tree
== NULL
, 0))
3222 goto parse_bracket_exp_espace
;
3224 #ifdef RE_ENABLE_I18N
3225 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3226 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3227 || mbcset
->non_match
)))
3229 re_token_t alt_token
;
3230 bin_tree_t
*mbc_tree
;
3232 /* Build a tree for complex bracket. */
3233 dfa
->has_mb_node
= 1;
3234 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3235 if (sbcset
[sbc_idx
])
3237 /* If there are no bits set in sbcset, there is no point
3238 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3239 if (sbc_idx
== BITSET_UINTS
)
3242 dfa
->nodes
[work_tree
->node_idx
].type
= COMPLEX_BRACKET
;
3243 dfa
->nodes
[work_tree
->node_idx
].opr
.mbcset
= mbcset
;
3246 br_token
.type
= COMPLEX_BRACKET
;
3247 br_token
.opr
.mbcset
= mbcset
;
3248 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3249 if (BE (mbc_tree
== NULL
, 0))
3250 goto parse_bracket_exp_espace
;
3251 /* Then join them by ALT node. */
3252 alt_token
.type
= OP_ALT
;
3253 dfa
->has_plural_match
= 1;
3254 work_tree
= re_dfa_add_tree_node (dfa
, work_tree
, mbc_tree
, &alt_token
);
3255 if (BE (mbc_tree
!= NULL
, 1))
3260 free_charset (mbcset
);
3263 #else /* not RE_ENABLE_I18N */
3265 #endif /* not RE_ENABLE_I18N */
3267 parse_bracket_exp_espace
:
3269 parse_bracket_exp_free_return
:
3271 #ifdef RE_ENABLE_I18N
3272 free_charset (mbcset
);
3273 #endif /* RE_ENABLE_I18N */
3277 /* Parse an element in the bracket expression. */
3279 static reg_errcode_t
3280 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3282 bracket_elem_t
*elem
;
3283 re_string_t
*regexp
;
3287 reg_syntax_t syntax
;
3290 #ifdef RE_ENABLE_I18N
3292 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3293 if (cur_char_size
> 1)
3295 elem
->type
= MB_CHAR
;
3296 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3297 re_string_skip_bytes (regexp
, cur_char_size
);
3300 #endif /* RE_ENABLE_I18N */
3301 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3302 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3303 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3304 return parse_bracket_symbol (elem
, regexp
, token
);
3305 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3307 /* A '-' must only appear as anything but a range indicator before
3308 the closing bracket. Everything else is an error. */
3310 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3311 if (token2
.type
!= OP_CLOSE_BRACKET
)
3312 /* The actual error value is not standardized since this whole
3313 case is undefined. But ERANGE makes good sense. */
3316 elem
->type
= SB_CHAR
;
3317 elem
->opr
.ch
= token
->opr
.c
;
3321 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3322 such as [:<character_class>:], [.<collating_element>.], and
3323 [=<equivalent_class>=]. */
3325 static reg_errcode_t
3326 parse_bracket_symbol (elem
, regexp
, token
)
3327 bracket_elem_t
*elem
;
3328 re_string_t
*regexp
;
3331 unsigned char ch
, delim
= token
->opr
.c
;
3333 if (re_string_eoi(regexp
))
3337 if (i
>= BRACKET_NAME_BUF_SIZE
)
3339 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3340 ch
= re_string_fetch_byte_case (regexp
);
3342 ch
= re_string_fetch_byte (regexp
);
3343 if (re_string_eoi(regexp
))
3345 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3347 elem
->opr
.name
[i
] = ch
;
3349 re_string_skip_bytes (regexp
, 1);
3350 elem
->opr
.name
[i
] = '\0';
3351 switch (token
->type
)
3353 case OP_OPEN_COLL_ELEM
:
3354 elem
->type
= COLL_SYM
;
3356 case OP_OPEN_EQUIV_CLASS
:
3357 elem
->type
= EQUIV_CLASS
;
3359 case OP_OPEN_CHAR_CLASS
:
3360 elem
->type
= CHAR_CLASS
;
3368 /* Helper function for parse_bracket_exp.
3369 Build the equivalence class which is represented by NAME.
3370 The result are written to MBCSET and SBCSET.
3371 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3372 is a pointer argument sinse we may update it. */
3374 static reg_errcode_t
3375 #ifdef RE_ENABLE_I18N
3376 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3377 re_charset_t
*mbcset
;
3378 int *equiv_class_alloc
;
3379 #else /* not RE_ENABLE_I18N */
3380 build_equiv_class (sbcset
, name
)
3381 #endif /* not RE_ENABLE_I18N */
3382 re_bitset_ptr_t sbcset
;
3383 const unsigned char *name
;
3386 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3389 const int32_t *table
, *indirect
;
3390 const unsigned char *weights
, *extra
, *cp
;
3391 unsigned char char_buf
[2];
3395 /* This #include defines a local function! */
3396 # include <locale/weight.h>
3397 /* Calculate the index for equivalence class. */
3399 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3400 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3401 _NL_COLLATE_WEIGHTMB
);
3402 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3403 _NL_COLLATE_EXTRAMB
);
3404 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3405 _NL_COLLATE_INDIRECTMB
);
3406 idx1
= findidx (&cp
);
3407 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3408 /* This isn't a valid character. */
3409 return REG_ECOLLATE
;
3411 /* Build single byte matcing table for this equivalence class. */
3412 char_buf
[1] = (unsigned char) '\0';
3413 len
= weights
[idx1
];
3414 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3418 idx2
= findidx (&cp
);
3423 /* This isn't a valid character. */
3425 if (len
== weights
[idx2
])
3428 while (cnt
<= len
&&
3429 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3433 bitset_set (sbcset
, ch
);
3436 /* Check whether the array has enough space. */
3437 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3439 /* Not enough, realloc it. */
3440 /* +1 in case of mbcset->nequiv_classes is 0. */
3441 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3442 /* Use realloc since the array is NULL if *alloc == 0. */
3443 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3445 new_equiv_class_alloc
);
3446 if (BE (new_equiv_classes
== NULL
, 0))
3448 mbcset
->equiv_classes
= new_equiv_classes
;
3449 *equiv_class_alloc
= new_equiv_class_alloc
;
3451 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3456 if (BE (strlen ((const char *) name
) != 1, 0))
3457 return REG_ECOLLATE
;
3458 bitset_set (sbcset
, *name
);
3463 /* Helper function for parse_bracket_exp.
3464 Build the character class which is represented by NAME.
3465 The result are written to MBCSET and SBCSET.
3466 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3467 is a pointer argument sinse we may update it. */
3469 static reg_errcode_t
3470 #ifdef RE_ENABLE_I18N
3471 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3472 re_charset_t
*mbcset
;
3473 int *char_class_alloc
;
3474 #else /* not RE_ENABLE_I18N */
3475 build_charclass (trans
, sbcset
, class_name
, syntax
)
3476 #endif /* not RE_ENABLE_I18N */
3477 unsigned RE_TRANSLATE_TYPE trans
;
3478 re_bitset_ptr_t sbcset
;
3479 const unsigned char *class_name
;
3480 reg_syntax_t syntax
;
3483 const char *name
= (const char *) class_name
;
3485 /* In case of REG_ICASE "upper" and "lower" match the both of
3486 upper and lower cases. */
3487 if ((syntax
& RE_ICASE
)
3488 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3491 #ifdef RE_ENABLE_I18N
3492 /* Check the space of the arrays. */
3493 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nchar_classes is 0. */
3497 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3498 /* Use realloc since array is NULL if *alloc == 0. */
3499 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3500 new_char_class_alloc
);
3501 if (BE (new_char_classes
== NULL
, 0))
3503 mbcset
->char_classes
= new_char_classes
;
3504 *char_class_alloc
= new_char_class_alloc
;
3506 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3507 #endif /* RE_ENABLE_I18N */
3509 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3510 for (i = 0; i < SBC_MAX; ++i) \
3512 if (ctype_func (i)) \
3514 int ch = trans ? trans[i] : i; \
3515 bitset_set (sbcset, ch); \
3519 if (strcmp (name
, "alnum") == 0)
3520 BUILD_CHARCLASS_LOOP (isalnum
)
3521 else if (strcmp (name
, "cntrl") == 0)
3522 BUILD_CHARCLASS_LOOP (iscntrl
)
3523 else if (strcmp (name
, "lower") == 0)
3524 BUILD_CHARCLASS_LOOP (islower
)
3525 else if (strcmp (name
, "space") == 0)
3526 BUILD_CHARCLASS_LOOP (isspace
)
3527 else if (strcmp (name
, "alpha") == 0)
3528 BUILD_CHARCLASS_LOOP (isalpha
)
3529 else if (strcmp (name
, "digit") == 0)
3530 BUILD_CHARCLASS_LOOP (isdigit
)
3531 else if (strcmp (name
, "print") == 0)
3532 BUILD_CHARCLASS_LOOP (isprint
)
3533 else if (strcmp (name
, "upper") == 0)
3534 BUILD_CHARCLASS_LOOP (isupper
)
3535 else if (strcmp (name
, "blank") == 0)
3536 BUILD_CHARCLASS_LOOP (isblank
)
3537 else if (strcmp (name
, "graph") == 0)
3538 BUILD_CHARCLASS_LOOP (isgraph
)
3539 else if (strcmp (name
, "punct") == 0)
3540 BUILD_CHARCLASS_LOOP (ispunct
)
3541 else if (strcmp (name
, "xdigit") == 0)
3542 BUILD_CHARCLASS_LOOP (isxdigit
)
3550 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3552 unsigned RE_TRANSLATE_TYPE trans
;
3553 const unsigned char *class_name
;
3554 const unsigned char *extra
;
3558 re_bitset_ptr_t sbcset
;
3559 #ifdef RE_ENABLE_I18N
3560 re_charset_t
*mbcset
;
3562 #endif /* not RE_ENABLE_I18N */
3564 re_token_t br_token
;
3567 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3568 #ifdef RE_ENABLE_I18N
3569 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3570 #endif /* RE_ENABLE_I18N */
3572 #ifdef RE_ENABLE_I18N
3573 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3574 #else /* not RE_ENABLE_I18N */
3575 if (BE (sbcset
== NULL
, 0))
3576 #endif /* not RE_ENABLE_I18N */
3584 #ifdef RE_ENABLE_I18N
3586 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3587 bitset_set(cset->sbcset, '\0');
3589 mbcset
->non_match
= 1;
3590 #endif /* not RE_ENABLE_I18N */
3593 /* We don't care the syntax in this case. */
3594 ret
= build_charclass (trans
, sbcset
,
3595 #ifdef RE_ENABLE_I18N
3597 #endif /* RE_ENABLE_I18N */
3600 if (BE (ret
!= REG_NOERROR
, 0))
3603 #ifdef RE_ENABLE_I18N
3604 free_charset (mbcset
);
3605 #endif /* RE_ENABLE_I18N */
3609 /* \w match '_' also. */
3610 for (; *extra
; extra
++)
3611 bitset_set (sbcset
, *extra
);
3613 /* If it is non-matching list. */
3615 bitset_not (sbcset
);
3617 #ifdef RE_ENABLE_I18N
3618 /* Ensure only single byte characters are set. */
3619 if (dfa
->mb_cur_max
> 1)
3620 bitset_mask (sbcset
, dfa
->sb_char
);
3623 /* Build a tree for simple bracket. */
3624 br_token
.type
= SIMPLE_BRACKET
;
3625 br_token
.opr
.sbcset
= sbcset
;
3626 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3627 if (BE (tree
== NULL
, 0))
3628 goto build_word_op_espace
;
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa
->mb_cur_max
> 1)
3633 re_token_t alt_token
;
3634 bin_tree_t
*mbc_tree
;
3635 /* Build a tree for complex bracket. */
3636 br_token
.type
= COMPLEX_BRACKET
;
3637 br_token
.opr
.mbcset
= mbcset
;
3638 dfa
->has_mb_node
= 1;
3639 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3640 if (BE (mbc_tree
== NULL
, 0))
3641 goto build_word_op_espace
;
3642 /* Then join them by ALT node. */
3643 alt_token
.type
= OP_ALT
;
3644 dfa
->has_plural_match
= 1;
3645 tree
= re_dfa_add_tree_node (dfa
, tree
, mbc_tree
, &alt_token
);
3646 if (BE (mbc_tree
!= NULL
, 1))
3651 free_charset (mbcset
);
3654 #else /* not RE_ENABLE_I18N */
3656 #endif /* not RE_ENABLE_I18N */
3658 build_word_op_espace
:
3660 #ifdef RE_ENABLE_I18N
3661 free_charset (mbcset
);
3662 #endif /* RE_ENABLE_I18N */
3667 /* This is intended for the expressions like "a{1,3}".
3668 Fetch a number from `input', and return the number.
3669 Return -1, if the number field is empty like "{,1}".
3670 Return -2, If an error is occured. */
3673 fetch_number (input
, token
, syntax
)
3676 reg_syntax_t syntax
;
3682 fetch_token (token
, input
, syntax
);
3684 if (BE (token
->type
== END_OF_RE
, 0))
3686 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3688 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3689 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3690 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3695 #ifdef RE_ENABLE_I18N
3697 free_charset (re_charset_t
*cset
)
3699 re_free (cset
->mbchars
);
3701 re_free (cset
->coll_syms
);
3702 re_free (cset
->equiv_classes
);
3703 re_free (cset
->range_starts
);
3704 re_free (cset
->range_ends
);
3706 re_free (cset
->char_classes
);
3709 #endif /* RE_ENABLE_I18N */
3711 /* Functions for binary tree operation. */
3713 /* Create a tree node. */
3716 create_tree (dfa
, left
, right
, type
, index
)
3720 re_token_type_t type
;
3724 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3726 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3728 if (storage
== NULL
)
3730 storage
->next
= dfa
->str_tree_storage
;
3731 dfa
->str_tree_storage
= storage
;
3732 dfa
->str_tree_storage_idx
= 0;
3734 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3736 tree
->parent
= NULL
;
3738 tree
->right
= right
;
3740 tree
->node_idx
= index
;
3743 re_node_set_init_empty (&tree
->eclosure
);
3746 left
->parent
= tree
;
3748 right
->parent
= tree
;
3752 /* Create both a DFA node and a tree for it. */
3755 re_dfa_add_tree_node (dfa
, left
, right
, token
)
3759 const re_token_t
*token
;
3761 int new_idx
= re_dfa_add_node (dfa
, *token
, 0);
3766 return create_tree (dfa
, left
, right
, 0, new_idx
);
3769 /* Mark the tree SRC as an optional subexpression. */
3772 mark_opt_subexp (src
, dfa
)
3773 const bin_tree_t
*src
;
3776 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3778 if (src
->type
== CONCAT
3779 && src
->left
->type
== NON_TYPE
3780 && dfa
->nodes
[src
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
3781 mark_opt_subexp_iter (src
, dfa
, dfa
->nodes
[src
->left
->node_idx
].opr
.idx
);
3785 /* Recursive tree walker for mark_opt_subexp. */
3788 mark_opt_subexp_iter (src
, dfa
, idx
)
3789 const bin_tree_t
*src
;
3795 if (src
->type
== NON_TYPE
)
3797 node_idx
= src
->node_idx
;
3798 if ((dfa
->nodes
[node_idx
].type
== OP_OPEN_SUBEXP
3799 || dfa
->nodes
[node_idx
].type
== OP_CLOSE_SUBEXP
)
3800 && dfa
->nodes
[node_idx
].opr
.idx
== idx
)
3801 dfa
->nodes
[node_idx
].opt_subexp
= 1;
3804 if (src
->left
!= NULL
)
3805 mark_opt_subexp_iter (src
->left
, dfa
, idx
);
3807 if (src
->right
!= NULL
)
3808 mark_opt_subexp_iter (src
->right
, dfa
, idx
);
3812 /* Duplicate the node SRC, and return new node. */
3815 duplicate_tree (src
, dfa
)
3816 const bin_tree_t
*src
;
3819 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3821 /* Since node indies must be according to Post-order of the tree,
3822 we must duplicate the left at first. */
3823 if (src
->left
!= NULL
)
3825 left
= duplicate_tree (src
->left
, dfa
);
3830 /* Secondaly, duplicate the right. */
3831 if (src
->right
!= NULL
)
3833 right
= duplicate_tree (src
->right
, dfa
);
3838 /* At last, duplicate itself. */
3839 if (src
->type
== NON_TYPE
)
3841 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3842 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3843 if (BE (new_node_idx
== -1, 0))
3847 new_node_idx
= src
->type
;
3849 new_tree
= create_tree (dfa
, left
, right
, src
->type
, new_node_idx
);