1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
50 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 int *equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 int *char_class_alloc
,
98 const unsigned char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const unsigned char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 const unsigned char *extra
,
112 int non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid
[] attribute_hidden
=
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx
[] attribute_hidden
=
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
214 re_compile_pattern (pattern
, length
, bufp
)
217 struct re_pattern_buffer
*bufp
;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
226 /* Match anchors at newline. */
227 bufp
->newline_anchor
= 1;
229 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
233 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
236 weak_alias (__re_compile_pattern
, re_compile_pattern
)
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options
;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
255 re_set_syntax (syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (bufp
)
269 struct re_pattern_buffer
*bufp
;
271 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
272 char *fastmap
= bufp
->fastmap
;
274 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
275 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
276 if (dfa
->init_state
!= dfa
->init_state_word
)
277 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
278 if (dfa
->init_state
!= dfa
->init_state_nl
)
279 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
280 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
281 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
282 bufp
->fastmap_accurate
= 1;
286 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
290 __attribute ((always_inline
))
291 re_set_fastmap (char *fastmap
, int icase
, int ch
)
295 fastmap
[tolower (ch
)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
302 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
305 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
307 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
308 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
310 int node
= init_state
->nodes
.elems
[node_cnt
];
311 re_token_type_t type
= dfa
->nodes
[node
].type
;
313 if (type
== CHARACTER
)
315 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
319 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, 0, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
355 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
356 || cset
->nranges
|| cset
->nchar_classes
)
359 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
361 /* In this case we want to catch the bytes which are
362 the first byte of any collation elements.
363 e.g. In da_DK, we want to catch 'a' since "aa"
364 is a valid collation element, and don't catch
365 'b' since 'b' is the only collation element
366 which starts from 'b'. */
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
374 if (dfa
->mb_cur_max
> 1)
375 for (i
= 0; i
< SBC_MAX
; ++i
)
376 if (__btowc (i
) == WEOF
)
377 re_set_fastmap (fastmap
, icase
, i
);
378 # endif /* not _LIBC */
380 for (i
= 0; i
< cset
->nmbchars
; ++i
)
384 memset (&state
, '\0', sizeof (state
));
385 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
386 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
387 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
389 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
391 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
395 #endif /* RE_ENABLE_I18N */
396 else if (type
== OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 || type
== OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 || type
== END_OF_RE
)
402 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
403 if (type
== END_OF_RE
)
404 bufp
->can_be_null
= 1;
410 /* Entry point for POSIX code. */
411 /* regcomp takes a regular expression as a string and compiles it.
413 PREG is a regex_t *. We do not expect any fields to be initialized,
414 since POSIX says we shouldn't. Thus, we set
416 `buffer' to the compiled pattern;
417 `used' to the length of the compiled pattern;
418 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419 REG_EXTENDED bit in CFLAGS is set; otherwise, to
420 RE_SYNTAX_POSIX_BASIC;
421 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422 `fastmap' to an allocated space for the fastmap;
423 `fastmap_accurate' to zero;
424 `re_nsub' to the number of subexpressions in PATTERN.
426 PATTERN is the address of the pattern string.
428 CFLAGS is a series of bits which affect compilation.
430 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431 use POSIX basic syntax.
433 If REG_NEWLINE is set, then . and [^...] don't match newline.
434 Also, regexec will try a match beginning after every newline.
436 If REG_ICASE is set, then we considers upper- and lowercase
437 versions of letters to be equivalent when matching.
439 If REG_NOSUB is set, then when PREG is passed to regexec, that
440 routine will report only success or failure, and nothing about the
443 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
444 the return codes and their meanings.) */
447 regcomp (preg
, pattern
, cflags
)
448 regex_t
*__restrict preg
;
449 const char *__restrict pattern
;
453 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
454 : RE_SYNTAX_POSIX_BASIC
);
460 /* Try to allocate space for the fastmap. */
461 preg
->fastmap
= re_malloc (char, SBC_MAX
);
462 if (BE (preg
->fastmap
== NULL
, 0))
465 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
467 /* If REG_NEWLINE is set, newlines are treated differently. */
468 if (cflags
& REG_NEWLINE
)
469 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
470 syntax
&= ~RE_DOT_NEWLINE
;
471 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
472 /* It also changes the matching behavior. */
473 preg
->newline_anchor
= 1;
476 preg
->newline_anchor
= 0;
477 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
478 preg
->translate
= NULL
;
480 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
482 /* POSIX doesn't distinguish between an unmatched open-group and an
483 unmatched close-group: both are REG_EPAREN. */
484 if (ret
== REG_ERPAREN
)
487 /* We have already checked preg->fastmap != NULL. */
488 if (BE (ret
== REG_NOERROR
, 1))
489 /* Compute the fastmap now, since regexec cannot modify the pattern
490 buffer. This function never fails in this implementation. */
491 (void) re_compile_fastmap (preg
);
494 /* Some error occurred while compiling the expression. */
495 re_free (preg
->fastmap
);
496 preg
->fastmap
= NULL
;
502 weak_alias (__regcomp
, regcomp
)
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506 from either regcomp or regexec. We don't use PREG here. */
509 regerror (errcode
, preg
, errbuf
, errbuf_size
)
511 const regex_t
*__restrict preg
;
512 char *__restrict errbuf
;
519 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
520 / sizeof (__re_error_msgid_idx
[0])), 0))
521 /* Only error codes returned by the rest of the code should be passed
522 to this routine. If we are given anything else, or if other regex
523 code generates an invalid error code, then the program has a bug.
524 Dump core so we can fix it. */
527 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
529 msg_size
= strlen (msg
) + 1; /* Includes the null. */
531 if (BE (errbuf_size
!= 0, 1))
533 if (BE (msg_size
> errbuf_size
, 0))
535 #if defined HAVE_MEMPCPY || defined _LIBC
536 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
538 memcpy (errbuf
, msg
, errbuf_size
- 1);
539 errbuf
[errbuf_size
- 1] = 0;
543 memcpy (errbuf
, msg
, msg_size
);
549 weak_alias (__regerror
, regerror
)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map
=
560 /* Set the first 128 bits. */
561 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
567 free_dfa_content (re_dfa_t
*dfa
)
572 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
573 free_token (dfa
->nodes
+ i
);
574 re_free (dfa
->nexts
);
575 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
577 if (dfa
->eclosures
!= NULL
)
578 re_node_set_free (dfa
->eclosures
+ i
);
579 if (dfa
->inveclosures
!= NULL
)
580 re_node_set_free (dfa
->inveclosures
+ i
);
581 if (dfa
->edests
!= NULL
)
582 re_node_set_free (dfa
->edests
+ i
);
584 re_free (dfa
->edests
);
585 re_free (dfa
->eclosures
);
586 re_free (dfa
->inveclosures
);
587 re_free (dfa
->nodes
);
589 if (dfa
->state_table
)
590 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
592 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
593 for (j
= 0; j
< entry
->num
; ++j
)
595 re_dfastate_t
*state
= entry
->array
[j
];
598 re_free (entry
->array
);
600 re_free (dfa
->state_table
);
601 #ifdef RE_ENABLE_I18N
602 if (dfa
->sb_char
!= utf8_sb_map
)
603 re_free (dfa
->sb_char
);
605 re_free (dfa
->subexp_map
);
607 re_free (dfa
->re_str
);
614 /* Free dynamically allocated space used by PREG. */
620 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
621 if (BE (dfa
!= NULL
, 1))
622 free_dfa_content (dfa
);
626 re_free (preg
->fastmap
);
627 preg
->fastmap
= NULL
;
629 re_free (preg
->translate
);
630 preg
->translate
= NULL
;
633 weak_alias (__regfree
, regfree
)
636 /* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
639 #if defined _REGEX_RE_COMP || defined _LIBC
641 /* BSD has one and only one pattern buffer. */
642 static struct re_pattern_buffer re_comp_buf
;
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647 these names if they don't use our functions, and still use
648 regcomp/regexec above without link errors. */
659 if (!re_comp_buf
.buffer
)
660 return gettext ("No previous regular expression");
664 if (re_comp_buf
.buffer
)
666 fastmap
= re_comp_buf
.fastmap
;
667 re_comp_buf
.fastmap
= NULL
;
668 __regfree (&re_comp_buf
);
669 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
670 re_comp_buf
.fastmap
= fastmap
;
673 if (re_comp_buf
.fastmap
== NULL
)
675 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
676 if (re_comp_buf
.fastmap
== NULL
)
677 return (char *) gettext (__re_error_msgid
678 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
681 /* Since `re_exec' always passes NULL for the `regs' argument, we
682 don't need to initialize the pattern buffer fields which affect it. */
684 /* Match anchors at newlines. */
685 re_comp_buf
.newline_anchor
= 1;
687 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
692 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
693 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
697 libc_freeres_fn (free_mem
)
699 __regfree (&re_comp_buf
);
703 #endif /* _REGEX_RE_COMP */
705 /* Internal entry point.
706 Compile the regular expression PATTERN, whose length is LENGTH.
707 SYNTAX indicate regular expression's syntax. */
710 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
713 reg_errcode_t err
= REG_NOERROR
;
717 /* Initialize the pattern buffer. */
718 preg
->fastmap_accurate
= 0;
719 preg
->syntax
= syntax
;
720 preg
->not_bol
= preg
->not_eol
= 0;
723 preg
->can_be_null
= 0;
724 preg
->regs_allocated
= REGS_UNALLOCATED
;
726 /* Initialize the dfa. */
727 dfa
= (re_dfa_t
*) preg
->buffer
;
728 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
730 /* If zero allocated, but buffer is non-null, try to realloc
731 enough space. This loses if buffer's address is bogus, but
732 that is the user's responsibility. If ->buffer is NULL this
733 is a simple allocation. */
734 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
737 preg
->allocated
= sizeof (re_dfa_t
);
738 preg
->buffer
= (unsigned char *) dfa
;
740 preg
->used
= sizeof (re_dfa_t
);
742 err
= init_dfa (dfa
, length
);
743 if (BE (err
!= REG_NOERROR
, 0))
745 free_dfa_content (dfa
);
751 /* Note: length+1 will not overflow since it is checked in init_dfa. */
752 dfa
->re_str
= re_malloc (char, length
+ 1);
753 strncpy (dfa
->re_str
, pattern
, length
+ 1);
756 __libc_lock_init (dfa
->lock
);
758 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
759 syntax
& RE_ICASE
, dfa
);
760 if (BE (err
!= REG_NOERROR
, 0))
762 re_compile_internal_free_return
:
763 free_workarea_compile (preg
);
764 re_string_destruct (®exp
);
765 free_dfa_content (dfa
);
771 /* Parse the regular expression, and build a structure tree. */
773 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
774 if (BE (dfa
->str_tree
== NULL
, 0))
775 goto re_compile_internal_free_return
;
777 /* Analyze the tree and create the nfa. */
778 err
= analyze (preg
);
779 if (BE (err
!= REG_NOERROR
, 0))
780 goto re_compile_internal_free_return
;
782 #ifdef RE_ENABLE_I18N
783 /* If possible, do searching in single byte encoding to speed things up. */
784 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
788 /* Then create the initial state of the dfa. */
789 err
= create_initial_state (dfa
);
791 /* Release work areas. */
792 free_workarea_compile (preg
);
793 re_string_destruct (®exp
);
795 if (BE (err
!= REG_NOERROR
, 0))
797 free_dfa_content (dfa
);
805 /* Initialize DFA. We use the length of the regular expression PAT_LEN
806 as the initial length of some arrays. */
809 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
811 unsigned int table_size
;
816 memset (dfa
, '\0', sizeof (re_dfa_t
));
818 /* Force allocation of str_tree_storage the first time. */
819 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
821 /* Avoid overflows. */
822 if (pat_len
== SIZE_MAX
)
825 dfa
->nodes_alloc
= pat_len
+ 1;
826 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
828 /* table_size = 2 ^ ceil(log pat_len) */
829 for (table_size
= 1; ; table_size
<<= 1)
830 if (table_size
> pat_len
)
833 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
834 dfa
->state_hash_mask
= table_size
- 1;
836 dfa
->mb_cur_max
= MB_CUR_MAX
;
838 if (dfa
->mb_cur_max
== 6
839 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
841 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
844 # ifdef HAVE_LANGINFO_CODESET
845 codeset_name
= nl_langinfo (CODESET
);
847 codeset_name
= getenv ("LC_ALL");
848 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
849 codeset_name
= getenv ("LC_CTYPE");
850 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
851 codeset_name
= getenv ("LANG");
852 if (codeset_name
== NULL
)
854 else if (strchr (codeset_name
, '.') != NULL
)
855 codeset_name
= strchr (codeset_name
, '.') + 1;
858 if (strcasecmp (codeset_name
, "UTF-8") == 0
859 || strcasecmp (codeset_name
, "UTF8") == 0)
862 /* We check exhaustively in the loop below if this charset is a
863 superset of ASCII. */
864 dfa
->map_notascii
= 0;
867 #ifdef RE_ENABLE_I18N
868 if (dfa
->mb_cur_max
> 1)
871 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
876 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
877 if (BE (dfa
->sb_char
== NULL
, 0))
880 /* Set the bits corresponding to single byte chars. */
881 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
882 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
884 wint_t wch
= __btowc (ch
);
886 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
888 if (isascii (ch
) && wch
!= ch
)
889 dfa
->map_notascii
= 1;
896 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
901 /* Initialize WORD_CHAR table, which indicate which character is
902 "word". In this case "word" means that it is the word construction
903 character used by some operators like "\<", "\>", etc. */
907 init_word_char (re_dfa_t
*dfa
)
910 dfa
->word_ops_used
= 1;
911 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
912 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
913 if (isalnum (ch
) || ch
== '_')
914 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
917 /* Free the work area which are only used while compiling. */
920 free_workarea_compile (regex_t
*preg
)
922 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
923 bin_tree_storage_t
*storage
, *next
;
924 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
926 next
= storage
->next
;
929 dfa
->str_tree_storage
= NULL
;
930 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
931 dfa
->str_tree
= NULL
;
932 re_free (dfa
->org_indices
);
933 dfa
->org_indices
= NULL
;
936 /* Create initial states for all contexts. */
939 create_initial_state (re_dfa_t
*dfa
)
943 re_node_set init_nodes
;
945 /* Initial states have the epsilon closure of the node which is
946 the first node of the regular expression. */
947 first
= dfa
->str_tree
->first
->node_idx
;
948 dfa
->init_node
= first
;
949 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
950 if (BE (err
!= REG_NOERROR
, 0))
953 /* The back-references which are in initial states can epsilon transit,
954 since in this case all of the subexpressions can be null.
955 Then we add epsilon closures of the nodes which are the next nodes of
956 the back-references. */
957 if (dfa
->nbackref
> 0)
958 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
960 int node_idx
= init_nodes
.elems
[i
];
961 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
964 if (type
!= OP_BACK_REF
)
966 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
968 re_token_t
*clexp_node
;
969 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
970 if (clexp_node
->type
== OP_CLOSE_SUBEXP
971 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
974 if (clexp_idx
== init_nodes
.nelem
)
977 if (type
== OP_BACK_REF
)
979 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
980 if (!re_node_set_contains (&init_nodes
, dest_idx
))
982 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
988 /* It must be the first time to invoke acquire_state. */
989 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
990 /* We don't check ERR here, since the initial state must not be NULL. */
991 if (BE (dfa
->init_state
== NULL
, 0))
993 if (dfa
->init_state
->has_constraint
)
995 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
997 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
999 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1003 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1004 || dfa
->init_state_begbuf
== NULL
, 0))
1008 dfa
->init_state_word
= dfa
->init_state_nl
1009 = dfa
->init_state_begbuf
= dfa
->init_state
;
1011 re_node_set_free (&init_nodes
);
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018 DFA nodes where needed. */
1021 optimize_utf8 (re_dfa_t
*dfa
)
1023 int node
, i
, mb_chars
= 0, has_period
= 0;
1025 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1026 switch (dfa
->nodes
[node
].type
)
1029 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1033 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1041 /* Word anchors etc. cannot be handled. It's okay to test
1042 opr.ctx_type since constraints (for all DFA nodes) are
1043 created by ORing one or more opr.ctx_type values. */
1053 case OP_DUP_ASTERISK
:
1054 case OP_OPEN_SUBEXP
:
1055 case OP_CLOSE_SUBEXP
:
1057 case COMPLEX_BRACKET
:
1059 case SIMPLE_BRACKET
:
1060 /* Just double check. The non-ASCII range starts at 0x80. */
1061 assert (0x80 % BITSET_WORD_BITS
== 0);
1062 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1063 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1070 if (mb_chars
|| has_period
)
1071 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1073 if (dfa
->nodes
[node
].type
== CHARACTER
1074 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1075 dfa
->nodes
[node
].mb_partial
= 0;
1076 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1077 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1080 /* The search can be in single byte locale. */
1081 dfa
->mb_cur_max
= 1;
1083 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1087 /* Analyze the structure tree, and calculate "first", "next", "edest",
1088 "eclosure", and "inveclosure". */
1090 static reg_errcode_t
1091 analyze (regex_t
*preg
)
1093 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1096 /* Allocate arrays. */
1097 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1098 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1099 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1100 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1101 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1102 || dfa
->eclosures
== NULL
, 0))
1105 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1106 if (dfa
->subexp_map
!= NULL
)
1109 for (i
= 0; i
< preg
->re_nsub
; i
++)
1110 dfa
->subexp_map
[i
] = i
;
1111 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1112 for (i
= 0; i
< preg
->re_nsub
; i
++)
1113 if (dfa
->subexp_map
[i
] != i
)
1115 if (i
== preg
->re_nsub
)
1117 free (dfa
->subexp_map
);
1118 dfa
->subexp_map
= NULL
;
1122 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1123 if (BE (ret
!= REG_NOERROR
, 0))
1125 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1126 if (BE (ret
!= REG_NOERROR
, 0))
1128 preorder (dfa
->str_tree
, calc_next
, dfa
);
1129 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1130 if (BE (ret
!= REG_NOERROR
, 0))
1132 ret
= calc_eclosure (dfa
);
1133 if (BE (ret
!= REG_NOERROR
, 0))
1136 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1137 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1138 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1141 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1142 if (BE (dfa
->inveclosures
== NULL
, 0))
1144 ret
= calc_inveclosure (dfa
);
1150 /* Our parse trees are very unbalanced, so we cannot use a stack to
1151 implement parse tree visits. Instead, we use parent pointers and
1152 some hairy code in these two functions. */
1153 static reg_errcode_t
1154 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1157 bin_tree_t
*node
, *prev
;
1159 for (node
= root
; ; )
1161 /* Descend down the tree, preferably to the left (or to the right
1162 if that's the only child). */
1163 while (node
->left
|| node
->right
)
1171 reg_errcode_t err
= fn (extra
, node
);
1172 if (BE (err
!= REG_NOERROR
, 0))
1174 if (node
->parent
== NULL
)
1177 node
= node
->parent
;
1179 /* Go up while we have a node that is reached from the right. */
1180 while (node
->right
== prev
|| node
->right
== NULL
);
1185 static reg_errcode_t
1186 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1191 for (node
= root
; ; )
1193 reg_errcode_t err
= fn (extra
, node
);
1194 if (BE (err
!= REG_NOERROR
, 0))
1197 /* Go to the left node, or up and to the right. */
1202 bin_tree_t
*prev
= NULL
;
1203 while (node
->right
== prev
|| node
->right
== NULL
)
1206 node
= node
->parent
;
1215 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1216 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1217 backreferences as well. Requires a preorder visit. */
1218 static reg_errcode_t
1219 optimize_subexps (void *extra
, bin_tree_t
*node
)
1221 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1223 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1225 int idx
= node
->token
.opr
.idx
;
1226 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1227 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1230 else if (node
->token
.type
== SUBEXP
1231 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1233 int other_idx
= node
->left
->token
.opr
.idx
;
1235 node
->left
= node
->left
->left
;
1237 node
->left
->parent
= node
;
1239 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1240 if (other_idx
< BITSET_WORD_BITS
)
1241 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1247 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1248 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1249 static reg_errcode_t
1250 lower_subexps (void *extra
, bin_tree_t
*node
)
1252 regex_t
*preg
= (regex_t
*) extra
;
1253 reg_errcode_t err
= REG_NOERROR
;
1255 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1257 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1259 node
->left
->parent
= node
;
1261 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1263 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1265 node
->right
->parent
= node
;
1272 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1274 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1275 bin_tree_t
*body
= node
->left
;
1276 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1279 /* We do not optimize empty subexpressions, because otherwise we may
1280 have bad CONCAT nodes with NULL children. This is obviously not
1281 very common, so we do not lose much. An example that triggers
1282 this case is the sed "script" /\(\)/x. */
1283 && node
->left
!= NULL
1284 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1285 || !(dfa
->used_bkref_map
1286 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1289 /* Convert the SUBEXP node to the concatenation of an
1290 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1291 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1292 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1293 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1294 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1295 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1301 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1302 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1306 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1307 nodes. Requires a postorder visit. */
1308 static reg_errcode_t
1309 calc_first (void *extra
, bin_tree_t
*node
)
1311 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1312 if (node
->token
.type
== CONCAT
)
1314 node
->first
= node
->left
->first
;
1315 node
->node_idx
= node
->left
->node_idx
;
1320 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1321 if (BE (node
->node_idx
== -1, 0))
1323 if (node
->token
.type
== ANCHOR
)
1324 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1329 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1330 static reg_errcode_t
1331 calc_next (void *extra
, bin_tree_t
*node
)
1333 switch (node
->token
.type
)
1335 case OP_DUP_ASTERISK
:
1336 node
->left
->next
= node
;
1339 node
->left
->next
= node
->right
->first
;
1340 node
->right
->next
= node
->next
;
1344 node
->left
->next
= node
->next
;
1346 node
->right
->next
= node
->next
;
1352 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1353 static reg_errcode_t
1354 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1356 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1357 int idx
= node
->node_idx
;
1358 reg_errcode_t err
= REG_NOERROR
;
1360 switch (node
->token
.type
)
1366 assert (node
->next
== NULL
);
1369 case OP_DUP_ASTERISK
:
1373 dfa
->has_plural_match
= 1;
1374 if (node
->left
!= NULL
)
1375 left
= node
->left
->first
->node_idx
;
1377 left
= node
->next
->node_idx
;
1378 if (node
->right
!= NULL
)
1379 right
= node
->right
->first
->node_idx
;
1381 right
= node
->next
->node_idx
;
1383 assert (right
> -1);
1384 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1389 case OP_OPEN_SUBEXP
:
1390 case OP_CLOSE_SUBEXP
:
1391 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1395 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1396 if (node
->token
.type
== OP_BACK_REF
)
1397 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1401 assert (!IS_EPSILON_NODE (node
->token
.type
));
1402 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1409 /* Duplicate the epsilon closure of the node ROOT_NODE.
1410 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1411 to their own constraint. */
1413 static reg_errcode_t
1415 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1416 int root_node
, unsigned int init_constraint
)
1418 int org_node
, clone_node
, ret
;
1419 unsigned int constraint
= init_constraint
;
1420 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1422 int org_dest
, clone_dest
;
1423 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1425 /* If the back reference epsilon-transit, its destination must
1426 also have the constraint. Then duplicate the epsilon closure
1427 of the destination of the back reference, and store it in
1428 edests of the back reference. */
1429 org_dest
= dfa
->nexts
[org_node
];
1430 re_node_set_empty (dfa
->edests
+ clone_node
);
1431 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1432 if (BE (clone_dest
== -1, 0))
1434 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1435 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1436 if (BE (ret
< 0, 0))
1439 else if (dfa
->edests
[org_node
].nelem
== 0)
1441 /* In case of the node can't epsilon-transit, don't duplicate the
1442 destination and store the original destination as the
1443 destination of the node. */
1444 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1447 else if (dfa
->edests
[org_node
].nelem
== 1)
1449 /* In case of the node can epsilon-transit, and it has only one
1451 org_dest
= dfa
->edests
[org_node
].elems
[0];
1452 re_node_set_empty (dfa
->edests
+ clone_node
);
1453 /* If the node is root_node itself, it means the epsilon clsoure
1454 has a loop. Then tie it to the destination of the root_node. */
1455 if (org_node
== root_node
&& clone_node
!= org_node
)
1457 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1458 if (BE (ret
< 0, 0))
1462 /* In case of the node has another constraint, add it. */
1463 constraint
|= dfa
->nodes
[org_node
].constraint
;
1464 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1465 if (BE (clone_dest
== -1, 0))
1467 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1468 if (BE (ret
< 0, 0))
1471 else /* dfa->edests[org_node].nelem == 2 */
1473 /* In case of the node can epsilon-transit, and it has two
1474 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1475 org_dest
= dfa
->edests
[org_node
].elems
[0];
1476 re_node_set_empty (dfa
->edests
+ clone_node
);
1477 /* Search for a duplicated node which satisfies the constraint. */
1478 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1479 if (clone_dest
== -1)
1481 /* There is no such duplicated node, create a new one. */
1483 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1484 if (BE (clone_dest
== -1, 0))
1486 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1487 if (BE (ret
< 0, 0))
1489 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1490 root_node
, constraint
);
1491 if (BE (err
!= REG_NOERROR
, 0))
1496 /* There is a duplicated node which satisfies the constraint,
1497 use it to avoid infinite loop. */
1498 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1499 if (BE (ret
< 0, 0))
1503 org_dest
= dfa
->edests
[org_node
].elems
[1];
1504 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1505 if (BE (clone_dest
== -1, 0))
1507 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1508 if (BE (ret
< 0, 0))
1511 org_node
= org_dest
;
1512 clone_node
= clone_dest
;
1517 /* Search for a node which is duplicated from the node ORG_NODE, and
1518 satisfies the constraint CONSTRAINT. */
1521 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1522 unsigned int constraint
)
1525 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1527 if (org_node
== dfa
->org_indices
[idx
]
1528 && constraint
== dfa
->nodes
[idx
].constraint
)
1529 return idx
; /* Found. */
1531 return -1; /* Not found. */
1534 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1535 Return the index of the new node, or -1 if insufficient storage is
1539 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1541 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1542 if (BE (dup_idx
!= -1, 1))
1544 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1545 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1546 dfa
->nodes
[dup_idx
].duplicated
= 1;
1548 /* Store the index of the original node. */
1549 dfa
->org_indices
[dup_idx
] = org_idx
;
1554 static reg_errcode_t
1555 calc_inveclosure (re_dfa_t
*dfa
)
1558 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1559 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1561 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1563 int *elems
= dfa
->eclosures
[src
].elems
;
1564 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1566 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1567 if (BE (ret
== -1, 0))
1575 /* Calculate "eclosure" for all the node in DFA. */
1577 static reg_errcode_t
1578 calc_eclosure (re_dfa_t
*dfa
)
1580 int node_idx
, incomplete
;
1582 assert (dfa
->nodes_len
> 0);
1585 /* For each nodes, calculate epsilon closure. */
1586 for (node_idx
= 0; ; ++node_idx
)
1589 re_node_set eclosure_elem
;
1590 if (node_idx
== dfa
->nodes_len
)
1599 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1602 /* If we have already calculated, skip it. */
1603 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1605 /* Calculate epsilon closure of `node_idx'. */
1606 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1607 if (BE (err
!= REG_NOERROR
, 0))
1610 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1613 re_node_set_free (&eclosure_elem
);
1619 /* Calculate epsilon closure of NODE. */
1621 static reg_errcode_t
1622 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1626 re_node_set eclosure
;
1628 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1629 if (BE (err
!= REG_NOERROR
, 0))
1632 /* This indicates that we are calculating this node now.
1633 We reference this value to avoid infinite loop. */
1634 dfa
->eclosures
[node
].nelem
= -1;
1636 /* If the current node has constraints, duplicate all nodes
1637 since they must inherit the constraints. */
1638 if (dfa
->nodes
[node
].constraint
1639 && dfa
->edests
[node
].nelem
1640 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1642 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1643 dfa
->nodes
[node
].constraint
);
1644 if (BE (err
!= REG_NOERROR
, 0))
1648 /* Expand each epsilon destination nodes. */
1649 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1650 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1652 re_node_set eclosure_elem
;
1653 int edest
= dfa
->edests
[node
].elems
[i
];
1654 /* If calculating the epsilon closure of `edest' is in progress,
1655 return intermediate result. */
1656 if (dfa
->eclosures
[edest
].nelem
== -1)
1661 /* If we haven't calculated the epsilon closure of `edest' yet,
1662 calculate now. Otherwise use calculated epsilon closure. */
1663 if (dfa
->eclosures
[edest
].nelem
== 0)
1665 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1666 if (BE (err
!= REG_NOERROR
, 0))
1670 eclosure_elem
= dfa
->eclosures
[edest
];
1671 /* Merge the epsilon closure of `edest'. */
1672 re_node_set_merge (&eclosure
, &eclosure_elem
);
1673 /* If the epsilon closure of `edest' is incomplete,
1674 the epsilon closure of this node is also incomplete. */
1675 if (dfa
->eclosures
[edest
].nelem
== 0)
1678 re_node_set_free (&eclosure_elem
);
1682 /* Epsilon closures include itself. */
1683 re_node_set_insert (&eclosure
, node
);
1684 if (incomplete
&& !root
)
1685 dfa
->eclosures
[node
].nelem
= 0;
1687 dfa
->eclosures
[node
] = eclosure
;
1688 *new_set
= eclosure
;
1692 /* Functions for token which are used in the parser. */
1694 /* Fetch a token from INPUT.
1695 We must not use this function inside bracket expressions. */
1699 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1701 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1704 /* Peek a token from INPUT, and return the length of the token.
1705 We must not use this function inside bracket expressions. */
1709 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1713 if (re_string_eoi (input
))
1715 token
->type
= END_OF_RE
;
1719 c
= re_string_peek_byte (input
, 0);
1722 token
->word_char
= 0;
1723 #ifdef RE_ENABLE_I18N
1724 token
->mb_partial
= 0;
1725 if (input
->mb_cur_max
> 1 &&
1726 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1728 token
->type
= CHARACTER
;
1729 token
->mb_partial
= 1;
1736 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1738 token
->type
= BACK_SLASH
;
1742 c2
= re_string_peek_byte_case (input
, 1);
1744 token
->type
= CHARACTER
;
1745 #ifdef RE_ENABLE_I18N
1746 if (input
->mb_cur_max
> 1)
1748 wint_t wc
= re_string_wchar_at (input
,
1749 re_string_cur_idx (input
) + 1);
1750 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1754 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1759 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1760 token
->type
= OP_ALT
;
1762 case '1': case '2': case '3': case '4': case '5':
1763 case '6': case '7': case '8': case '9':
1764 if (!(syntax
& RE_NO_BK_REFS
))
1766 token
->type
= OP_BACK_REF
;
1767 token
->opr
.idx
= c2
- '1';
1771 if (!(syntax
& RE_NO_GNU_OPS
))
1773 token
->type
= ANCHOR
;
1774 token
->opr
.ctx_type
= WORD_FIRST
;
1778 if (!(syntax
& RE_NO_GNU_OPS
))
1780 token
->type
= ANCHOR
;
1781 token
->opr
.ctx_type
= WORD_LAST
;
1785 if (!(syntax
& RE_NO_GNU_OPS
))
1787 token
->type
= ANCHOR
;
1788 token
->opr
.ctx_type
= WORD_DELIM
;
1792 if (!(syntax
& RE_NO_GNU_OPS
))
1794 token
->type
= ANCHOR
;
1795 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1799 if (!(syntax
& RE_NO_GNU_OPS
))
1800 token
->type
= OP_WORD
;
1803 if (!(syntax
& RE_NO_GNU_OPS
))
1804 token
->type
= OP_NOTWORD
;
1807 if (!(syntax
& RE_NO_GNU_OPS
))
1808 token
->type
= OP_SPACE
;
1811 if (!(syntax
& RE_NO_GNU_OPS
))
1812 token
->type
= OP_NOTSPACE
;
1815 if (!(syntax
& RE_NO_GNU_OPS
))
1817 token
->type
= ANCHOR
;
1818 token
->opr
.ctx_type
= BUF_FIRST
;
1822 if (!(syntax
& RE_NO_GNU_OPS
))
1824 token
->type
= ANCHOR
;
1825 token
->opr
.ctx_type
= BUF_LAST
;
1829 if (!(syntax
& RE_NO_BK_PARENS
))
1830 token
->type
= OP_OPEN_SUBEXP
;
1833 if (!(syntax
& RE_NO_BK_PARENS
))
1834 token
->type
= OP_CLOSE_SUBEXP
;
1837 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1838 token
->type
= OP_DUP_PLUS
;
1841 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1842 token
->type
= OP_DUP_QUESTION
;
1845 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1846 token
->type
= OP_OPEN_DUP_NUM
;
1849 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1850 token
->type
= OP_CLOSE_DUP_NUM
;
1858 token
->type
= CHARACTER
;
1859 #ifdef RE_ENABLE_I18N
1860 if (input
->mb_cur_max
> 1)
1862 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1863 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1867 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1872 if (syntax
& RE_NEWLINE_ALT
)
1873 token
->type
= OP_ALT
;
1876 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1877 token
->type
= OP_ALT
;
1880 token
->type
= OP_DUP_ASTERISK
;
1883 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1884 token
->type
= OP_DUP_PLUS
;
1887 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1888 token
->type
= OP_DUP_QUESTION
;
1891 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1892 token
->type
= OP_OPEN_DUP_NUM
;
1895 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1896 token
->type
= OP_CLOSE_DUP_NUM
;
1899 if (syntax
& RE_NO_BK_PARENS
)
1900 token
->type
= OP_OPEN_SUBEXP
;
1903 if (syntax
& RE_NO_BK_PARENS
)
1904 token
->type
= OP_CLOSE_SUBEXP
;
1907 token
->type
= OP_OPEN_BRACKET
;
1910 token
->type
= OP_PERIOD
;
1913 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1914 re_string_cur_idx (input
) != 0)
1916 char prev
= re_string_peek_byte (input
, -1);
1917 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1920 token
->type
= ANCHOR
;
1921 token
->opr
.ctx_type
= LINE_FIRST
;
1924 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1925 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1928 re_string_skip_bytes (input
, 1);
1929 peek_token (&next
, input
, syntax
);
1930 re_string_skip_bytes (input
, -1);
1931 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1934 token
->type
= ANCHOR
;
1935 token
->opr
.ctx_type
= LINE_LAST
;
1943 /* Peek a token from INPUT, and return the length of the token.
1944 We must not use this function out of bracket expressions. */
1948 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1951 if (re_string_eoi (input
))
1953 token
->type
= END_OF_RE
;
1956 c
= re_string_peek_byte (input
, 0);
1959 #ifdef RE_ENABLE_I18N
1960 if (input
->mb_cur_max
> 1 &&
1961 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1963 token
->type
= CHARACTER
;
1966 #endif /* RE_ENABLE_I18N */
1968 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1969 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1971 /* In this case, '\' escape a character. */
1973 re_string_skip_bytes (input
, 1);
1974 c2
= re_string_peek_byte (input
, 0);
1976 token
->type
= CHARACTER
;
1979 if (c
== '[') /* '[' is a special char in a bracket exps. */
1983 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
1984 c2
= re_string_peek_byte (input
, 1);
1992 token
->type
= OP_OPEN_COLL_ELEM
;
1995 token
->type
= OP_OPEN_EQUIV_CLASS
;
1998 if (syntax
& RE_CHAR_CLASSES
)
2000 token
->type
= OP_OPEN_CHAR_CLASS
;
2003 /* else fall through. */
2005 token
->type
= CHARACTER
;
2015 token
->type
= OP_CHARSET_RANGE
;
2018 token
->type
= OP_CLOSE_BRACKET
;
2021 token
->type
= OP_NON_MATCH_LIST
;
2024 token
->type
= CHARACTER
;
2029 /* Functions for parser. */
2031 /* Entry point of the parser.
2032 Parse the regular expression REGEXP and return the structure tree.
2033 If an error is occured, ERR is set by error code, and return NULL.
2034 This function build the following tree, from regular expression <reg_exp>:
2040 CAT means concatenation.
2041 EOR means end of regular expression. */
2044 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2047 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2048 bin_tree_t
*tree
, *eor
, *root
;
2049 re_token_t current_token
;
2050 dfa
->syntax
= syntax
;
2051 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2052 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2053 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2055 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2057 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2060 if (BE (eor
== NULL
|| root
== NULL
, 0))
2068 /* This function build the following tree, from regular expression
2069 <branch1>|<branch2>:
2075 ALT means alternative, which represents the operator `|'. */
2078 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2079 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2081 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2082 bin_tree_t
*tree
, *branch
= NULL
;
2083 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2084 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2087 while (token
->type
== OP_ALT
)
2089 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2090 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2091 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2093 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2094 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2099 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2100 if (BE (tree
== NULL
, 0))
2109 /* This function build the following tree, from regular expression
2116 CAT means concatenation. */
2119 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2120 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2122 bin_tree_t
*tree
, *exp
;
2123 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2124 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2125 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2128 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2129 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2131 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2132 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2136 if (tree
!= NULL
&& exp
!= NULL
)
2138 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2145 else if (tree
== NULL
)
2147 /* Otherwise exp == NULL, we don't need to create new tree. */
2152 /* This function build the following tree, from regular expression a*:
2159 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2160 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2162 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2164 switch (token
->type
)
2167 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2168 if (BE (tree
== NULL
, 0))
2173 #ifdef RE_ENABLE_I18N
2174 if (dfa
->mb_cur_max
> 1)
2176 while (!re_string_eoi (regexp
)
2177 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2179 bin_tree_t
*mbc_remain
;
2180 fetch_token (token
, regexp
, syntax
);
2181 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2182 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2183 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2192 case OP_OPEN_SUBEXP
:
2193 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2194 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2197 case OP_OPEN_BRACKET
:
2198 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2199 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2203 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2208 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2209 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2210 if (BE (tree
== NULL
, 0))
2216 dfa
->has_mb_node
= 1;
2218 case OP_OPEN_DUP_NUM
:
2219 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2225 case OP_DUP_ASTERISK
:
2227 case OP_DUP_QUESTION
:
2228 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2233 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2235 fetch_token (token
, regexp
, syntax
);
2236 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2238 /* else fall through */
2239 case OP_CLOSE_SUBEXP
:
2240 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2241 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2246 /* else fall through */
2247 case OP_CLOSE_DUP_NUM
:
2248 /* We treat it as a normal character. */
2250 /* Then we can these characters as normal characters. */
2251 token
->type
= CHARACTER
;
2252 /* mb_partial and word_char bits should be initialized already
2254 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2255 if (BE (tree
== NULL
, 0))
2262 if ((token
->opr
.ctx_type
2263 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2264 && dfa
->word_ops_used
== 0)
2265 init_word_char (dfa
);
2266 if (token
->opr
.ctx_type
== WORD_DELIM
2267 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2269 bin_tree_t
*tree_first
, *tree_last
;
2270 if (token
->opr
.ctx_type
== WORD_DELIM
)
2272 token
->opr
.ctx_type
= WORD_FIRST
;
2273 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2274 token
->opr
.ctx_type
= WORD_LAST
;
2278 token
->opr
.ctx_type
= INSIDE_WORD
;
2279 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2280 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2282 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2283 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2284 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2292 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2293 if (BE (tree
== NULL
, 0))
2299 /* We must return here, since ANCHORs can't be followed
2300 by repetition operators.
2301 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2302 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2303 fetch_token (token
, regexp
, syntax
);
2306 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2307 if (BE (tree
== NULL
, 0))
2312 if (dfa
->mb_cur_max
> 1)
2313 dfa
->has_mb_node
= 1;
2317 tree
= build_charclass_op (dfa
, regexp
->trans
,
2318 (const unsigned char *) "alnum",
2319 (const unsigned char *) "_",
2320 token
->type
== OP_NOTWORD
, err
);
2321 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2326 tree
= build_charclass_op (dfa
, regexp
->trans
,
2327 (const unsigned char *) "space",
2328 (const unsigned char *) "",
2329 token
->type
== OP_NOTSPACE
, err
);
2330 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2340 /* Must not happen? */
2346 fetch_token (token
, regexp
, syntax
);
2348 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2349 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2351 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2352 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2354 /* In BRE consecutive duplications are not allowed. */
2355 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2356 && (token
->type
== OP_DUP_ASTERISK
2357 || token
->type
== OP_OPEN_DUP_NUM
))
2367 /* This function build the following tree, from regular expression
2375 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2376 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2378 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2381 cur_nsub
= preg
->re_nsub
++;
2383 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2385 /* The subexpression may be a null string. */
2386 if (token
->type
== OP_CLOSE_SUBEXP
)
2390 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2391 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2393 if (BE (*err
!= REG_NOERROR
, 0))
2397 if (cur_nsub
<= '9' - '1')
2398 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2400 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2401 if (BE (tree
== NULL
, 0))
2406 tree
->token
.opr
.idx
= cur_nsub
;
2410 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2413 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2414 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2416 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2417 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2418 re_token_t start_token
= *token
;
2420 if (token
->type
== OP_OPEN_DUP_NUM
)
2423 start
= fetch_number (regexp
, token
, syntax
);
2426 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2427 start
= 0; /* We treat "{,m}" as "{0,m}". */
2430 *err
= REG_BADBR
; /* <re>{} is invalid. */
2434 if (BE (start
!= -2, 1))
2436 /* We treat "{n}" as "{n,n}". */
2437 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2438 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2439 ? fetch_number (regexp
, token
, syntax
) : -2));
2441 if (BE (start
== -2 || end
== -2, 0))
2443 /* Invalid sequence. */
2444 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2446 if (token
->type
== END_OF_RE
)
2454 /* If the syntax bit is set, rollback. */
2455 re_string_set_index (regexp
, start_idx
);
2456 *token
= start_token
;
2457 token
->type
= CHARACTER
;
2458 /* mb_partial and word_char bits should be already initialized by
2463 if (BE (end
!= -1 && start
> end
, 0))
2465 /* First number greater than second. */
2472 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2473 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2476 fetch_token (token
, regexp
, syntax
);
2478 if (BE (elem
== NULL
, 0))
2480 if (BE (start
== 0 && end
== 0, 0))
2482 postorder (elem
, free_tree
, NULL
);
2486 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2487 if (BE (start
> 0, 0))
2490 for (i
= 2; i
<= start
; ++i
)
2492 elem
= duplicate_tree (elem
, dfa
);
2493 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2494 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2495 goto parse_dup_op_espace
;
2501 /* Duplicate ELEM before it is marked optional. */
2502 elem
= duplicate_tree (elem
, dfa
);
2508 if (elem
->token
.type
== SUBEXP
)
2509 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2511 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2512 if (BE (tree
== NULL
, 0))
2513 goto parse_dup_op_espace
;
2515 /* This loop is actually executed only when end != -1,
2516 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2517 already created the start+1-th copy. */
2518 for (i
= start
+ 2; i
<= end
; ++i
)
2520 elem
= duplicate_tree (elem
, dfa
);
2521 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2522 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2523 goto parse_dup_op_espace
;
2525 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2526 if (BE (tree
== NULL
, 0))
2527 goto parse_dup_op_espace
;
2531 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2535 parse_dup_op_espace
:
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541 I'm not sure, but maybe enough. */
2542 #define BRACKET_NAME_BUF_SIZE 32
2545 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546 Build the range expression which starts from START_ELEM, and ends
2547 at END_ELEM. The result are written to MBCSET and SBCSET.
2548 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549 mbcset->range_ends, is a pointer argument sinse we may
2552 static reg_errcode_t
2554 # ifdef RE_ENABLE_I18N
2555 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2556 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2557 # else /* not RE_ENABLE_I18N */
2558 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2559 bracket_elem_t
*end_elem
)
2560 # endif /* not RE_ENABLE_I18N */
2562 unsigned int start_ch
, end_ch
;
2563 /* Equivalence Classes and Character Classes can't be a range start/end. */
2564 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2565 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2569 /* We can handle no multi character collating elements without libc
2571 if (BE ((start_elem
->type
== COLL_SYM
2572 && strlen ((char *) start_elem
->opr
.name
) > 1)
2573 || (end_elem
->type
== COLL_SYM
2574 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2575 return REG_ECOLLATE
;
2577 # ifdef RE_ENABLE_I18N
2582 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2584 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2585 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2587 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2588 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2590 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2591 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2592 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2593 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2594 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2595 return REG_ECOLLATE
;
2596 cmp_buf
[0] = start_wc
;
2597 cmp_buf
[4] = end_wc
;
2598 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2601 /* Got valid collation sequence values, add them as a new entry.
2602 However, for !_LIBC we have no collation elements: if the
2603 character set is single byte, the single byte character set
2604 that we build below suffices. parse_bracket_exp passes
2605 no MBCSET if dfa->mb_cur_max == 1. */
2608 /* Check the space of the arrays. */
2609 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2611 /* There is not enough space, need realloc. */
2612 wchar_t *new_array_start
, *new_array_end
;
2615 /* +1 in case of mbcset->nranges is 0. */
2616 new_nranges
= 2 * mbcset
->nranges
+ 1;
2617 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2618 are NULL if *range_alloc == 0. */
2619 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2621 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2624 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2627 mbcset
->range_starts
= new_array_start
;
2628 mbcset
->range_ends
= new_array_end
;
2629 *range_alloc
= new_nranges
;
2632 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2633 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2636 /* Build the table for single byte characters. */
2637 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2640 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2641 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2642 bitset_set (sbcset
, wc
);
2645 # else /* not RE_ENABLE_I18N */
2648 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2649 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2651 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2652 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2654 if (start_ch
> end_ch
)
2656 /* Build the table for single byte characters. */
2657 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2658 if (start_ch
<= ch
&& ch
<= end_ch
)
2659 bitset_set (sbcset
, ch
);
2661 # endif /* not RE_ENABLE_I18N */
2664 #endif /* not _LIBC */
2667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2668 Build the collating element which is represented by NAME.
2669 The result are written to MBCSET and SBCSET.
2670 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2671 pointer argument since we may update it. */
2673 static reg_errcode_t
2675 # ifdef RE_ENABLE_I18N
2676 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2677 int *coll_sym_alloc
, const unsigned char *name
)
2678 # else /* not RE_ENABLE_I18N */
2679 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2680 # endif /* not RE_ENABLE_I18N */
2682 size_t name_len
= strlen ((const char *) name
);
2683 if (BE (name_len
!= 1, 0))
2684 return REG_ECOLLATE
;
2687 bitset_set (sbcset
, name
[0]);
2691 #endif /* not _LIBC */
2693 /* This function parse bracket expression like "[abc]", "[a-c]",
2697 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2698 reg_syntax_t syntax
, reg_errcode_t
*err
)
2701 const unsigned char *collseqmb
;
2702 const char *collseqwc
;
2705 const int32_t *symb_table
;
2706 const unsigned char *extra
;
2708 /* Local function for parse_bracket_exp used in _LIBC environement.
2709 Seek the collating symbol entry correspondings to NAME.
2710 Return the index of the symbol in the SYMB_TABLE. */
2713 __attribute ((always_inline
))
2714 seek_collating_symbol_entry (name
, name_len
)
2715 const unsigned char *name
;
2718 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2719 int32_t elem
= hash
% table_size
;
2720 if (symb_table
[2 * elem
] != 0)
2722 int32_t second
= hash
% (table_size
- 2) + 1;
2726 /* First compare the hashing value. */
2727 if (symb_table
[2 * elem
] == hash
2728 /* Compare the length of the name. */
2729 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2730 /* Compare the name. */
2731 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2734 /* Yep, this is the entry. */
2741 while (symb_table
[2 * elem
] != 0);
2746 /* Local function for parse_bracket_exp used in _LIBC environment.
2747 Look up the collation sequence value of BR_ELEM.
2748 Return the value if succeeded, UINT_MAX otherwise. */
2750 auto inline unsigned int
2751 __attribute ((always_inline
))
2752 lookup_collation_sequence_value (br_elem
)
2753 bracket_elem_t
*br_elem
;
2755 if (br_elem
->type
== SB_CHAR
)
2758 if (MB_CUR_MAX == 1)
2761 return collseqmb
[br_elem
->opr
.ch
];
2764 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2765 return __collseq_table_lookup (collseqwc
, wc
);
2768 else if (br_elem
->type
== MB_CHAR
)
2771 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2773 else if (br_elem
->type
== COLL_SYM
)
2775 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2779 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2781 if (symb_table
[2 * elem
] != 0)
2783 /* We found the entry. */
2784 idx
= symb_table
[2 * elem
+ 1];
2785 /* Skip the name of collating element name. */
2786 idx
+= 1 + extra
[idx
];
2787 /* Skip the byte sequence of the collating element. */
2788 idx
+= 1 + extra
[idx
];
2789 /* Adjust for the alignment. */
2790 idx
= (idx
+ 3) & ~3;
2791 /* Skip the multibyte collation sequence value. */
2792 idx
+= sizeof (unsigned int);
2793 /* Skip the wide char sequence of the collating element. */
2794 idx
+= sizeof (unsigned int) *
2795 (1 + *(unsigned int *) (extra
+ idx
));
2796 /* Return the collation sequence value. */
2797 return *(unsigned int *) (extra
+ idx
);
2799 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2801 /* No valid character. Match it as a single byte
2803 return collseqmb
[br_elem
->opr
.name
[0]];
2806 else if (sym_name_len
== 1)
2807 return collseqmb
[br_elem
->opr
.name
[0]];
2812 /* Local function for parse_bracket_exp used in _LIBC environement.
2813 Build the range expression which starts from START_ELEM, and ends
2814 at END_ELEM. The result are written to MBCSET and SBCSET.
2815 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2816 mbcset->range_ends, is a pointer argument sinse we may
2819 auto inline reg_errcode_t
2820 __attribute ((always_inline
))
2821 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2822 re_charset_t
*mbcset
;
2825 bracket_elem_t
*start_elem
, *end_elem
;
2828 uint32_t start_collseq
;
2829 uint32_t end_collseq
;
2831 /* Equivalence Classes and Character Classes can't be a range
2833 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2834 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2838 start_collseq
= lookup_collation_sequence_value (start_elem
);
2839 end_collseq
= lookup_collation_sequence_value (end_elem
);
2840 /* Check start/end collation sequence values. */
2841 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2842 return REG_ECOLLATE
;
2843 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2846 /* Got valid collation sequence values, add them as a new entry.
2847 However, if we have no collation elements, and the character set
2848 is single byte, the single byte character set that we
2849 build below suffices. */
2850 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2852 /* Check the space of the arrays. */
2853 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2855 /* There is not enough space, need realloc. */
2856 uint32_t *new_array_start
;
2857 uint32_t *new_array_end
;
2860 /* +1 in case of mbcset->nranges is 0. */
2861 new_nranges
= 2 * mbcset
->nranges
+ 1;
2862 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2864 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2867 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2870 mbcset
->range_starts
= new_array_start
;
2871 mbcset
->range_ends
= new_array_end
;
2872 *range_alloc
= new_nranges
;
2875 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2876 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2879 /* Build the table for single byte characters. */
2880 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2882 uint32_t ch_collseq
;
2884 if (MB_CUR_MAX == 1)
2887 ch_collseq
= collseqmb
[ch
];
2889 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2890 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2891 bitset_set (sbcset
, ch
);
2896 /* Local function for parse_bracket_exp used in _LIBC environement.
2897 Build the collating element which is represented by NAME.
2898 The result are written to MBCSET and SBCSET.
2899 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2900 pointer argument sinse we may update it. */
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline
))
2904 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2905 re_charset_t
*mbcset
;
2906 int *coll_sym_alloc
;
2908 const unsigned char *name
;
2911 size_t name_len
= strlen ((const char *) name
);
2914 elem
= seek_collating_symbol_entry (name
, name_len
);
2915 if (symb_table
[2 * elem
] != 0)
2917 /* We found the entry. */
2918 idx
= symb_table
[2 * elem
+ 1];
2919 /* Skip the name of collating element name. */
2920 idx
+= 1 + extra
[idx
];
2922 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2924 /* No valid character, treat it as a normal
2926 bitset_set (sbcset
, name
[0]);
2930 return REG_ECOLLATE
;
2932 /* Got valid collation sequence, add it as a new entry. */
2933 /* Check the space of the arrays. */
2934 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2936 /* Not enough, realloc it. */
2937 /* +1 in case of mbcset->ncoll_syms is 0. */
2938 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2939 /* Use realloc since mbcset->coll_syms is NULL
2941 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2942 new_coll_sym_alloc
);
2943 if (BE (new_coll_syms
== NULL
, 0))
2945 mbcset
->coll_syms
= new_coll_syms
;
2946 *coll_sym_alloc
= new_coll_sym_alloc
;
2948 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2953 if (BE (name_len
!= 1, 0))
2954 return REG_ECOLLATE
;
2957 bitset_set (sbcset
, name
[0]);
2964 re_token_t br_token
;
2965 re_bitset_ptr_t sbcset
;
2966 #ifdef RE_ENABLE_I18N
2967 re_charset_t
*mbcset
;
2968 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2969 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2970 #endif /* not RE_ENABLE_I18N */
2972 bin_tree_t
*work_tree
;
2974 int first_round
= 1;
2976 collseqmb
= (const unsigned char *)
2977 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2978 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2984 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2985 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2986 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2987 _NL_COLLATE_SYMB_TABLEMB
);
2988 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2989 _NL_COLLATE_SYMB_EXTRAMB
);
2992 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
2993 #ifdef RE_ENABLE_I18N
2994 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2995 #endif /* RE_ENABLE_I18N */
2996 #ifdef RE_ENABLE_I18N
2997 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2999 if (BE (sbcset
== NULL
, 0))
3000 #endif /* RE_ENABLE_I18N */
3006 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3007 if (BE (token
->type
== END_OF_RE
, 0))
3010 goto parse_bracket_exp_free_return
;
3012 if (token
->type
== OP_NON_MATCH_LIST
)
3014 #ifdef RE_ENABLE_I18N
3015 mbcset
->non_match
= 1;
3016 #endif /* not RE_ENABLE_I18N */
3018 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3019 bitset_set (sbcset
, '\n');
3020 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3021 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3022 if (BE (token
->type
== END_OF_RE
, 0))
3025 goto parse_bracket_exp_free_return
;
3029 /* We treat the first ']' as a normal character. */
3030 if (token
->type
== OP_CLOSE_BRACKET
)
3031 token
->type
= CHARACTER
;
3035 bracket_elem_t start_elem
, end_elem
;
3036 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3037 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3039 int token_len2
= 0, is_range_exp
= 0;
3042 start_elem
.opr
.name
= start_name_buf
;
3043 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3044 syntax
, first_round
);
3045 if (BE (ret
!= REG_NOERROR
, 0))
3048 goto parse_bracket_exp_free_return
;
3052 /* Get information about the next token. We need it in any case. */
3053 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3055 /* Do not check for ranges if we know they are not allowed. */
3056 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3058 if (BE (token
->type
== END_OF_RE
, 0))
3061 goto parse_bracket_exp_free_return
;
3063 if (token
->type
== OP_CHARSET_RANGE
)
3065 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3066 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3067 if (BE (token2
.type
== END_OF_RE
, 0))
3070 goto parse_bracket_exp_free_return
;
3072 if (token2
.type
== OP_CLOSE_BRACKET
)
3074 /* We treat the last '-' as a normal character. */
3075 re_string_skip_bytes (regexp
, -token_len
);
3076 token
->type
= CHARACTER
;
3083 if (is_range_exp
== 1)
3085 end_elem
.opr
.name
= end_name_buf
;
3086 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3088 if (BE (ret
!= REG_NOERROR
, 0))
3091 goto parse_bracket_exp_free_return
;
3094 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3097 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3098 &start_elem
, &end_elem
);
3100 # ifdef RE_ENABLE_I18N
3101 *err
= build_range_exp (sbcset
,
3102 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3103 &range_alloc
, &start_elem
, &end_elem
);
3105 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3107 #endif /* RE_ENABLE_I18N */
3108 if (BE (*err
!= REG_NOERROR
, 0))
3109 goto parse_bracket_exp_free_return
;
3113 switch (start_elem
.type
)
3116 bitset_set (sbcset
, start_elem
.opr
.ch
);
3118 #ifdef RE_ENABLE_I18N
3120 /* Check whether the array has enough space. */
3121 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3123 wchar_t *new_mbchars
;
3124 /* Not enough, realloc it. */
3125 /* +1 in case of mbcset->nmbchars is 0. */
3126 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3127 /* Use realloc since array is NULL if *alloc == 0. */
3128 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3130 if (BE (new_mbchars
== NULL
, 0))
3131 goto parse_bracket_exp_espace
;
3132 mbcset
->mbchars
= new_mbchars
;
3134 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3136 #endif /* RE_ENABLE_I18N */
3138 *err
= build_equiv_class (sbcset
,
3139 #ifdef RE_ENABLE_I18N
3140 mbcset
, &equiv_class_alloc
,
3141 #endif /* RE_ENABLE_I18N */
3142 start_elem
.opr
.name
);
3143 if (BE (*err
!= REG_NOERROR
, 0))
3144 goto parse_bracket_exp_free_return
;
3147 *err
= build_collating_symbol (sbcset
,
3148 #ifdef RE_ENABLE_I18N
3149 mbcset
, &coll_sym_alloc
,
3150 #endif /* RE_ENABLE_I18N */
3151 start_elem
.opr
.name
);
3152 if (BE (*err
!= REG_NOERROR
, 0))
3153 goto parse_bracket_exp_free_return
;
3156 *err
= build_charclass (regexp
->trans
, sbcset
,
3157 #ifdef RE_ENABLE_I18N
3158 mbcset
, &char_class_alloc
,
3159 #endif /* RE_ENABLE_I18N */
3160 start_elem
.opr
.name
, syntax
);
3161 if (BE (*err
!= REG_NOERROR
, 0))
3162 goto parse_bracket_exp_free_return
;
3169 if (BE (token
->type
== END_OF_RE
, 0))
3172 goto parse_bracket_exp_free_return
;
3174 if (token
->type
== OP_CLOSE_BRACKET
)
3178 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3180 /* If it is non-matching list. */
3182 bitset_not (sbcset
);
3184 #ifdef RE_ENABLE_I18N
3185 /* Ensure only single byte characters are set. */
3186 if (dfa
->mb_cur_max
> 1)
3187 bitset_mask (sbcset
, dfa
->sb_char
);
3189 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3190 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3191 || mbcset
->non_match
)))
3193 bin_tree_t
*mbc_tree
;
3195 /* Build a tree for complex bracket. */
3196 dfa
->has_mb_node
= 1;
3197 br_token
.type
= COMPLEX_BRACKET
;
3198 br_token
.opr
.mbcset
= mbcset
;
3199 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3200 if (BE (mbc_tree
== NULL
, 0))
3201 goto parse_bracket_exp_espace
;
3202 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3203 if (sbcset
[sbc_idx
])
3205 /* If there are no bits set in sbcset, there is no point
3206 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3207 if (sbc_idx
< BITSET_WORDS
)
3209 /* Build a tree for simple bracket. */
3210 br_token
.type
= SIMPLE_BRACKET
;
3211 br_token
.opr
.sbcset
= sbcset
;
3212 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3213 if (BE (work_tree
== NULL
, 0))
3214 goto parse_bracket_exp_espace
;
3216 /* Then join them by ALT node. */
3217 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3218 if (BE (work_tree
== NULL
, 0))
3219 goto parse_bracket_exp_espace
;
3224 work_tree
= mbc_tree
;
3228 #endif /* not RE_ENABLE_I18N */
3230 #ifdef RE_ENABLE_I18N
3231 free_charset (mbcset
);
3233 /* Build a tree for simple bracket. */
3234 br_token
.type
= SIMPLE_BRACKET
;
3235 br_token
.opr
.sbcset
= sbcset
;
3236 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3237 if (BE (work_tree
== NULL
, 0))
3238 goto parse_bracket_exp_espace
;
3242 parse_bracket_exp_espace
:
3244 parse_bracket_exp_free_return
:
3246 #ifdef RE_ENABLE_I18N
3247 free_charset (mbcset
);
3248 #endif /* RE_ENABLE_I18N */
3252 /* Parse an element in the bracket expression. */
3254 static reg_errcode_t
3255 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3256 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3257 reg_syntax_t syntax
, int accept_hyphen
)
3259 #ifdef RE_ENABLE_I18N
3261 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3262 if (cur_char_size
> 1)
3264 elem
->type
= MB_CHAR
;
3265 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3266 re_string_skip_bytes (regexp
, cur_char_size
);
3269 #endif /* RE_ENABLE_I18N */
3270 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3271 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3272 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3273 return parse_bracket_symbol (elem
, regexp
, token
);
3274 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3276 /* A '-' must only appear as anything but a range indicator before
3277 the closing bracket. Everything else is an error. */
3279 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3280 if (token2
.type
!= OP_CLOSE_BRACKET
)
3281 /* The actual error value is not standardized since this whole
3282 case is undefined. But ERANGE makes good sense. */
3285 elem
->type
= SB_CHAR
;
3286 elem
->opr
.ch
= token
->opr
.c
;
3290 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3291 such as [:<character_class>:], [.<collating_element>.], and
3292 [=<equivalent_class>=]. */
3294 static reg_errcode_t
3295 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3298 unsigned char ch
, delim
= token
->opr
.c
;
3300 if (re_string_eoi(regexp
))
3304 if (i
>= BRACKET_NAME_BUF_SIZE
)
3306 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3307 ch
= re_string_fetch_byte_case (regexp
);
3309 ch
= re_string_fetch_byte (regexp
);
3310 if (re_string_eoi(regexp
))
3312 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3314 elem
->opr
.name
[i
] = ch
;
3316 re_string_skip_bytes (regexp
, 1);
3317 elem
->opr
.name
[i
] = '\0';
3318 switch (token
->type
)
3320 case OP_OPEN_COLL_ELEM
:
3321 elem
->type
= COLL_SYM
;
3323 case OP_OPEN_EQUIV_CLASS
:
3324 elem
->type
= EQUIV_CLASS
;
3326 case OP_OPEN_CHAR_CLASS
:
3327 elem
->type
= CHAR_CLASS
;
3335 /* Helper function for parse_bracket_exp.
3336 Build the equivalence class which is represented by NAME.
3337 The result are written to MBCSET and SBCSET.
3338 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3339 is a pointer argument sinse we may update it. */
3341 static reg_errcode_t
3342 #ifdef RE_ENABLE_I18N
3343 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3344 int *equiv_class_alloc
, const unsigned char *name
)
3345 #else /* not RE_ENABLE_I18N */
3346 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3347 #endif /* not RE_ENABLE_I18N */
3350 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3353 const int32_t *table
, *indirect
;
3354 const unsigned char *weights
, *extra
, *cp
;
3355 unsigned char char_buf
[2];
3359 /* This #include defines a local function! */
3360 # include <locale/weight.h>
3361 /* Calculate the index for equivalence class. */
3363 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3364 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3365 _NL_COLLATE_WEIGHTMB
);
3366 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3367 _NL_COLLATE_EXTRAMB
);
3368 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3369 _NL_COLLATE_INDIRECTMB
);
3370 idx1
= findidx (&cp
);
3371 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3372 /* This isn't a valid character. */
3373 return REG_ECOLLATE
;
3375 /* Build single byte matcing table for this equivalence class. */
3376 char_buf
[1] = (unsigned char) '\0';
3377 len
= weights
[idx1
& 0xffffff];
3378 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3382 idx2
= findidx (&cp
);
3387 /* This isn't a valid character. */
3389 /* Compare only if the length matches and the collation rule
3390 index is the same. */
3391 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3395 while (cnt
<= len
&&
3396 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3397 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3401 bitset_set (sbcset
, ch
);
3404 /* Check whether the array has enough space. */
3405 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3407 /* Not enough, realloc it. */
3408 /* +1 in case of mbcset->nequiv_classes is 0. */
3409 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3410 /* Use realloc since the array is NULL if *alloc == 0. */
3411 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3413 new_equiv_class_alloc
);
3414 if (BE (new_equiv_classes
== NULL
, 0))
3416 mbcset
->equiv_classes
= new_equiv_classes
;
3417 *equiv_class_alloc
= new_equiv_class_alloc
;
3419 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3424 if (BE (strlen ((const char *) name
) != 1, 0))
3425 return REG_ECOLLATE
;
3426 bitset_set (sbcset
, *name
);
3431 /* Helper function for parse_bracket_exp.
3432 Build the character class which is represented by NAME.
3433 The result are written to MBCSET and SBCSET.
3434 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3435 is a pointer argument sinse we may update it. */
3437 static reg_errcode_t
3438 #ifdef RE_ENABLE_I18N
3439 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3440 re_charset_t
*mbcset
, int *char_class_alloc
,
3441 const unsigned char *class_name
, reg_syntax_t syntax
)
3442 #else /* not RE_ENABLE_I18N */
3443 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3444 const unsigned char *class_name
, reg_syntax_t syntax
)
3445 #endif /* not RE_ENABLE_I18N */
3448 const char *name
= (const char *) class_name
;
3450 /* In case of REG_ICASE "upper" and "lower" match the both of
3451 upper and lower cases. */
3452 if ((syntax
& RE_ICASE
)
3453 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3456 #ifdef RE_ENABLE_I18N
3457 /* Check the space of the arrays. */
3458 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3460 /* Not enough, realloc it. */
3461 /* +1 in case of mbcset->nchar_classes is 0. */
3462 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3463 /* Use realloc since array is NULL if *alloc == 0. */
3464 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3465 new_char_class_alloc
);
3466 if (BE (new_char_classes
== NULL
, 0))
3468 mbcset
->char_classes
= new_char_classes
;
3469 *char_class_alloc
= new_char_class_alloc
;
3471 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3472 #endif /* RE_ENABLE_I18N */
3474 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3476 if (BE (trans != NULL, 0)) \
3478 for (i = 0; i < SBC_MAX; ++i) \
3479 if (ctype_func (i)) \
3480 bitset_set (sbcset, trans[i]); \
3484 for (i = 0; i < SBC_MAX; ++i) \
3485 if (ctype_func (i)) \
3486 bitset_set (sbcset, i); \
3490 if (strcmp (name
, "alnum") == 0)
3491 BUILD_CHARCLASS_LOOP (isalnum
);
3492 else if (strcmp (name
, "cntrl") == 0)
3493 BUILD_CHARCLASS_LOOP (iscntrl
);
3494 else if (strcmp (name
, "lower") == 0)
3495 BUILD_CHARCLASS_LOOP (islower
);
3496 else if (strcmp (name
, "space") == 0)
3497 BUILD_CHARCLASS_LOOP (isspace
);
3498 else if (strcmp (name
, "alpha") == 0)
3499 BUILD_CHARCLASS_LOOP (isalpha
);
3500 else if (strcmp (name
, "digit") == 0)
3501 BUILD_CHARCLASS_LOOP (isdigit
);
3502 else if (strcmp (name
, "print") == 0)
3503 BUILD_CHARCLASS_LOOP (isprint
);
3504 else if (strcmp (name
, "upper") == 0)
3505 BUILD_CHARCLASS_LOOP (isupper
);
3506 else if (strcmp (name
, "blank") == 0)
3507 BUILD_CHARCLASS_LOOP (isblank
);
3508 else if (strcmp (name
, "graph") == 0)
3509 BUILD_CHARCLASS_LOOP (isgraph
);
3510 else if (strcmp (name
, "punct") == 0)
3511 BUILD_CHARCLASS_LOOP (ispunct
);
3512 else if (strcmp (name
, "xdigit") == 0)
3513 BUILD_CHARCLASS_LOOP (isxdigit
);
3521 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3522 const unsigned char *class_name
,
3523 const unsigned char *extra
, int non_match
,
3526 re_bitset_ptr_t sbcset
;
3527 #ifdef RE_ENABLE_I18N
3528 re_charset_t
*mbcset
;
3530 #endif /* not RE_ENABLE_I18N */
3532 re_token_t br_token
;
3535 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3536 #ifdef RE_ENABLE_I18N
3537 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3538 #endif /* RE_ENABLE_I18N */
3540 #ifdef RE_ENABLE_I18N
3541 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3542 #else /* not RE_ENABLE_I18N */
3543 if (BE (sbcset
== NULL
, 0))
3544 #endif /* not RE_ENABLE_I18N */
3552 #ifdef RE_ENABLE_I18N
3553 mbcset
->non_match
= 1;
3554 #endif /* not RE_ENABLE_I18N */
3557 /* We don't care the syntax in this case. */
3558 ret
= build_charclass (trans
, sbcset
,
3559 #ifdef RE_ENABLE_I18N
3561 #endif /* RE_ENABLE_I18N */
3564 if (BE (ret
!= REG_NOERROR
, 0))
3567 #ifdef RE_ENABLE_I18N
3568 free_charset (mbcset
);
3569 #endif /* RE_ENABLE_I18N */
3573 /* \w match '_' also. */
3574 for (; *extra
; extra
++)
3575 bitset_set (sbcset
, *extra
);
3577 /* If it is non-matching list. */
3579 bitset_not (sbcset
);
3581 #ifdef RE_ENABLE_I18N
3582 /* Ensure only single byte characters are set. */
3583 if (dfa
->mb_cur_max
> 1)
3584 bitset_mask (sbcset
, dfa
->sb_char
);
3587 /* Build a tree for simple bracket. */
3588 br_token
.type
= SIMPLE_BRACKET
;
3589 br_token
.opr
.sbcset
= sbcset
;
3590 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3591 if (BE (tree
== NULL
, 0))
3592 goto build_word_op_espace
;
3594 #ifdef RE_ENABLE_I18N
3595 if (dfa
->mb_cur_max
> 1)
3597 bin_tree_t
*mbc_tree
;
3598 /* Build a tree for complex bracket. */
3599 br_token
.type
= COMPLEX_BRACKET
;
3600 br_token
.opr
.mbcset
= mbcset
;
3601 dfa
->has_mb_node
= 1;
3602 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3603 if (BE (mbc_tree
== NULL
, 0))
3604 goto build_word_op_espace
;
3605 /* Then join them by ALT node. */
3606 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3607 if (BE (mbc_tree
!= NULL
, 1))
3612 free_charset (mbcset
);
3615 #else /* not RE_ENABLE_I18N */
3617 #endif /* not RE_ENABLE_I18N */
3619 build_word_op_espace
:
3621 #ifdef RE_ENABLE_I18N
3622 free_charset (mbcset
);
3623 #endif /* RE_ENABLE_I18N */
3628 /* This is intended for the expressions like "a{1,3}".
3629 Fetch a number from `input', and return the number.
3630 Return -1, if the number field is empty like "{,1}".
3631 Return -2, If an error is occured. */
3634 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3640 fetch_token (token
, input
, syntax
);
3642 if (BE (token
->type
== END_OF_RE
, 0))
3644 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3646 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3647 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3648 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3653 #ifdef RE_ENABLE_I18N
3655 free_charset (re_charset_t
*cset
)
3657 re_free (cset
->mbchars
);
3659 re_free (cset
->coll_syms
);
3660 re_free (cset
->equiv_classes
);
3661 re_free (cset
->range_starts
);
3662 re_free (cset
->range_ends
);
3664 re_free (cset
->char_classes
);
3667 #endif /* RE_ENABLE_I18N */
3669 /* Functions for binary tree operation. */
3671 /* Create a tree node. */
3674 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3675 re_token_type_t type
)
3679 return create_token_tree (dfa
, left
, right
, &t
);
3683 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3684 const re_token_t
*token
)
3687 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3689 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3691 if (storage
== NULL
)
3693 storage
->next
= dfa
->str_tree_storage
;
3694 dfa
->str_tree_storage
= storage
;
3695 dfa
->str_tree_storage_idx
= 0;
3697 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3699 tree
->parent
= NULL
;
3701 tree
->right
= right
;
3702 tree
->token
= *token
;
3703 tree
->token
.duplicated
= 0;
3704 tree
->token
.opt_subexp
= 0;
3707 tree
->node_idx
= -1;
3710 left
->parent
= tree
;
3712 right
->parent
= tree
;
3716 /* Mark the tree SRC as an optional subexpression.
3717 To be called from preorder or postorder. */
3719 static reg_errcode_t
3720 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3722 int idx
= (int) (long) extra
;
3723 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3724 node
->token
.opt_subexp
= 1;
3729 /* Free the allocated memory inside NODE. */
3732 free_token (re_token_t
*node
)
3734 #ifdef RE_ENABLE_I18N
3735 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3736 free_charset (node
->opr
.mbcset
);
3738 #endif /* RE_ENABLE_I18N */
3739 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3740 re_free (node
->opr
.sbcset
);
3743 /* Worker function for tree walking. Free the allocated memory inside NODE
3744 and its children. */
3746 static reg_errcode_t
3747 free_tree (void *extra
, bin_tree_t
*node
)
3749 free_token (&node
->token
);
3754 /* Duplicate the node SRC, and return new node. This is a preorder
3755 visit similar to the one implemented by the generic visitor, but
3756 we need more infrastructure to maintain two parallel trees --- so,
3757 it's easier to duplicate. */
3760 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3762 const bin_tree_t
*node
;
3763 bin_tree_t
*dup_root
;
3764 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3766 for (node
= root
; ; )
3768 /* Create a new tree and link it back to the current parent. */
3769 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3772 (*p_new
)->parent
= dup_node
;
3773 (*p_new
)->token
.duplicated
= 1;
3776 /* Go to the left node, or up and to the right. */
3780 p_new
= &dup_node
->left
;
3784 const bin_tree_t
*prev
= NULL
;
3785 while (node
->right
== prev
|| node
->right
== NULL
)
3788 node
= node
->parent
;
3789 dup_node
= dup_node
->parent
;
3794 p_new
= &dup_node
->right
;