1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
50 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 int *equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 int *char_class_alloc
,
98 const unsigned char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const unsigned char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 const unsigned char *extra
,
112 int non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid
[] attribute_hidden
=
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx
[] attribute_hidden
=
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
214 re_compile_pattern (pattern
, length
, bufp
)
217 struct re_pattern_buffer
*bufp
;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
226 /* Match anchors at newline. */
227 bufp
->newline_anchor
= 1;
229 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
233 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
236 weak_alias (__re_compile_pattern
, re_compile_pattern
)
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options
;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
255 re_set_syntax (syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (bufp
)
269 struct re_pattern_buffer
*bufp
;
271 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
272 char *fastmap
= bufp
->fastmap
;
274 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
275 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
276 if (dfa
->init_state
!= dfa
->init_state_word
)
277 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
278 if (dfa
->init_state
!= dfa
->init_state_nl
)
279 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
280 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
281 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
282 bufp
->fastmap_accurate
= 1;
286 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
290 __attribute ((always_inline
))
291 re_set_fastmap (char *fastmap
, int icase
, int ch
)
295 fastmap
[tolower (ch
)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
302 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
305 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
307 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
308 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
310 int node
= init_state
->nodes
.elems
[node_cnt
];
311 re_token_type_t type
= dfa
->nodes
[node
].type
;
313 if (type
== CHARACTER
)
315 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
319 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, 0, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
354 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
355 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
356 || cset
->nranges
|| cset
->nchar_classes
)
359 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
361 /* In this case we want to catch the bytes which are
362 the first byte of any collation elements.
363 e.g. In da_DK, we want to catch 'a' since "aa"
364 is a valid collation element, and don't catch
365 'b' since 'b' is the only collation element
366 which starts from 'b'. */
367 const int32_t *table
= (const int32_t *)
368 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
369 for (i
= 0; i
< SBC_MAX
; ++i
)
371 re_set_fastmap (fastmap
, icase
, i
);
374 if (dfa
->mb_cur_max
> 1)
375 for (i
= 0; i
< SBC_MAX
; ++i
)
376 if (__btowc (i
) == WEOF
)
377 re_set_fastmap (fastmap
, icase
, i
);
378 # endif /* not _LIBC */
380 for (i
= 0; i
< cset
->nmbchars
; ++i
)
384 memset (&state
, '\0', sizeof (state
));
385 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
386 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
387 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
389 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
391 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
395 #endif /* RE_ENABLE_I18N */
396 else if (type
== OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 || type
== OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 || type
== END_OF_RE
)
402 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
403 if (type
== END_OF_RE
)
404 bufp
->can_be_null
= 1;
410 /* Entry point for POSIX code. */
411 /* regcomp takes a regular expression as a string and compiles it.
413 PREG is a regex_t *. We do not expect any fields to be initialized,
414 since POSIX says we shouldn't. Thus, we set
416 `buffer' to the compiled pattern;
417 `used' to the length of the compiled pattern;
418 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419 REG_EXTENDED bit in CFLAGS is set; otherwise, to
420 RE_SYNTAX_POSIX_BASIC;
421 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422 `fastmap' to an allocated space for the fastmap;
423 `fastmap_accurate' to zero;
424 `re_nsub' to the number of subexpressions in PATTERN.
426 PATTERN is the address of the pattern string.
428 CFLAGS is a series of bits which affect compilation.
430 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431 use POSIX basic syntax.
433 If REG_NEWLINE is set, then . and [^...] don't match newline.
434 Also, regexec will try a match beginning after every newline.
436 If REG_ICASE is set, then we considers upper- and lowercase
437 versions of letters to be equivalent when matching.
439 If REG_NOSUB is set, then when PREG is passed to regexec, that
440 routine will report only success or failure, and nothing about the
443 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
444 the return codes and their meanings.) */
447 regcomp (preg
, pattern
, cflags
)
448 regex_t
*__restrict preg
;
449 const char *__restrict pattern
;
453 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
454 : RE_SYNTAX_POSIX_BASIC
);
460 /* Try to allocate space for the fastmap. */
461 preg
->fastmap
= re_malloc (char, SBC_MAX
);
462 if (BE (preg
->fastmap
== NULL
, 0))
465 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
467 /* If REG_NEWLINE is set, newlines are treated differently. */
468 if (cflags
& REG_NEWLINE
)
469 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
470 syntax
&= ~RE_DOT_NEWLINE
;
471 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
472 /* It also changes the matching behavior. */
473 preg
->newline_anchor
= 1;
476 preg
->newline_anchor
= 0;
477 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
478 preg
->translate
= NULL
;
480 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
482 /* POSIX doesn't distinguish between an unmatched open-group and an
483 unmatched close-group: both are REG_EPAREN. */
484 if (ret
== REG_ERPAREN
)
487 /* We have already checked preg->fastmap != NULL. */
488 if (BE (ret
== REG_NOERROR
, 1))
489 /* Compute the fastmap now, since regexec cannot modify the pattern
490 buffer. This function never fails in this implementation. */
491 (void) re_compile_fastmap (preg
);
494 /* Some error occurred while compiling the expression. */
495 re_free (preg
->fastmap
);
496 preg
->fastmap
= NULL
;
502 weak_alias (__regcomp
, regcomp
)
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506 from either regcomp or regexec. We don't use PREG here. */
509 regerror (errcode
, preg
, errbuf
, errbuf_size
)
511 const regex_t
*__restrict preg
;
512 char *__restrict errbuf
;
519 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
520 / sizeof (__re_error_msgid_idx
[0])), 0))
521 /* Only error codes returned by the rest of the code should be passed
522 to this routine. If we are given anything else, or if other regex
523 code generates an invalid error code, then the program has a bug.
524 Dump core so we can fix it. */
527 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
529 msg_size
= strlen (msg
) + 1; /* Includes the null. */
531 if (BE (errbuf_size
!= 0, 1))
533 if (BE (msg_size
> errbuf_size
, 0))
535 #if defined HAVE_MEMPCPY || defined _LIBC
536 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
538 memcpy (errbuf
, msg
, errbuf_size
- 1);
539 errbuf
[errbuf_size
- 1] = 0;
543 memcpy (errbuf
, msg
, msg_size
);
549 weak_alias (__regerror
, regerror
)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map
=
560 /* Set the first 128 bits. */
561 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
567 free_dfa_content (re_dfa_t
*dfa
)
572 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
573 free_token (dfa
->nodes
+ i
);
574 re_free (dfa
->nexts
);
575 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
577 if (dfa
->eclosures
!= NULL
)
578 re_node_set_free (dfa
->eclosures
+ i
);
579 if (dfa
->inveclosures
!= NULL
)
580 re_node_set_free (dfa
->inveclosures
+ i
);
581 if (dfa
->edests
!= NULL
)
582 re_node_set_free (dfa
->edests
+ i
);
584 re_free (dfa
->edests
);
585 re_free (dfa
->eclosures
);
586 re_free (dfa
->inveclosures
);
587 re_free (dfa
->nodes
);
589 if (dfa
->state_table
)
590 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
592 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
593 for (j
= 0; j
< entry
->num
; ++j
)
595 re_dfastate_t
*state
= entry
->array
[j
];
598 re_free (entry
->array
);
600 re_free (dfa
->state_table
);
601 #ifdef RE_ENABLE_I18N
602 if (dfa
->sb_char
!= utf8_sb_map
)
603 re_free (dfa
->sb_char
);
605 re_free (dfa
->subexp_map
);
607 re_free (dfa
->re_str
);
614 /* Free dynamically allocated space used by PREG. */
620 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
621 if (BE (dfa
!= NULL
, 1))
622 free_dfa_content (dfa
);
626 re_free (preg
->fastmap
);
627 preg
->fastmap
= NULL
;
629 re_free (preg
->translate
);
630 preg
->translate
= NULL
;
633 weak_alias (__regfree
, regfree
)
636 /* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
639 #if defined _REGEX_RE_COMP || defined _LIBC
641 /* BSD has one and only one pattern buffer. */
642 static struct re_pattern_buffer re_comp_buf
;
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647 these names if they don't use our functions, and still use
648 regcomp/regexec above without link errors. */
659 if (!re_comp_buf
.buffer
)
660 return gettext ("No previous regular expression");
664 if (re_comp_buf
.buffer
)
666 fastmap
= re_comp_buf
.fastmap
;
667 re_comp_buf
.fastmap
= NULL
;
668 __regfree (&re_comp_buf
);
669 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
670 re_comp_buf
.fastmap
= fastmap
;
673 if (re_comp_buf
.fastmap
== NULL
)
675 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
676 if (re_comp_buf
.fastmap
== NULL
)
677 return (char *) gettext (__re_error_msgid
678 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
681 /* Since `re_exec' always passes NULL for the `regs' argument, we
682 don't need to initialize the pattern buffer fields which affect it. */
684 /* Match anchors at newlines. */
685 re_comp_buf
.newline_anchor
= 1;
687 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
692 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
693 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
697 libc_freeres_fn (free_mem
)
699 __regfree (&re_comp_buf
);
703 #endif /* _REGEX_RE_COMP */
705 /* Internal entry point.
706 Compile the regular expression PATTERN, whose length is LENGTH.
707 SYNTAX indicate regular expression's syntax. */
710 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
713 reg_errcode_t err
= REG_NOERROR
;
717 /* Initialize the pattern buffer. */
718 preg
->fastmap_accurate
= 0;
719 preg
->syntax
= syntax
;
720 preg
->not_bol
= preg
->not_eol
= 0;
723 preg
->can_be_null
= 0;
724 preg
->regs_allocated
= REGS_UNALLOCATED
;
726 /* Initialize the dfa. */
727 dfa
= (re_dfa_t
*) preg
->buffer
;
728 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
730 /* If zero allocated, but buffer is non-null, try to realloc
731 enough space. This loses if buffer's address is bogus, but
732 that is the user's responsibility. If ->buffer is NULL this
733 is a simple allocation. */
734 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
737 preg
->allocated
= sizeof (re_dfa_t
);
738 preg
->buffer
= (unsigned char *) dfa
;
740 preg
->used
= sizeof (re_dfa_t
);
742 err
= init_dfa (dfa
, length
);
743 if (BE (err
!= REG_NOERROR
, 0))
745 free_dfa_content (dfa
);
751 /* Note: length+1 will not overflow since it is checked in init_dfa. */
752 dfa
->re_str
= re_malloc (char, length
+ 1);
753 strncpy (dfa
->re_str
, pattern
, length
+ 1);
756 __libc_lock_init (dfa
->lock
);
758 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
759 syntax
& RE_ICASE
, dfa
);
760 if (BE (err
!= REG_NOERROR
, 0))
762 re_compile_internal_free_return
:
763 free_workarea_compile (preg
);
764 re_string_destruct (®exp
);
765 free_dfa_content (dfa
);
771 /* Parse the regular expression, and build a structure tree. */
773 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
774 if (BE (dfa
->str_tree
== NULL
, 0))
775 goto re_compile_internal_free_return
;
777 /* Analyze the tree and create the nfa. */
778 err
= analyze (preg
);
779 if (BE (err
!= REG_NOERROR
, 0))
780 goto re_compile_internal_free_return
;
782 #ifdef RE_ENABLE_I18N
783 /* If possible, do searching in single byte encoding to speed things up. */
784 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
788 /* Then create the initial state of the dfa. */
789 err
= create_initial_state (dfa
);
791 /* Release work areas. */
792 free_workarea_compile (preg
);
793 re_string_destruct (®exp
);
795 if (BE (err
!= REG_NOERROR
, 0))
797 free_dfa_content (dfa
);
805 /* Initialize DFA. We use the length of the regular expression PAT_LEN
806 as the initial length of some arrays. */
809 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
811 unsigned int table_size
;
816 memset (dfa
, '\0', sizeof (re_dfa_t
));
818 /* Force allocation of str_tree_storage the first time. */
819 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
821 /* Avoid overflows. */
822 if (pat_len
== SIZE_MAX
)
825 dfa
->nodes_alloc
= pat_len
+ 1;
826 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
828 /* table_size = 2 ^ ceil(log pat_len) */
829 for (table_size
= 1; ; table_size
<<= 1)
830 if (table_size
> pat_len
)
833 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
834 dfa
->state_hash_mask
= table_size
- 1;
836 dfa
->mb_cur_max
= MB_CUR_MAX
;
838 if (dfa
->mb_cur_max
== 6
839 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
841 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
844 # ifdef HAVE_LANGINFO_CODESET
845 codeset_name
= nl_langinfo (CODESET
);
847 codeset_name
= getenv ("LC_ALL");
848 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
849 codeset_name
= getenv ("LC_CTYPE");
850 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
851 codeset_name
= getenv ("LANG");
852 if (codeset_name
== NULL
)
854 else if (strchr (codeset_name
, '.') != NULL
)
855 codeset_name
= strchr (codeset_name
, '.') + 1;
858 if (strcasecmp (codeset_name
, "UTF-8") == 0
859 || strcasecmp (codeset_name
, "UTF8") == 0)
862 /* We check exhaustively in the loop below if this charset is a
863 superset of ASCII. */
864 dfa
->map_notascii
= 0;
867 #ifdef RE_ENABLE_I18N
868 if (dfa
->mb_cur_max
> 1)
871 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
876 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
877 if (BE (dfa
->sb_char
== NULL
, 0))
880 /* Set the bits corresponding to single byte chars. */
881 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
882 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
884 wint_t wch
= __btowc (ch
);
886 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
888 if (isascii (ch
) && wch
!= ch
)
889 dfa
->map_notascii
= 1;
896 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
901 /* Initialize WORD_CHAR table, which indicate which character is
902 "word". In this case "word" means that it is the word construction
903 character used by some operators like "\<", "\>", etc. */
907 init_word_char (re_dfa_t
*dfa
)
910 dfa
->word_ops_used
= 1;
911 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
912 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
913 if (isalnum (ch
) || ch
== '_')
914 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
917 /* Free the work area which are only used while compiling. */
920 free_workarea_compile (regex_t
*preg
)
922 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
923 bin_tree_storage_t
*storage
, *next
;
924 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
926 next
= storage
->next
;
929 dfa
->str_tree_storage
= NULL
;
930 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
931 dfa
->str_tree
= NULL
;
932 re_free (dfa
->org_indices
);
933 dfa
->org_indices
= NULL
;
936 /* Create initial states for all contexts. */
939 create_initial_state (re_dfa_t
*dfa
)
943 re_node_set init_nodes
;
945 /* Initial states have the epsilon closure of the node which is
946 the first node of the regular expression. */
947 first
= dfa
->str_tree
->first
->node_idx
;
948 dfa
->init_node
= first
;
949 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
950 if (BE (err
!= REG_NOERROR
, 0))
953 /* The back-references which are in initial states can epsilon transit,
954 since in this case all of the subexpressions can be null.
955 Then we add epsilon closures of the nodes which are the next nodes of
956 the back-references. */
957 if (dfa
->nbackref
> 0)
958 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
960 int node_idx
= init_nodes
.elems
[i
];
961 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
964 if (type
!= OP_BACK_REF
)
966 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
968 re_token_t
*clexp_node
;
969 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
970 if (clexp_node
->type
== OP_CLOSE_SUBEXP
971 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
974 if (clexp_idx
== init_nodes
.nelem
)
977 if (type
== OP_BACK_REF
)
979 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
980 if (!re_node_set_contains (&init_nodes
, dest_idx
))
982 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
988 /* It must be the first time to invoke acquire_state. */
989 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
990 /* We don't check ERR here, since the initial state must not be NULL. */
991 if (BE (dfa
->init_state
== NULL
, 0))
993 if (dfa
->init_state
->has_constraint
)
995 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
997 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
999 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1003 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1004 || dfa
->init_state_begbuf
== NULL
, 0))
1008 dfa
->init_state_word
= dfa
->init_state_nl
1009 = dfa
->init_state_begbuf
= dfa
->init_state
;
1011 re_node_set_free (&init_nodes
);
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018 DFA nodes where needed. */
1021 optimize_utf8 (re_dfa_t
*dfa
)
1023 int node
, i
, mb_chars
= 0, has_period
= 0;
1025 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1026 switch (dfa
->nodes
[node
].type
)
1029 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1033 switch (dfa
->nodes
[node
].opr
.idx
)
1041 /* Word anchors etc. cannot be handled. */
1051 case OP_DUP_ASTERISK
:
1052 case OP_OPEN_SUBEXP
:
1053 case OP_CLOSE_SUBEXP
:
1055 case COMPLEX_BRACKET
:
1057 case SIMPLE_BRACKET
:
1058 /* Just double check. The non-ASCII range starts at 0x80. */
1059 assert (0x80 % BITSET_WORD_BITS
== 0);
1060 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1061 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1068 if (mb_chars
|| has_period
)
1069 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1071 if (dfa
->nodes
[node
].type
== CHARACTER
1072 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1073 dfa
->nodes
[node
].mb_partial
= 0;
1074 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1075 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1078 /* The search can be in single byte locale. */
1079 dfa
->mb_cur_max
= 1;
1081 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1085 /* Analyze the structure tree, and calculate "first", "next", "edest",
1086 "eclosure", and "inveclosure". */
1088 static reg_errcode_t
1089 analyze (regex_t
*preg
)
1091 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1094 /* Allocate arrays. */
1095 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1096 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1097 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1098 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1099 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1100 || dfa
->eclosures
== NULL
, 0))
1103 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1104 if (dfa
->subexp_map
!= NULL
)
1107 for (i
= 0; i
< preg
->re_nsub
; i
++)
1108 dfa
->subexp_map
[i
] = i
;
1109 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1110 for (i
= 0; i
< preg
->re_nsub
; i
++)
1111 if (dfa
->subexp_map
[i
] != i
)
1113 if (i
== preg
->re_nsub
)
1115 free (dfa
->subexp_map
);
1116 dfa
->subexp_map
= NULL
;
1120 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1121 if (BE (ret
!= REG_NOERROR
, 0))
1123 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1124 if (BE (ret
!= REG_NOERROR
, 0))
1126 preorder (dfa
->str_tree
, calc_next
, dfa
);
1127 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1128 if (BE (ret
!= REG_NOERROR
, 0))
1130 ret
= calc_eclosure (dfa
);
1131 if (BE (ret
!= REG_NOERROR
, 0))
1134 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1135 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1136 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1139 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1140 if (BE (dfa
->inveclosures
== NULL
, 0))
1142 ret
= calc_inveclosure (dfa
);
1148 /* Our parse trees are very unbalanced, so we cannot use a stack to
1149 implement parse tree visits. Instead, we use parent pointers and
1150 some hairy code in these two functions. */
1151 static reg_errcode_t
1152 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1155 bin_tree_t
*node
, *prev
;
1157 for (node
= root
; ; )
1159 /* Descend down the tree, preferably to the left (or to the right
1160 if that's the only child). */
1161 while (node
->left
|| node
->right
)
1169 reg_errcode_t err
= fn (extra
, node
);
1170 if (BE (err
!= REG_NOERROR
, 0))
1172 if (node
->parent
== NULL
)
1175 node
= node
->parent
;
1177 /* Go up while we have a node that is reached from the right. */
1178 while (node
->right
== prev
|| node
->right
== NULL
);
1183 static reg_errcode_t
1184 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1189 for (node
= root
; ; )
1191 reg_errcode_t err
= fn (extra
, node
);
1192 if (BE (err
!= REG_NOERROR
, 0))
1195 /* Go to the left node, or up and to the right. */
1200 bin_tree_t
*prev
= NULL
;
1201 while (node
->right
== prev
|| node
->right
== NULL
)
1204 node
= node
->parent
;
1213 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1214 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1215 backreferences as well. Requires a preorder visit. */
1216 static reg_errcode_t
1217 optimize_subexps (void *extra
, bin_tree_t
*node
)
1219 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1221 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1223 int idx
= node
->token
.opr
.idx
;
1224 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1225 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1228 else if (node
->token
.type
== SUBEXP
1229 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1231 int other_idx
= node
->left
->token
.opr
.idx
;
1233 node
->left
= node
->left
->left
;
1235 node
->left
->parent
= node
;
1237 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1238 if (other_idx
< BITSET_WORD_BITS
)
1239 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1245 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1246 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1247 static reg_errcode_t
1248 lower_subexps (void *extra
, bin_tree_t
*node
)
1250 regex_t
*preg
= (regex_t
*) extra
;
1251 reg_errcode_t err
= REG_NOERROR
;
1253 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1255 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1257 node
->left
->parent
= node
;
1259 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1261 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1263 node
->right
->parent
= node
;
1270 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1272 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1273 bin_tree_t
*body
= node
->left
;
1274 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1277 /* We do not optimize empty subexpressions, because otherwise we may
1278 have bad CONCAT nodes with NULL children. This is obviously not
1279 very common, so we do not lose much. An example that triggers
1280 this case is the sed "script" /\(\)/x. */
1281 && node
->left
!= NULL
1282 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1283 || !(dfa
->used_bkref_map
1284 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1287 /* Convert the SUBEXP node to the concatenation of an
1288 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1289 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1290 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1291 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1292 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1293 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1299 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1300 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1304 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1305 nodes. Requires a postorder visit. */
1306 static reg_errcode_t
1307 calc_first (void *extra
, bin_tree_t
*node
)
1309 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1310 if (node
->token
.type
== CONCAT
)
1312 node
->first
= node
->left
->first
;
1313 node
->node_idx
= node
->left
->node_idx
;
1318 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1319 if (BE (node
->node_idx
== -1, 0))
1325 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1326 static reg_errcode_t
1327 calc_next (void *extra
, bin_tree_t
*node
)
1329 switch (node
->token
.type
)
1331 case OP_DUP_ASTERISK
:
1332 node
->left
->next
= node
;
1335 node
->left
->next
= node
->right
->first
;
1336 node
->right
->next
= node
->next
;
1340 node
->left
->next
= node
->next
;
1342 node
->right
->next
= node
->next
;
1348 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1349 static reg_errcode_t
1350 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1352 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1353 int idx
= node
->node_idx
;
1354 reg_errcode_t err
= REG_NOERROR
;
1356 switch (node
->token
.type
)
1362 assert (node
->next
== NULL
);
1365 case OP_DUP_ASTERISK
:
1369 dfa
->has_plural_match
= 1;
1370 if (node
->left
!= NULL
)
1371 left
= node
->left
->first
->node_idx
;
1373 left
= node
->next
->node_idx
;
1374 if (node
->right
!= NULL
)
1375 right
= node
->right
->first
->node_idx
;
1377 right
= node
->next
->node_idx
;
1379 assert (right
> -1);
1380 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1385 case OP_OPEN_SUBEXP
:
1386 case OP_CLOSE_SUBEXP
:
1387 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1391 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1392 if (node
->token
.type
== OP_BACK_REF
)
1393 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1397 assert (!IS_EPSILON_NODE (node
->token
.type
));
1398 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1405 /* Duplicate the epsilon closure of the node ROOT_NODE.
1406 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1407 to their own constraint. */
1409 static reg_errcode_t
1411 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1412 int root_node
, unsigned int init_constraint
)
1414 int org_node
, clone_node
, ret
;
1415 unsigned int constraint
= init_constraint
;
1416 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1418 int org_dest
, clone_dest
;
1419 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1421 /* If the back reference epsilon-transit, its destination must
1422 also have the constraint. Then duplicate the epsilon closure
1423 of the destination of the back reference, and store it in
1424 edests of the back reference. */
1425 org_dest
= dfa
->nexts
[org_node
];
1426 re_node_set_empty (dfa
->edests
+ clone_node
);
1427 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1428 if (BE (clone_dest
== -1, 0))
1430 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1431 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1432 if (BE (ret
< 0, 0))
1435 else if (dfa
->edests
[org_node
].nelem
== 0)
1437 /* In case of the node can't epsilon-transit, don't duplicate the
1438 destination and store the original destination as the
1439 destination of the node. */
1440 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1443 else if (dfa
->edests
[org_node
].nelem
== 1)
1445 /* In case of the node can epsilon-transit, and it has only one
1447 org_dest
= dfa
->edests
[org_node
].elems
[0];
1448 re_node_set_empty (dfa
->edests
+ clone_node
);
1449 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1451 /* In case of the node has another constraint, append it. */
1452 if (org_node
== root_node
&& clone_node
!= org_node
)
1454 /* ...but if the node is root_node itself, it means the
1455 epsilon closure have a loop, then tie it to the
1456 destination of the root_node. */
1457 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1459 if (BE (ret
< 0, 0))
1463 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1465 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1466 if (BE (clone_dest
== -1, 0))
1468 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1469 if (BE (ret
< 0, 0))
1472 else /* dfa->edests[org_node].nelem == 2 */
1474 /* In case of the node can epsilon-transit, and it has two
1475 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1476 org_dest
= dfa
->edests
[org_node
].elems
[0];
1477 re_node_set_empty (dfa
->edests
+ clone_node
);
1478 /* Search for a duplicated node which satisfies the constraint. */
1479 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1480 if (clone_dest
== -1)
1482 /* There are no such a duplicated node, create a new one. */
1484 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1485 if (BE (clone_dest
== -1, 0))
1487 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1488 if (BE (ret
< 0, 0))
1490 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1491 root_node
, constraint
);
1492 if (BE (err
!= REG_NOERROR
, 0))
1497 /* There are a duplicated node which satisfy the constraint,
1498 use it to avoid infinite loop. */
1499 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1500 if (BE (ret
< 0, 0))
1504 org_dest
= dfa
->edests
[org_node
].elems
[1];
1505 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1506 if (BE (clone_dest
== -1, 0))
1508 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1509 if (BE (ret
< 0, 0))
1512 org_node
= org_dest
;
1513 clone_node
= clone_dest
;
1518 /* Search for a node which is duplicated from the node ORG_NODE, and
1519 satisfies the constraint CONSTRAINT. */
1522 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1523 unsigned int constraint
)
1526 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1528 if (org_node
== dfa
->org_indices
[idx
]
1529 && constraint
== dfa
->nodes
[idx
].constraint
)
1530 return idx
; /* Found. */
1532 return -1; /* Not found. */
1535 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1536 Return the index of the new node, or -1 if insufficient storage is
1540 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1542 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1543 if (BE (dup_idx
!= -1, 1))
1545 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1546 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1547 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1548 dfa
->nodes
[dup_idx
].duplicated
= 1;
1550 /* Store the index of the original node. */
1551 dfa
->org_indices
[dup_idx
] = org_idx
;
1556 static reg_errcode_t
1557 calc_inveclosure (re_dfa_t
*dfa
)
1560 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1561 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1563 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1565 int *elems
= dfa
->eclosures
[src
].elems
;
1566 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1568 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1569 if (BE (ret
== -1, 0))
1577 /* Calculate "eclosure" for all the node in DFA. */
1579 static reg_errcode_t
1580 calc_eclosure (re_dfa_t
*dfa
)
1582 int node_idx
, incomplete
;
1584 assert (dfa
->nodes_len
> 0);
1587 /* For each nodes, calculate epsilon closure. */
1588 for (node_idx
= 0; ; ++node_idx
)
1591 re_node_set eclosure_elem
;
1592 if (node_idx
== dfa
->nodes_len
)
1601 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1604 /* If we have already calculated, skip it. */
1605 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1607 /* Calculate epsilon closure of `node_idx'. */
1608 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1609 if (BE (err
!= REG_NOERROR
, 0))
1612 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1615 re_node_set_free (&eclosure_elem
);
1621 /* Calculate epsilon closure of NODE. */
1623 static reg_errcode_t
1624 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1627 unsigned int constraint
;
1629 re_node_set eclosure
;
1631 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1632 if (BE (err
!= REG_NOERROR
, 0))
1635 /* This indicates that we are calculating this node now.
1636 We reference this value to avoid infinite loop. */
1637 dfa
->eclosures
[node
].nelem
= -1;
1639 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1640 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1641 /* If the current node has constraints, duplicate all nodes.
1642 Since they must inherit the constraints. */
1644 && dfa
->edests
[node
].nelem
1645 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1647 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1648 if (BE (err
!= REG_NOERROR
, 0))
1652 /* Expand each epsilon destination nodes. */
1653 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1654 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1656 re_node_set eclosure_elem
;
1657 int edest
= dfa
->edests
[node
].elems
[i
];
1658 /* If calculating the epsilon closure of `edest' is in progress,
1659 return intermediate result. */
1660 if (dfa
->eclosures
[edest
].nelem
== -1)
1665 /* If we haven't calculated the epsilon closure of `edest' yet,
1666 calculate now. Otherwise use calculated epsilon closure. */
1667 if (dfa
->eclosures
[edest
].nelem
== 0)
1669 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1670 if (BE (err
!= REG_NOERROR
, 0))
1674 eclosure_elem
= dfa
->eclosures
[edest
];
1675 /* Merge the epsilon closure of `edest'. */
1676 re_node_set_merge (&eclosure
, &eclosure_elem
);
1677 /* If the epsilon closure of `edest' is incomplete,
1678 the epsilon closure of this node is also incomplete. */
1679 if (dfa
->eclosures
[edest
].nelem
== 0)
1682 re_node_set_free (&eclosure_elem
);
1686 /* Epsilon closures include itself. */
1687 re_node_set_insert (&eclosure
, node
);
1688 if (incomplete
&& !root
)
1689 dfa
->eclosures
[node
].nelem
= 0;
1691 dfa
->eclosures
[node
] = eclosure
;
1692 *new_set
= eclosure
;
1696 /* Functions for token which are used in the parser. */
1698 /* Fetch a token from INPUT.
1699 We must not use this function inside bracket expressions. */
1703 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1705 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1708 /* Peek a token from INPUT, and return the length of the token.
1709 We must not use this function inside bracket expressions. */
1713 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1717 if (re_string_eoi (input
))
1719 token
->type
= END_OF_RE
;
1723 c
= re_string_peek_byte (input
, 0);
1726 token
->word_char
= 0;
1727 #ifdef RE_ENABLE_I18N
1728 token
->mb_partial
= 0;
1729 if (input
->mb_cur_max
> 1 &&
1730 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1732 token
->type
= CHARACTER
;
1733 token
->mb_partial
= 1;
1740 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1742 token
->type
= BACK_SLASH
;
1746 c2
= re_string_peek_byte_case (input
, 1);
1748 token
->type
= CHARACTER
;
1749 #ifdef RE_ENABLE_I18N
1750 if (input
->mb_cur_max
> 1)
1752 wint_t wc
= re_string_wchar_at (input
,
1753 re_string_cur_idx (input
) + 1);
1754 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1758 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1763 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1764 token
->type
= OP_ALT
;
1766 case '1': case '2': case '3': case '4': case '5':
1767 case '6': case '7': case '8': case '9':
1768 if (!(syntax
& RE_NO_BK_REFS
))
1770 token
->type
= OP_BACK_REF
;
1771 token
->opr
.idx
= c2
- '1';
1775 if (!(syntax
& RE_NO_GNU_OPS
))
1777 token
->type
= ANCHOR
;
1778 token
->opr
.ctx_type
= WORD_FIRST
;
1782 if (!(syntax
& RE_NO_GNU_OPS
))
1784 token
->type
= ANCHOR
;
1785 token
->opr
.ctx_type
= WORD_LAST
;
1789 if (!(syntax
& RE_NO_GNU_OPS
))
1791 token
->type
= ANCHOR
;
1792 token
->opr
.ctx_type
= WORD_DELIM
;
1796 if (!(syntax
& RE_NO_GNU_OPS
))
1798 token
->type
= ANCHOR
;
1799 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1803 if (!(syntax
& RE_NO_GNU_OPS
))
1804 token
->type
= OP_WORD
;
1807 if (!(syntax
& RE_NO_GNU_OPS
))
1808 token
->type
= OP_NOTWORD
;
1811 if (!(syntax
& RE_NO_GNU_OPS
))
1812 token
->type
= OP_SPACE
;
1815 if (!(syntax
& RE_NO_GNU_OPS
))
1816 token
->type
= OP_NOTSPACE
;
1819 if (!(syntax
& RE_NO_GNU_OPS
))
1821 token
->type
= ANCHOR
;
1822 token
->opr
.ctx_type
= BUF_FIRST
;
1826 if (!(syntax
& RE_NO_GNU_OPS
))
1828 token
->type
= ANCHOR
;
1829 token
->opr
.ctx_type
= BUF_LAST
;
1833 if (!(syntax
& RE_NO_BK_PARENS
))
1834 token
->type
= OP_OPEN_SUBEXP
;
1837 if (!(syntax
& RE_NO_BK_PARENS
))
1838 token
->type
= OP_CLOSE_SUBEXP
;
1841 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1842 token
->type
= OP_DUP_PLUS
;
1845 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1846 token
->type
= OP_DUP_QUESTION
;
1849 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1850 token
->type
= OP_OPEN_DUP_NUM
;
1853 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1854 token
->type
= OP_CLOSE_DUP_NUM
;
1862 token
->type
= CHARACTER
;
1863 #ifdef RE_ENABLE_I18N
1864 if (input
->mb_cur_max
> 1)
1866 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1867 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1871 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1876 if (syntax
& RE_NEWLINE_ALT
)
1877 token
->type
= OP_ALT
;
1880 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1881 token
->type
= OP_ALT
;
1884 token
->type
= OP_DUP_ASTERISK
;
1887 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1888 token
->type
= OP_DUP_PLUS
;
1891 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1892 token
->type
= OP_DUP_QUESTION
;
1895 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1896 token
->type
= OP_OPEN_DUP_NUM
;
1899 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1900 token
->type
= OP_CLOSE_DUP_NUM
;
1903 if (syntax
& RE_NO_BK_PARENS
)
1904 token
->type
= OP_OPEN_SUBEXP
;
1907 if (syntax
& RE_NO_BK_PARENS
)
1908 token
->type
= OP_CLOSE_SUBEXP
;
1911 token
->type
= OP_OPEN_BRACKET
;
1914 token
->type
= OP_PERIOD
;
1917 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1918 re_string_cur_idx (input
) != 0)
1920 char prev
= re_string_peek_byte (input
, -1);
1921 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1924 token
->type
= ANCHOR
;
1925 token
->opr
.ctx_type
= LINE_FIRST
;
1928 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1929 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1932 re_string_skip_bytes (input
, 1);
1933 peek_token (&next
, input
, syntax
);
1934 re_string_skip_bytes (input
, -1);
1935 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1938 token
->type
= ANCHOR
;
1939 token
->opr
.ctx_type
= LINE_LAST
;
1947 /* Peek a token from INPUT, and return the length of the token.
1948 We must not use this function out of bracket expressions. */
1952 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1955 if (re_string_eoi (input
))
1957 token
->type
= END_OF_RE
;
1960 c
= re_string_peek_byte (input
, 0);
1963 #ifdef RE_ENABLE_I18N
1964 if (input
->mb_cur_max
> 1 &&
1965 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1967 token
->type
= CHARACTER
;
1970 #endif /* RE_ENABLE_I18N */
1972 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1973 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1975 /* In this case, '\' escape a character. */
1977 re_string_skip_bytes (input
, 1);
1978 c2
= re_string_peek_byte (input
, 0);
1980 token
->type
= CHARACTER
;
1983 if (c
== '[') /* '[' is a special char in a bracket exps. */
1987 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
1988 c2
= re_string_peek_byte (input
, 1);
1996 token
->type
= OP_OPEN_COLL_ELEM
;
1999 token
->type
= OP_OPEN_EQUIV_CLASS
;
2002 if (syntax
& RE_CHAR_CLASSES
)
2004 token
->type
= OP_OPEN_CHAR_CLASS
;
2007 /* else fall through. */
2009 token
->type
= CHARACTER
;
2019 token
->type
= OP_CHARSET_RANGE
;
2022 token
->type
= OP_CLOSE_BRACKET
;
2025 token
->type
= OP_NON_MATCH_LIST
;
2028 token
->type
= CHARACTER
;
2033 /* Functions for parser. */
2035 /* Entry point of the parser.
2036 Parse the regular expression REGEXP and return the structure tree.
2037 If an error is occured, ERR is set by error code, and return NULL.
2038 This function build the following tree, from regular expression <reg_exp>:
2044 CAT means concatenation.
2045 EOR means end of regular expression. */
2048 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2051 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2052 bin_tree_t
*tree
, *eor
, *root
;
2053 re_token_t current_token
;
2054 dfa
->syntax
= syntax
;
2055 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2056 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2057 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2059 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2061 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2064 if (BE (eor
== NULL
|| root
== NULL
, 0))
2072 /* This function build the following tree, from regular expression
2073 <branch1>|<branch2>:
2079 ALT means alternative, which represents the operator `|'. */
2082 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2083 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2085 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2086 bin_tree_t
*tree
, *branch
= NULL
;
2087 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2088 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2091 while (token
->type
== OP_ALT
)
2093 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2094 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2095 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2097 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2098 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2103 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2104 if (BE (tree
== NULL
, 0))
2113 /* This function build the following tree, from regular expression
2120 CAT means concatenation. */
2123 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2124 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2126 bin_tree_t
*tree
, *exp
;
2127 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2128 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2129 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2132 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2133 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2135 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2136 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2140 if (tree
!= NULL
&& exp
!= NULL
)
2142 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2149 else if (tree
== NULL
)
2151 /* Otherwise exp == NULL, we don't need to create new tree. */
2156 /* This function build the following tree, from regular expression a*:
2163 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2164 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2166 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2168 switch (token
->type
)
2171 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2172 if (BE (tree
== NULL
, 0))
2177 #ifdef RE_ENABLE_I18N
2178 if (dfa
->mb_cur_max
> 1)
2180 while (!re_string_eoi (regexp
)
2181 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2183 bin_tree_t
*mbc_remain
;
2184 fetch_token (token
, regexp
, syntax
);
2185 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2186 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2187 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2196 case OP_OPEN_SUBEXP
:
2197 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2198 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2201 case OP_OPEN_BRACKET
:
2202 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2203 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2207 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2212 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2213 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2214 if (BE (tree
== NULL
, 0))
2220 dfa
->has_mb_node
= 1;
2222 case OP_OPEN_DUP_NUM
:
2223 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2229 case OP_DUP_ASTERISK
:
2231 case OP_DUP_QUESTION
:
2232 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2237 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2239 fetch_token (token
, regexp
, syntax
);
2240 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2242 /* else fall through */
2243 case OP_CLOSE_SUBEXP
:
2244 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2245 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2250 /* else fall through */
2251 case OP_CLOSE_DUP_NUM
:
2252 /* We treat it as a normal character. */
2254 /* Then we can these characters as normal characters. */
2255 token
->type
= CHARACTER
;
2256 /* mb_partial and word_char bits should be initialized already
2258 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2259 if (BE (tree
== NULL
, 0))
2266 if ((token
->opr
.ctx_type
2267 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2268 && dfa
->word_ops_used
== 0)
2269 init_word_char (dfa
);
2270 if (token
->opr
.ctx_type
== WORD_DELIM
2271 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2273 bin_tree_t
*tree_first
, *tree_last
;
2274 if (token
->opr
.ctx_type
== WORD_DELIM
)
2276 token
->opr
.ctx_type
= WORD_FIRST
;
2277 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2278 token
->opr
.ctx_type
= WORD_LAST
;
2282 token
->opr
.ctx_type
= INSIDE_WORD
;
2283 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2284 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2286 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2287 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2288 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2296 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2297 if (BE (tree
== NULL
, 0))
2303 /* We must return here, since ANCHORs can't be followed
2304 by repetition operators.
2305 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2306 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2307 fetch_token (token
, regexp
, syntax
);
2310 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2311 if (BE (tree
== NULL
, 0))
2316 if (dfa
->mb_cur_max
> 1)
2317 dfa
->has_mb_node
= 1;
2321 tree
= build_charclass_op (dfa
, regexp
->trans
,
2322 (const unsigned char *) "alnum",
2323 (const unsigned char *) "_",
2324 token
->type
== OP_NOTWORD
, err
);
2325 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2330 tree
= build_charclass_op (dfa
, regexp
->trans
,
2331 (const unsigned char *) "space",
2332 (const unsigned char *) "",
2333 token
->type
== OP_NOTSPACE
, err
);
2334 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2344 /* Must not happen? */
2350 fetch_token (token
, regexp
, syntax
);
2352 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2353 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2355 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2356 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2358 /* In BRE consecutive duplications are not allowed. */
2359 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2360 && (token
->type
== OP_DUP_ASTERISK
2361 || token
->type
== OP_OPEN_DUP_NUM
))
2371 /* This function build the following tree, from regular expression
2379 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2380 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2382 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2385 cur_nsub
= preg
->re_nsub
++;
2387 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2389 /* The subexpression may be a null string. */
2390 if (token
->type
== OP_CLOSE_SUBEXP
)
2394 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2395 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2397 if (BE (*err
!= REG_NOERROR
, 0))
2401 if (cur_nsub
<= '9' - '1')
2402 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2404 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2405 if (BE (tree
== NULL
, 0))
2410 tree
->token
.opr
.idx
= cur_nsub
;
2414 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2417 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2418 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2420 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2421 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2422 re_token_t start_token
= *token
;
2424 if (token
->type
== OP_OPEN_DUP_NUM
)
2427 start
= fetch_number (regexp
, token
, syntax
);
2430 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2431 start
= 0; /* We treat "{,m}" as "{0,m}". */
2434 *err
= REG_BADBR
; /* <re>{} is invalid. */
2438 if (BE (start
!= -2, 1))
2440 /* We treat "{n}" as "{n,n}". */
2441 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2442 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2443 ? fetch_number (regexp
, token
, syntax
) : -2));
2445 if (BE (start
== -2 || end
== -2, 0))
2447 /* Invalid sequence. */
2448 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2450 if (token
->type
== END_OF_RE
)
2458 /* If the syntax bit is set, rollback. */
2459 re_string_set_index (regexp
, start_idx
);
2460 *token
= start_token
;
2461 token
->type
= CHARACTER
;
2462 /* mb_partial and word_char bits should be already initialized by
2467 if (BE (end
!= -1 && start
> end
, 0))
2469 /* First number greater than second. */
2476 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2477 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2480 fetch_token (token
, regexp
, syntax
);
2482 if (BE (elem
== NULL
, 0))
2484 if (BE (start
== 0 && end
== 0, 0))
2486 postorder (elem
, free_tree
, NULL
);
2490 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2491 if (BE (start
> 0, 0))
2494 for (i
= 2; i
<= start
; ++i
)
2496 elem
= duplicate_tree (elem
, dfa
);
2497 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2498 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2499 goto parse_dup_op_espace
;
2505 /* Duplicate ELEM before it is marked optional. */
2506 elem
= duplicate_tree (elem
, dfa
);
2512 if (elem
->token
.type
== SUBEXP
)
2513 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2515 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2516 if (BE (tree
== NULL
, 0))
2517 goto parse_dup_op_espace
;
2519 /* This loop is actually executed only when end != -1,
2520 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2521 already created the start+1-th copy. */
2522 for (i
= start
+ 2; i
<= end
; ++i
)
2524 elem
= duplicate_tree (elem
, dfa
);
2525 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2526 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2527 goto parse_dup_op_espace
;
2529 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2530 if (BE (tree
== NULL
, 0))
2531 goto parse_dup_op_espace
;
2535 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2539 parse_dup_op_espace
:
2544 /* Size of the names for collating symbol/equivalence_class/character_class.
2545 I'm not sure, but maybe enough. */
2546 #define BRACKET_NAME_BUF_SIZE 32
2549 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2550 Build the range expression which starts from START_ELEM, and ends
2551 at END_ELEM. The result are written to MBCSET and SBCSET.
2552 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2553 mbcset->range_ends, is a pointer argument sinse we may
2556 static reg_errcode_t
2558 # ifdef RE_ENABLE_I18N
2559 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2560 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2561 # else /* not RE_ENABLE_I18N */
2562 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2563 bracket_elem_t
*end_elem
)
2564 # endif /* not RE_ENABLE_I18N */
2566 unsigned int start_ch
, end_ch
;
2567 /* Equivalence Classes and Character Classes can't be a range start/end. */
2568 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2569 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2573 /* We can handle no multi character collating elements without libc
2575 if (BE ((start_elem
->type
== COLL_SYM
2576 && strlen ((char *) start_elem
->opr
.name
) > 1)
2577 || (end_elem
->type
== COLL_SYM
2578 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2579 return REG_ECOLLATE
;
2581 # ifdef RE_ENABLE_I18N
2586 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2588 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2589 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2591 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2592 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2594 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2595 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2596 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2597 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2598 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2599 return REG_ECOLLATE
;
2600 cmp_buf
[0] = start_wc
;
2601 cmp_buf
[4] = end_wc
;
2602 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2605 /* Got valid collation sequence values, add them as a new entry.
2606 However, for !_LIBC we have no collation elements: if the
2607 character set is single byte, the single byte character set
2608 that we build below suffices. parse_bracket_exp passes
2609 no MBCSET if dfa->mb_cur_max == 1. */
2612 /* Check the space of the arrays. */
2613 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2615 /* There is not enough space, need realloc. */
2616 wchar_t *new_array_start
, *new_array_end
;
2619 /* +1 in case of mbcset->nranges is 0. */
2620 new_nranges
= 2 * mbcset
->nranges
+ 1;
2621 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2622 are NULL if *range_alloc == 0. */
2623 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2625 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2628 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2631 mbcset
->range_starts
= new_array_start
;
2632 mbcset
->range_ends
= new_array_end
;
2633 *range_alloc
= new_nranges
;
2636 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2637 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2640 /* Build the table for single byte characters. */
2641 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2644 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2645 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2646 bitset_set (sbcset
, wc
);
2649 # else /* not RE_ENABLE_I18N */
2652 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2653 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2655 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2656 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2658 if (start_ch
> end_ch
)
2660 /* Build the table for single byte characters. */
2661 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2662 if (start_ch
<= ch
&& ch
<= end_ch
)
2663 bitset_set (sbcset
, ch
);
2665 # endif /* not RE_ENABLE_I18N */
2668 #endif /* not _LIBC */
2671 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2672 Build the collating element which is represented by NAME.
2673 The result are written to MBCSET and SBCSET.
2674 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2675 pointer argument since we may update it. */
2677 static reg_errcode_t
2679 # ifdef RE_ENABLE_I18N
2680 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2681 int *coll_sym_alloc
, const unsigned char *name
)
2682 # else /* not RE_ENABLE_I18N */
2683 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2684 # endif /* not RE_ENABLE_I18N */
2686 size_t name_len
= strlen ((const char *) name
);
2687 if (BE (name_len
!= 1, 0))
2688 return REG_ECOLLATE
;
2691 bitset_set (sbcset
, name
[0]);
2695 #endif /* not _LIBC */
2697 /* This function parse bracket expression like "[abc]", "[a-c]",
2701 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2702 reg_syntax_t syntax
, reg_errcode_t
*err
)
2705 const unsigned char *collseqmb
;
2706 const char *collseqwc
;
2709 const int32_t *symb_table
;
2710 const unsigned char *extra
;
2712 /* Local function for parse_bracket_exp used in _LIBC environement.
2713 Seek the collating symbol entry correspondings to NAME.
2714 Return the index of the symbol in the SYMB_TABLE. */
2717 __attribute ((always_inline
))
2718 seek_collating_symbol_entry (name
, name_len
)
2719 const unsigned char *name
;
2722 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2723 int32_t elem
= hash
% table_size
;
2724 if (symb_table
[2 * elem
] != 0)
2726 int32_t second
= hash
% (table_size
- 2) + 1;
2730 /* First compare the hashing value. */
2731 if (symb_table
[2 * elem
] == hash
2732 /* Compare the length of the name. */
2733 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2734 /* Compare the name. */
2735 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2738 /* Yep, this is the entry. */
2745 while (symb_table
[2 * elem
] != 0);
2750 /* Local function for parse_bracket_exp used in _LIBC environment.
2751 Look up the collation sequence value of BR_ELEM.
2752 Return the value if succeeded, UINT_MAX otherwise. */
2754 auto inline unsigned int
2755 __attribute ((always_inline
))
2756 lookup_collation_sequence_value (br_elem
)
2757 bracket_elem_t
*br_elem
;
2759 if (br_elem
->type
== SB_CHAR
)
2762 if (MB_CUR_MAX == 1)
2765 return collseqmb
[br_elem
->opr
.ch
];
2768 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2769 return __collseq_table_lookup (collseqwc
, wc
);
2772 else if (br_elem
->type
== MB_CHAR
)
2775 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2777 else if (br_elem
->type
== COLL_SYM
)
2779 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2783 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2785 if (symb_table
[2 * elem
] != 0)
2787 /* We found the entry. */
2788 idx
= symb_table
[2 * elem
+ 1];
2789 /* Skip the name of collating element name. */
2790 idx
+= 1 + extra
[idx
];
2791 /* Skip the byte sequence of the collating element. */
2792 idx
+= 1 + extra
[idx
];
2793 /* Adjust for the alignment. */
2794 idx
= (idx
+ 3) & ~3;
2795 /* Skip the multibyte collation sequence value. */
2796 idx
+= sizeof (unsigned int);
2797 /* Skip the wide char sequence of the collating element. */
2798 idx
+= sizeof (unsigned int) *
2799 (1 + *(unsigned int *) (extra
+ idx
));
2800 /* Return the collation sequence value. */
2801 return *(unsigned int *) (extra
+ idx
);
2803 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2805 /* No valid character. Match it as a single byte
2807 return collseqmb
[br_elem
->opr
.name
[0]];
2810 else if (sym_name_len
== 1)
2811 return collseqmb
[br_elem
->opr
.name
[0]];
2816 /* Local function for parse_bracket_exp used in _LIBC environement.
2817 Build the range expression which starts from START_ELEM, and ends
2818 at END_ELEM. The result are written to MBCSET and SBCSET.
2819 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2820 mbcset->range_ends, is a pointer argument sinse we may
2823 auto inline reg_errcode_t
2824 __attribute ((always_inline
))
2825 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2826 re_charset_t
*mbcset
;
2829 bracket_elem_t
*start_elem
, *end_elem
;
2832 uint32_t start_collseq
;
2833 uint32_t end_collseq
;
2835 /* Equivalence Classes and Character Classes can't be a range
2837 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2838 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2842 start_collseq
= lookup_collation_sequence_value (start_elem
);
2843 end_collseq
= lookup_collation_sequence_value (end_elem
);
2844 /* Check start/end collation sequence values. */
2845 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2846 return REG_ECOLLATE
;
2847 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2850 /* Got valid collation sequence values, add them as a new entry.
2851 However, if we have no collation elements, and the character set
2852 is single byte, the single byte character set that we
2853 build below suffices. */
2854 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2856 /* Check the space of the arrays. */
2857 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2859 /* There is not enough space, need realloc. */
2860 uint32_t *new_array_start
;
2861 uint32_t *new_array_end
;
2864 /* +1 in case of mbcset->nranges is 0. */
2865 new_nranges
= 2 * mbcset
->nranges
+ 1;
2866 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2868 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2871 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2874 mbcset
->range_starts
= new_array_start
;
2875 mbcset
->range_ends
= new_array_end
;
2876 *range_alloc
= new_nranges
;
2879 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2880 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2883 /* Build the table for single byte characters. */
2884 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2886 uint32_t ch_collseq
;
2888 if (MB_CUR_MAX == 1)
2891 ch_collseq
= collseqmb
[ch
];
2893 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2894 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2895 bitset_set (sbcset
, ch
);
2900 /* Local function for parse_bracket_exp used in _LIBC environement.
2901 Build the collating element which is represented by NAME.
2902 The result are written to MBCSET and SBCSET.
2903 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2904 pointer argument sinse we may update it. */
2906 auto inline reg_errcode_t
2907 __attribute ((always_inline
))
2908 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2909 re_charset_t
*mbcset
;
2910 int *coll_sym_alloc
;
2912 const unsigned char *name
;
2915 size_t name_len
= strlen ((const char *) name
);
2918 elem
= seek_collating_symbol_entry (name
, name_len
);
2919 if (symb_table
[2 * elem
] != 0)
2921 /* We found the entry. */
2922 idx
= symb_table
[2 * elem
+ 1];
2923 /* Skip the name of collating element name. */
2924 idx
+= 1 + extra
[idx
];
2926 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2928 /* No valid character, treat it as a normal
2930 bitset_set (sbcset
, name
[0]);
2934 return REG_ECOLLATE
;
2936 /* Got valid collation sequence, add it as a new entry. */
2937 /* Check the space of the arrays. */
2938 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2940 /* Not enough, realloc it. */
2941 /* +1 in case of mbcset->ncoll_syms is 0. */
2942 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2943 /* Use realloc since mbcset->coll_syms is NULL
2945 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2946 new_coll_sym_alloc
);
2947 if (BE (new_coll_syms
== NULL
, 0))
2949 mbcset
->coll_syms
= new_coll_syms
;
2950 *coll_sym_alloc
= new_coll_sym_alloc
;
2952 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2957 if (BE (name_len
!= 1, 0))
2958 return REG_ECOLLATE
;
2961 bitset_set (sbcset
, name
[0]);
2968 re_token_t br_token
;
2969 re_bitset_ptr_t sbcset
;
2970 #ifdef RE_ENABLE_I18N
2971 re_charset_t
*mbcset
;
2972 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2973 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2974 #endif /* not RE_ENABLE_I18N */
2976 bin_tree_t
*work_tree
;
2978 int first_round
= 1;
2980 collseqmb
= (const unsigned char *)
2981 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2982 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2988 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2989 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2990 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2991 _NL_COLLATE_SYMB_TABLEMB
);
2992 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2993 _NL_COLLATE_SYMB_EXTRAMB
);
2996 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
2997 #ifdef RE_ENABLE_I18N
2998 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2999 #endif /* RE_ENABLE_I18N */
3000 #ifdef RE_ENABLE_I18N
3001 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3003 if (BE (sbcset
== NULL
, 0))
3004 #endif /* RE_ENABLE_I18N */
3010 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3011 if (BE (token
->type
== END_OF_RE
, 0))
3014 goto parse_bracket_exp_free_return
;
3016 if (token
->type
== OP_NON_MATCH_LIST
)
3018 #ifdef RE_ENABLE_I18N
3019 mbcset
->non_match
= 1;
3020 #endif /* not RE_ENABLE_I18N */
3022 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3023 bitset_set (sbcset
, '\n');
3024 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3025 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3026 if (BE (token
->type
== END_OF_RE
, 0))
3029 goto parse_bracket_exp_free_return
;
3033 /* We treat the first ']' as a normal character. */
3034 if (token
->type
== OP_CLOSE_BRACKET
)
3035 token
->type
= CHARACTER
;
3039 bracket_elem_t start_elem
, end_elem
;
3040 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3041 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3043 int token_len2
= 0, is_range_exp
= 0;
3046 start_elem
.opr
.name
= start_name_buf
;
3047 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3048 syntax
, first_round
);
3049 if (BE (ret
!= REG_NOERROR
, 0))
3052 goto parse_bracket_exp_free_return
;
3056 /* Get information about the next token. We need it in any case. */
3057 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3059 /* Do not check for ranges if we know they are not allowed. */
3060 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3062 if (BE (token
->type
== END_OF_RE
, 0))
3065 goto parse_bracket_exp_free_return
;
3067 if (token
->type
== OP_CHARSET_RANGE
)
3069 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3070 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3071 if (BE (token2
.type
== END_OF_RE
, 0))
3074 goto parse_bracket_exp_free_return
;
3076 if (token2
.type
== OP_CLOSE_BRACKET
)
3078 /* We treat the last '-' as a normal character. */
3079 re_string_skip_bytes (regexp
, -token_len
);
3080 token
->type
= CHARACTER
;
3087 if (is_range_exp
== 1)
3089 end_elem
.opr
.name
= end_name_buf
;
3090 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3092 if (BE (ret
!= REG_NOERROR
, 0))
3095 goto parse_bracket_exp_free_return
;
3098 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3101 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3102 &start_elem
, &end_elem
);
3104 # ifdef RE_ENABLE_I18N
3105 *err
= build_range_exp (sbcset
,
3106 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3107 &range_alloc
, &start_elem
, &end_elem
);
3109 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3111 #endif /* RE_ENABLE_I18N */
3112 if (BE (*err
!= REG_NOERROR
, 0))
3113 goto parse_bracket_exp_free_return
;
3117 switch (start_elem
.type
)
3120 bitset_set (sbcset
, start_elem
.opr
.ch
);
3122 #ifdef RE_ENABLE_I18N
3124 /* Check whether the array has enough space. */
3125 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3127 wchar_t *new_mbchars
;
3128 /* Not enough, realloc it. */
3129 /* +1 in case of mbcset->nmbchars is 0. */
3130 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3131 /* Use realloc since array is NULL if *alloc == 0. */
3132 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3134 if (BE (new_mbchars
== NULL
, 0))
3135 goto parse_bracket_exp_espace
;
3136 mbcset
->mbchars
= new_mbchars
;
3138 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3140 #endif /* RE_ENABLE_I18N */
3142 *err
= build_equiv_class (sbcset
,
3143 #ifdef RE_ENABLE_I18N
3144 mbcset
, &equiv_class_alloc
,
3145 #endif /* RE_ENABLE_I18N */
3146 start_elem
.opr
.name
);
3147 if (BE (*err
!= REG_NOERROR
, 0))
3148 goto parse_bracket_exp_free_return
;
3151 *err
= build_collating_symbol (sbcset
,
3152 #ifdef RE_ENABLE_I18N
3153 mbcset
, &coll_sym_alloc
,
3154 #endif /* RE_ENABLE_I18N */
3155 start_elem
.opr
.name
);
3156 if (BE (*err
!= REG_NOERROR
, 0))
3157 goto parse_bracket_exp_free_return
;
3160 *err
= build_charclass (regexp
->trans
, sbcset
,
3161 #ifdef RE_ENABLE_I18N
3162 mbcset
, &char_class_alloc
,
3163 #endif /* RE_ENABLE_I18N */
3164 start_elem
.opr
.name
, syntax
);
3165 if (BE (*err
!= REG_NOERROR
, 0))
3166 goto parse_bracket_exp_free_return
;
3173 if (BE (token
->type
== END_OF_RE
, 0))
3176 goto parse_bracket_exp_free_return
;
3178 if (token
->type
== OP_CLOSE_BRACKET
)
3182 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3184 /* If it is non-matching list. */
3186 bitset_not (sbcset
);
3188 #ifdef RE_ENABLE_I18N
3189 /* Ensure only single byte characters are set. */
3190 if (dfa
->mb_cur_max
> 1)
3191 bitset_mask (sbcset
, dfa
->sb_char
);
3193 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3194 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3195 || mbcset
->non_match
)))
3197 bin_tree_t
*mbc_tree
;
3199 /* Build a tree for complex bracket. */
3200 dfa
->has_mb_node
= 1;
3201 br_token
.type
= COMPLEX_BRACKET
;
3202 br_token
.opr
.mbcset
= mbcset
;
3203 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3204 if (BE (mbc_tree
== NULL
, 0))
3205 goto parse_bracket_exp_espace
;
3206 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3207 if (sbcset
[sbc_idx
])
3209 /* If there are no bits set in sbcset, there is no point
3210 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3211 if (sbc_idx
< BITSET_WORDS
)
3213 /* Build a tree for simple bracket. */
3214 br_token
.type
= SIMPLE_BRACKET
;
3215 br_token
.opr
.sbcset
= sbcset
;
3216 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3217 if (BE (work_tree
== NULL
, 0))
3218 goto parse_bracket_exp_espace
;
3220 /* Then join them by ALT node. */
3221 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3222 if (BE (work_tree
== NULL
, 0))
3223 goto parse_bracket_exp_espace
;
3228 work_tree
= mbc_tree
;
3232 #endif /* not RE_ENABLE_I18N */
3234 #ifdef RE_ENABLE_I18N
3235 free_charset (mbcset
);
3237 /* Build a tree for simple bracket. */
3238 br_token
.type
= SIMPLE_BRACKET
;
3239 br_token
.opr
.sbcset
= sbcset
;
3240 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3241 if (BE (work_tree
== NULL
, 0))
3242 goto parse_bracket_exp_espace
;
3246 parse_bracket_exp_espace
:
3248 parse_bracket_exp_free_return
:
3250 #ifdef RE_ENABLE_I18N
3251 free_charset (mbcset
);
3252 #endif /* RE_ENABLE_I18N */
3256 /* Parse an element in the bracket expression. */
3258 static reg_errcode_t
3259 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3260 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3261 reg_syntax_t syntax
, int accept_hyphen
)
3263 #ifdef RE_ENABLE_I18N
3265 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3266 if (cur_char_size
> 1)
3268 elem
->type
= MB_CHAR
;
3269 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3270 re_string_skip_bytes (regexp
, cur_char_size
);
3273 #endif /* RE_ENABLE_I18N */
3274 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3275 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3276 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3277 return parse_bracket_symbol (elem
, regexp
, token
);
3278 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3280 /* A '-' must only appear as anything but a range indicator before
3281 the closing bracket. Everything else is an error. */
3283 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3284 if (token2
.type
!= OP_CLOSE_BRACKET
)
3285 /* The actual error value is not standardized since this whole
3286 case is undefined. But ERANGE makes good sense. */
3289 elem
->type
= SB_CHAR
;
3290 elem
->opr
.ch
= token
->opr
.c
;
3294 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3295 such as [:<character_class>:], [.<collating_element>.], and
3296 [=<equivalent_class>=]. */
3298 static reg_errcode_t
3299 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3302 unsigned char ch
, delim
= token
->opr
.c
;
3304 if (re_string_eoi(regexp
))
3308 if (i
>= BRACKET_NAME_BUF_SIZE
)
3310 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3311 ch
= re_string_fetch_byte_case (regexp
);
3313 ch
= re_string_fetch_byte (regexp
);
3314 if (re_string_eoi(regexp
))
3316 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3318 elem
->opr
.name
[i
] = ch
;
3320 re_string_skip_bytes (regexp
, 1);
3321 elem
->opr
.name
[i
] = '\0';
3322 switch (token
->type
)
3324 case OP_OPEN_COLL_ELEM
:
3325 elem
->type
= COLL_SYM
;
3327 case OP_OPEN_EQUIV_CLASS
:
3328 elem
->type
= EQUIV_CLASS
;
3330 case OP_OPEN_CHAR_CLASS
:
3331 elem
->type
= CHAR_CLASS
;
3339 /* Helper function for parse_bracket_exp.
3340 Build the equivalence class which is represented by NAME.
3341 The result are written to MBCSET and SBCSET.
3342 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3343 is a pointer argument sinse we may update it. */
3345 static reg_errcode_t
3346 #ifdef RE_ENABLE_I18N
3347 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3348 int *equiv_class_alloc
, const unsigned char *name
)
3349 #else /* not RE_ENABLE_I18N */
3350 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3351 #endif /* not RE_ENABLE_I18N */
3354 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3357 const int32_t *table
, *indirect
;
3358 const unsigned char *weights
, *extra
, *cp
;
3359 unsigned char char_buf
[2];
3363 /* This #include defines a local function! */
3364 # include <locale/weight.h>
3365 /* Calculate the index for equivalence class. */
3367 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3368 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3369 _NL_COLLATE_WEIGHTMB
);
3370 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3371 _NL_COLLATE_EXTRAMB
);
3372 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3373 _NL_COLLATE_INDIRECTMB
);
3374 idx1
= findidx (&cp
);
3375 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3376 /* This isn't a valid character. */
3377 return REG_ECOLLATE
;
3379 /* Build single byte matcing table for this equivalence class. */
3380 char_buf
[1] = (unsigned char) '\0';
3381 len
= weights
[idx1
& 0xffffff];
3382 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3386 idx2
= findidx (&cp
);
3391 /* This isn't a valid character. */
3393 /* Compare only if the length matches and the collation rule
3394 index is the same. */
3395 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3399 while (cnt
<= len
&&
3400 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3401 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3405 bitset_set (sbcset
, ch
);
3408 /* Check whether the array has enough space. */
3409 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3411 /* Not enough, realloc it. */
3412 /* +1 in case of mbcset->nequiv_classes is 0. */
3413 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3414 /* Use realloc since the array is NULL if *alloc == 0. */
3415 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3417 new_equiv_class_alloc
);
3418 if (BE (new_equiv_classes
== NULL
, 0))
3420 mbcset
->equiv_classes
= new_equiv_classes
;
3421 *equiv_class_alloc
= new_equiv_class_alloc
;
3423 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3428 if (BE (strlen ((const char *) name
) != 1, 0))
3429 return REG_ECOLLATE
;
3430 bitset_set (sbcset
, *name
);
3435 /* Helper function for parse_bracket_exp.
3436 Build the character class which is represented by NAME.
3437 The result are written to MBCSET and SBCSET.
3438 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3439 is a pointer argument sinse we may update it. */
3441 static reg_errcode_t
3442 #ifdef RE_ENABLE_I18N
3443 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3444 re_charset_t
*mbcset
, int *char_class_alloc
,
3445 const unsigned char *class_name
, reg_syntax_t syntax
)
3446 #else /* not RE_ENABLE_I18N */
3447 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3448 const unsigned char *class_name
, reg_syntax_t syntax
)
3449 #endif /* not RE_ENABLE_I18N */
3452 const char *name
= (const char *) class_name
;
3454 /* In case of REG_ICASE "upper" and "lower" match the both of
3455 upper and lower cases. */
3456 if ((syntax
& RE_ICASE
)
3457 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3460 #ifdef RE_ENABLE_I18N
3461 /* Check the space of the arrays. */
3462 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3464 /* Not enough, realloc it. */
3465 /* +1 in case of mbcset->nchar_classes is 0. */
3466 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3467 /* Use realloc since array is NULL if *alloc == 0. */
3468 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3469 new_char_class_alloc
);
3470 if (BE (new_char_classes
== NULL
, 0))
3472 mbcset
->char_classes
= new_char_classes
;
3473 *char_class_alloc
= new_char_class_alloc
;
3475 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3476 #endif /* RE_ENABLE_I18N */
3478 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3480 if (BE (trans != NULL, 0)) \
3482 for (i = 0; i < SBC_MAX; ++i) \
3483 if (ctype_func (i)) \
3484 bitset_set (sbcset, trans[i]); \
3488 for (i = 0; i < SBC_MAX; ++i) \
3489 if (ctype_func (i)) \
3490 bitset_set (sbcset, i); \
3494 if (strcmp (name
, "alnum") == 0)
3495 BUILD_CHARCLASS_LOOP (isalnum
);
3496 else if (strcmp (name
, "cntrl") == 0)
3497 BUILD_CHARCLASS_LOOP (iscntrl
);
3498 else if (strcmp (name
, "lower") == 0)
3499 BUILD_CHARCLASS_LOOP (islower
);
3500 else if (strcmp (name
, "space") == 0)
3501 BUILD_CHARCLASS_LOOP (isspace
);
3502 else if (strcmp (name
, "alpha") == 0)
3503 BUILD_CHARCLASS_LOOP (isalpha
);
3504 else if (strcmp (name
, "digit") == 0)
3505 BUILD_CHARCLASS_LOOP (isdigit
);
3506 else if (strcmp (name
, "print") == 0)
3507 BUILD_CHARCLASS_LOOP (isprint
);
3508 else if (strcmp (name
, "upper") == 0)
3509 BUILD_CHARCLASS_LOOP (isupper
);
3510 else if (strcmp (name
, "blank") == 0)
3511 BUILD_CHARCLASS_LOOP (isblank
);
3512 else if (strcmp (name
, "graph") == 0)
3513 BUILD_CHARCLASS_LOOP (isgraph
);
3514 else if (strcmp (name
, "punct") == 0)
3515 BUILD_CHARCLASS_LOOP (ispunct
);
3516 else if (strcmp (name
, "xdigit") == 0)
3517 BUILD_CHARCLASS_LOOP (isxdigit
);
3525 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3526 const unsigned char *class_name
,
3527 const unsigned char *extra
, int non_match
,
3530 re_bitset_ptr_t sbcset
;
3531 #ifdef RE_ENABLE_I18N
3532 re_charset_t
*mbcset
;
3534 #endif /* not RE_ENABLE_I18N */
3536 re_token_t br_token
;
3539 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3540 #ifdef RE_ENABLE_I18N
3541 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3542 #endif /* RE_ENABLE_I18N */
3544 #ifdef RE_ENABLE_I18N
3545 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3546 #else /* not RE_ENABLE_I18N */
3547 if (BE (sbcset
== NULL
, 0))
3548 #endif /* not RE_ENABLE_I18N */
3556 #ifdef RE_ENABLE_I18N
3557 mbcset
->non_match
= 1;
3558 #endif /* not RE_ENABLE_I18N */
3561 /* We don't care the syntax in this case. */
3562 ret
= build_charclass (trans
, sbcset
,
3563 #ifdef RE_ENABLE_I18N
3565 #endif /* RE_ENABLE_I18N */
3568 if (BE (ret
!= REG_NOERROR
, 0))
3571 #ifdef RE_ENABLE_I18N
3572 free_charset (mbcset
);
3573 #endif /* RE_ENABLE_I18N */
3577 /* \w match '_' also. */
3578 for (; *extra
; extra
++)
3579 bitset_set (sbcset
, *extra
);
3581 /* If it is non-matching list. */
3583 bitset_not (sbcset
);
3585 #ifdef RE_ENABLE_I18N
3586 /* Ensure only single byte characters are set. */
3587 if (dfa
->mb_cur_max
> 1)
3588 bitset_mask (sbcset
, dfa
->sb_char
);
3591 /* Build a tree for simple bracket. */
3592 br_token
.type
= SIMPLE_BRACKET
;
3593 br_token
.opr
.sbcset
= sbcset
;
3594 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3595 if (BE (tree
== NULL
, 0))
3596 goto build_word_op_espace
;
3598 #ifdef RE_ENABLE_I18N
3599 if (dfa
->mb_cur_max
> 1)
3601 bin_tree_t
*mbc_tree
;
3602 /* Build a tree for complex bracket. */
3603 br_token
.type
= COMPLEX_BRACKET
;
3604 br_token
.opr
.mbcset
= mbcset
;
3605 dfa
->has_mb_node
= 1;
3606 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3607 if (BE (mbc_tree
== NULL
, 0))
3608 goto build_word_op_espace
;
3609 /* Then join them by ALT node. */
3610 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3611 if (BE (mbc_tree
!= NULL
, 1))
3616 free_charset (mbcset
);
3619 #else /* not RE_ENABLE_I18N */
3621 #endif /* not RE_ENABLE_I18N */
3623 build_word_op_espace
:
3625 #ifdef RE_ENABLE_I18N
3626 free_charset (mbcset
);
3627 #endif /* RE_ENABLE_I18N */
3632 /* This is intended for the expressions like "a{1,3}".
3633 Fetch a number from `input', and return the number.
3634 Return -1, if the number field is empty like "{,1}".
3635 Return -2, If an error is occured. */
3638 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3644 fetch_token (token
, input
, syntax
);
3646 if (BE (token
->type
== END_OF_RE
, 0))
3648 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3650 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3651 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3652 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3657 #ifdef RE_ENABLE_I18N
3659 free_charset (re_charset_t
*cset
)
3661 re_free (cset
->mbchars
);
3663 re_free (cset
->coll_syms
);
3664 re_free (cset
->equiv_classes
);
3665 re_free (cset
->range_starts
);
3666 re_free (cset
->range_ends
);
3668 re_free (cset
->char_classes
);
3671 #endif /* RE_ENABLE_I18N */
3673 /* Functions for binary tree operation. */
3675 /* Create a tree node. */
3678 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3679 re_token_type_t type
)
3683 return create_token_tree (dfa
, left
, right
, &t
);
3687 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3688 const re_token_t
*token
)
3691 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3693 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3695 if (storage
== NULL
)
3697 storage
->next
= dfa
->str_tree_storage
;
3698 dfa
->str_tree_storage
= storage
;
3699 dfa
->str_tree_storage_idx
= 0;
3701 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3703 tree
->parent
= NULL
;
3705 tree
->right
= right
;
3706 tree
->token
= *token
;
3707 tree
->token
.duplicated
= 0;
3708 tree
->token
.opt_subexp
= 0;
3711 tree
->node_idx
= -1;
3714 left
->parent
= tree
;
3716 right
->parent
= tree
;
3720 /* Mark the tree SRC as an optional subexpression.
3721 To be called from preorder or postorder. */
3723 static reg_errcode_t
3724 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3726 int idx
= (int) (long) extra
;
3727 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3728 node
->token
.opt_subexp
= 1;
3733 /* Free the allocated memory inside NODE. */
3736 free_token (re_token_t
*node
)
3738 #ifdef RE_ENABLE_I18N
3739 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3740 free_charset (node
->opr
.mbcset
);
3742 #endif /* RE_ENABLE_I18N */
3743 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3744 re_free (node
->opr
.sbcset
);
3747 /* Worker function for tree walking. Free the allocated memory inside NODE
3748 and its children. */
3750 static reg_errcode_t
3751 free_tree (void *extra
, bin_tree_t
*node
)
3753 free_token (&node
->token
);
3758 /* Duplicate the node SRC, and return new node. This is a preorder
3759 visit similar to the one implemented by the generic visitor, but
3760 we need more infrastructure to maintain two parallel trees --- so,
3761 it's easier to duplicate. */
3764 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3766 const bin_tree_t
*node
;
3767 bin_tree_t
*dup_root
;
3768 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3770 for (node
= root
; ; )
3772 /* Create a new tree and link it back to the current parent. */
3773 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3776 (*p_new
)->parent
= dup_node
;
3777 (*p_new
)->token
.duplicated
= 1;
3780 /* Go to the left node, or up and to the right. */
3784 p_new
= &dup_node
->left
;
3788 const bin_tree_t
*prev
= NULL
;
3789 while (node
->right
== prev
|| node
->right
== NULL
)
3792 node
= node
->parent
;
3793 dup_node
= dup_node
->parent
;
3798 p_new
= &dup_node
->right
;