1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
50 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 int *equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 int *char_class_alloc
,
98 const char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const char *class_name
,
112 int non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid
[] attribute_hidden
=
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx
[] attribute_hidden
=
204 /* Entry points for GNU code. */
209 /* For ZOS USS we must define btowc */
220 mbtowc (wtmp
, tmp
, 1);
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
233 re_compile_pattern (pattern
, length
, bufp
)
236 struct re_pattern_buffer
*bufp
;
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
245 /* Match anchors at newline. */
246 bufp
->newline_anchor
= 1;
248 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
252 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
255 weak_alias (__re_compile_pattern
, re_compile_pattern
)
258 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261 /* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263 reg_syntax_t re_syntax_options
;
266 /* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
274 re_set_syntax (syntax
)
277 reg_syntax_t ret
= re_syntax_options
;
279 re_syntax_options
= syntax
;
283 weak_alias (__re_set_syntax
, re_set_syntax
)
287 re_compile_fastmap (bufp
)
288 struct re_pattern_buffer
*bufp
;
290 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
291 char *fastmap
= bufp
->fastmap
;
293 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
294 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
295 if (dfa
->init_state
!= dfa
->init_state_word
)
296 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
297 if (dfa
->init_state
!= dfa
->init_state_nl
)
298 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
299 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
300 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
301 bufp
->fastmap_accurate
= 1;
305 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
309 __attribute ((always_inline
))
310 re_set_fastmap (char *fastmap
, int icase
, int ch
)
314 fastmap
[tolower (ch
)] = 1;
317 /* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
321 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
324 volatile re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
326 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
327 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
329 int node
= init_state
->nodes
.elems
[node_cnt
];
330 re_token_type_t type
= dfa
->nodes
[node
].type
;
332 if (type
== CHARACTER
)
334 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
335 #ifdef RE_ENABLE_I18N
336 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
338 unsigned char *buf
= re_malloc (unsigned char, dfa
->mb_cur_max
), *p
;
343 *p
++ = dfa
->nodes
[node
].opr
.c
;
344 while (++node
< dfa
->nodes_len
345 && dfa
->nodes
[node
].type
== CHARACTER
346 && dfa
->nodes
[node
].mb_partial
)
347 *p
++ = dfa
->nodes
[node
].opr
.c
;
348 memset (&state
, '\0', sizeof (state
));
349 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
351 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
353 re_set_fastmap (fastmap
, 0, buf
[0]);
358 else if (type
== SIMPLE_BRACKET
)
361 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
364 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
365 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
366 if (w
& ((bitset_word_t
) 1 << j
))
367 re_set_fastmap (fastmap
, icase
, ch
);
370 #ifdef RE_ENABLE_I18N
371 else if (type
== COMPLEX_BRACKET
)
373 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
377 /* See if we have to try all bytes which start multiple collation
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
384 && (cset
->ncoll_syms
|| cset
->nranges
))
386 const int32_t *table
= (const int32_t *)
387 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
388 for (i
= 0; i
< SBC_MAX
; ++i
)
390 re_set_fastmap (fastmap
, icase
, i
);
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa
->mb_cur_max
> 1
399 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
401 || cset
->nequiv_classes
409 memset (&mbs
, 0, sizeof (mbs
));
410 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
411 re_set_fastmap (fastmap
, false, (int) c
);
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i
= 0; i
< cset
->nmbchars
; ++i
)
423 memset (&state
, '\0', sizeof (state
));
424 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
425 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
426 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
428 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
430 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
435 #endif /* RE_ENABLE_I18N */
436 else if (type
== OP_PERIOD
437 #ifdef RE_ENABLE_I18N
438 || type
== OP_UTF8_PERIOD
439 #endif /* RE_ENABLE_I18N */
440 || type
== END_OF_RE
)
442 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
443 if (type
== END_OF_RE
)
444 bufp
->can_be_null
= 1;
450 /* Entry point for POSIX code. */
451 /* regcomp takes a regular expression as a string and compiles it.
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
466 PATTERN is the address of the pattern string.
468 CFLAGS is a series of bits which affect compilation.
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
487 regcomp (preg
, pattern
, cflags
)
488 regex_t
*__restrict preg
;
489 const char *__restrict pattern
;
493 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC
);
500 /* Try to allocate space for the fastmap. */
501 preg
->fastmap
= re_malloc (char, SBC_MAX
);
502 if (BE (preg
->fastmap
== NULL
, 0))
505 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags
& REG_NEWLINE
)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax
&= ~RE_DOT_NEWLINE
;
511 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
512 /* It also changes the matching behavior. */
513 preg
->newline_anchor
= 1;
516 preg
->newline_anchor
= 0;
517 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
518 preg
->translate
= NULL
;
520 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret
== REG_ERPAREN
)
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret
== REG_NOERROR
, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg
);
534 /* Some error occurred while compiling the expression. */
535 re_free (preg
->fastmap
);
536 preg
->fastmap
= NULL
;
542 weak_alias (__regcomp
, regcomp
)
545 /* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
549 regerror (errcode
, preg
, errbuf
, errbuf_size
)
551 const regex_t
*__restrict preg
;
552 char *__restrict errbuf
;
559 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
560 / sizeof (__re_error_msgid_idx
[0])), 0))
561 /* Only error codes returned by the rest of the code should be passed
562 to this routine. If we are given anything else, or if other regex
563 code generates an invalid error code, then the program has a bug.
564 Dump core so we can fix it. */
567 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
569 msg_size
= strlen (msg
) + 1; /* Includes the null. */
571 if (BE (errbuf_size
!= 0, 1))
573 if (BE (msg_size
> errbuf_size
, 0))
575 memcpy (errbuf
, msg
, errbuf_size
- 1);
576 errbuf
[errbuf_size
- 1] = 0;
579 memcpy (errbuf
, msg
, msg_size
);
585 weak_alias (__regerror
, regerror
)
589 #ifdef RE_ENABLE_I18N
590 /* This static array is used for the map to single-byte characters when
591 UTF-8 is used. Otherwise we would allocate memory just to initialize
592 it the same all the time. UTF-8 is the preferred encoding so this is
593 a worthwhile optimization. */
595 static const bitset_t utf8_sb_map
= {
596 /* Set the first 128 bits. */
597 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
599 #else /* ! (__GNUC__ >= 3) */
600 static bitset_t utf8_sb_map
;
601 #endif /* __GNUC__ >= 3 */
602 #endif /* RE_ENABLE_I18N */
606 free_dfa_content (re_dfa_t
*dfa
)
611 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
612 free_token (dfa
->nodes
+ i
);
613 re_free (dfa
->nexts
);
614 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
616 if (dfa
->eclosures
!= NULL
)
617 re_node_set_free (dfa
->eclosures
+ i
);
618 if (dfa
->inveclosures
!= NULL
)
619 re_node_set_free (dfa
->inveclosures
+ i
);
620 if (dfa
->edests
!= NULL
)
621 re_node_set_free (dfa
->edests
+ i
);
623 re_free (dfa
->edests
);
624 re_free (dfa
->eclosures
);
625 re_free (dfa
->inveclosures
);
626 re_free (dfa
->nodes
);
628 if (dfa
->state_table
)
629 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
631 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
632 for (j
= 0; j
< entry
->num
; ++j
)
634 re_dfastate_t
*state
= entry
->array
[j
];
637 re_free (entry
->array
);
639 re_free (dfa
->state_table
);
640 #ifdef RE_ENABLE_I18N
641 if (dfa
->sb_char
!= utf8_sb_map
)
642 re_free (dfa
->sb_char
);
644 re_free (dfa
->subexp_map
);
646 re_free (dfa
->re_str
);
653 /* Free dynamically allocated space used by PREG. */
659 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
660 if (BE (dfa
!= NULL
, 1))
661 free_dfa_content (dfa
);
665 re_free (preg
->fastmap
);
666 preg
->fastmap
= NULL
;
668 re_free (preg
->translate
);
669 preg
->translate
= NULL
;
672 weak_alias (__regfree
, regfree
)
675 /* Entry points compatible with 4.2 BSD regex library. We don't define
676 them unless specifically requested. */
678 #if defined _REGEX_RE_COMP || defined _LIBC
680 /* BSD has one and only one pattern buffer. */
681 static struct re_pattern_buffer re_comp_buf
;
685 /* Make these definitions weak in libc, so POSIX programs can redefine
686 these names if they don't use our functions, and still use
687 regcomp/regexec above without link errors. */
698 if (!re_comp_buf
.buffer
)
699 return gettext ("No previous regular expression");
703 if (re_comp_buf
.buffer
)
705 fastmap
= re_comp_buf
.fastmap
;
706 re_comp_buf
.fastmap
= NULL
;
707 __regfree (&re_comp_buf
);
708 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
709 re_comp_buf
.fastmap
= fastmap
;
712 if (re_comp_buf
.fastmap
== NULL
)
714 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
715 if (re_comp_buf
.fastmap
== NULL
)
716 return (char *) gettext (__re_error_msgid
717 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
720 /* Since `re_exec' always passes NULL for the `regs' argument, we
721 don't need to initialize the pattern buffer fields which affect it. */
723 /* Match anchors at newlines. */
724 re_comp_buf
.newline_anchor
= 1;
726 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
731 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
732 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
736 libc_freeres_fn (free_mem
)
738 __regfree (&re_comp_buf
);
742 #endif /* _REGEX_RE_COMP */
744 /* Internal entry point.
745 Compile the regular expression PATTERN, whose length is LENGTH.
746 SYNTAX indicate regular expression's syntax. */
749 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
752 reg_errcode_t err
= REG_NOERROR
;
756 /* Initialize the pattern buffer. */
757 preg
->fastmap_accurate
= 0;
758 preg
->syntax
= syntax
;
759 preg
->not_bol
= preg
->not_eol
= 0;
762 preg
->can_be_null
= 0;
763 preg
->regs_allocated
= REGS_UNALLOCATED
;
765 /* Initialize the dfa. */
766 dfa
= (re_dfa_t
*) preg
->buffer
;
767 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
769 /* If zero allocated, but buffer is non-null, try to realloc
770 enough space. This loses if buffer's address is bogus, but
771 that is the user's responsibility. If ->buffer is NULL this
772 is a simple allocation. */
773 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
776 preg
->allocated
= sizeof (re_dfa_t
);
777 preg
->buffer
= (unsigned char *) dfa
;
779 preg
->used
= sizeof (re_dfa_t
);
781 err
= init_dfa (dfa
, length
);
782 if (BE (err
!= REG_NOERROR
, 0))
784 free_dfa_content (dfa
);
790 /* Note: length+1 will not overflow since it is checked in init_dfa. */
791 dfa
->re_str
= re_malloc (char, length
+ 1);
792 strncpy (dfa
->re_str
, pattern
, length
+ 1);
795 __libc_lock_init (dfa
->lock
);
797 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
798 syntax
& RE_ICASE
, dfa
);
799 if (BE (err
!= REG_NOERROR
, 0))
801 re_compile_internal_free_return
:
802 free_workarea_compile (preg
);
803 re_string_destruct (®exp
);
804 free_dfa_content (dfa
);
810 /* Parse the regular expression, and build a structure tree. */
812 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
813 if (BE (dfa
->str_tree
== NULL
, 0))
814 goto re_compile_internal_free_return
;
816 /* Analyze the tree and create the nfa. */
817 err
= analyze (preg
);
818 if (BE (err
!= REG_NOERROR
, 0))
819 goto re_compile_internal_free_return
;
821 #ifdef RE_ENABLE_I18N
822 /* If possible, do searching in single byte encoding to speed things up. */
823 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
827 /* Then create the initial state of the dfa. */
828 err
= create_initial_state (dfa
);
830 /* Release work areas. */
831 free_workarea_compile (preg
);
832 re_string_destruct (®exp
);
834 if (BE (err
!= REG_NOERROR
, 0))
836 free_dfa_content (dfa
);
844 /* Initialize DFA. We use the length of the regular expression PAT_LEN
845 as the initial length of some arrays. */
848 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
850 unsigned int table_size
;
855 memset (dfa
, '\0', sizeof (re_dfa_t
));
857 /* Force allocation of str_tree_storage the first time. */
858 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
860 /* Avoid overflows. */
861 if (pat_len
== SIZE_MAX
)
864 dfa
->nodes_alloc
= pat_len
+ 1;
865 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
867 /* table_size = 2 ^ ceil(log pat_len) */
868 for (table_size
= 1; ; table_size
<<= 1)
869 if (table_size
> pat_len
)
872 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
873 dfa
->state_hash_mask
= table_size
- 1;
875 dfa
->mb_cur_max
= MB_CUR_MAX
;
877 if (dfa
->mb_cur_max
== 6
878 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
880 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
883 # ifdef HAVE_LANGINFO_CODESET
884 codeset_name
= nl_langinfo (CODESET
);
886 codeset_name
= getenv ("LC_ALL");
887 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
888 codeset_name
= getenv ("LC_CTYPE");
889 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
890 codeset_name
= getenv ("LANG");
891 if (codeset_name
== NULL
)
893 else if (strchr (codeset_name
, '.') != NULL
)
894 codeset_name
= strchr (codeset_name
, '.') + 1;
897 /* strcasecmp isn't a standard interface. brute force check */
899 if (strcasecmp (codeset_name
, "UTF-8") == 0
900 || strcasecmp (codeset_name
, "UTF8") == 0)
903 if ( (codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
904 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
905 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
906 && (codeset_name
[3] == '-'
907 ? codeset_name
[4] == '8' && codeset_name
[5] == '\0'
908 : codeset_name
[3] == '8' && codeset_name
[4] == '\0'))
912 /* We check exhaustively in the loop below if this charset is a
913 superset of ASCII. */
914 dfa
->map_notascii
= 0;
917 #ifdef RE_ENABLE_I18N
918 if (dfa
->mb_cur_max
> 1)
922 #if !defined(__GNUC__) || __GNUC__ < 3
923 static short utf8_sb_map_inited
= 0;
925 if (! utf8_sb_map_inited
)
929 utf8_sb_map_inited
= 0;
930 for (i
= 0; i
<= 0x80 / BITSET_WORD_BITS
- 1; i
++)
931 utf8_sb_map
[i
] = BITSET_WORD_MAX
;
934 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
940 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
941 if (BE (dfa
->sb_char
== NULL
, 0))
944 /* Set the bits corresponding to single byte chars. */
945 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
946 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
948 wint_t wch
= __btowc (ch
);
950 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
952 if (isascii (ch
) && wch
!= ch
)
953 dfa
->map_notascii
= 1;
960 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
965 /* Initialize WORD_CHAR table, which indicate which character is
966 "word". In this case "word" means that it is the word construction
967 character used by some operators like "\<", "\>", etc. */
971 init_word_char (re_dfa_t
*dfa
)
974 dfa
->word_ops_used
= 1;
975 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
976 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
977 if (isalnum (ch
) || ch
== '_')
978 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
981 /* Free the work area which are only used while compiling. */
984 free_workarea_compile (regex_t
*preg
)
986 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
987 bin_tree_storage_t
*storage
, *next
;
988 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
990 next
= storage
->next
;
993 dfa
->str_tree_storage
= NULL
;
994 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
995 dfa
->str_tree
= NULL
;
996 re_free (dfa
->org_indices
);
997 dfa
->org_indices
= NULL
;
1000 /* Create initial states for all contexts. */
1002 static reg_errcode_t
1003 create_initial_state (re_dfa_t
*dfa
)
1007 re_node_set init_nodes
;
1009 /* Initial states have the epsilon closure of the node which is
1010 the first node of the regular expression. */
1011 first
= dfa
->str_tree
->first
->node_idx
;
1012 dfa
->init_node
= first
;
1013 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1014 if (BE (err
!= REG_NOERROR
, 0))
1017 /* The back-references which are in initial states can epsilon transit,
1018 since in this case all of the subexpressions can be null.
1019 Then we add epsilon closures of the nodes which are the next nodes of
1020 the back-references. */
1021 if (dfa
->nbackref
> 0)
1022 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1024 int node_idx
= init_nodes
.elems
[i
];
1025 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1028 if (type
!= OP_BACK_REF
)
1030 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1032 re_token_t
*clexp_node
;
1033 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1034 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1035 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1038 if (clexp_idx
== init_nodes
.nelem
)
1041 if (type
== OP_BACK_REF
)
1043 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1044 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1046 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1049 if (err
!= REG_NOERROR
)
1056 /* It must be the first time to invoke acquire_state. */
1057 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1058 /* We don't check ERR here, since the initial state must not be NULL. */
1059 if (BE (dfa
->init_state
== NULL
, 0))
1061 if (dfa
->init_state
->has_constraint
)
1063 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1065 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1067 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1071 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1072 || dfa
->init_state_begbuf
== NULL
, 0))
1076 dfa
->init_state_word
= dfa
->init_state_nl
1077 = dfa
->init_state_begbuf
= dfa
->init_state
;
1079 re_node_set_free (&init_nodes
);
1083 #ifdef RE_ENABLE_I18N
1084 /* If it is possible to do searching in single byte encoding instead of UTF-8
1085 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086 DFA nodes where needed. */
1089 optimize_utf8 (re_dfa_t
*dfa
)
1091 int node
, i
, mb_chars
= 0, has_period
= 0;
1093 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1094 switch (dfa
->nodes
[node
].type
)
1097 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1101 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1109 /* Word anchors etc. cannot be handled. It's okay to test
1110 opr.ctx_type since constraints (for all DFA nodes) are
1111 created by ORing one or more opr.ctx_type values. */
1121 case OP_DUP_ASTERISK
:
1122 case OP_OPEN_SUBEXP
:
1123 case OP_CLOSE_SUBEXP
:
1125 case COMPLEX_BRACKET
:
1127 case SIMPLE_BRACKET
:
1128 /* Just double check. The non-ASCII range starts at 0x80. */
1129 assert (0x80 % BITSET_WORD_BITS
== 0);
1130 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1131 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1138 if (mb_chars
|| has_period
)
1139 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1141 if (dfa
->nodes
[node
].type
== CHARACTER
1142 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1143 dfa
->nodes
[node
].mb_partial
= 0;
1144 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1145 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1148 /* The search can be in single byte locale. */
1149 dfa
->mb_cur_max
= 1;
1151 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1155 /* Analyze the structure tree, and calculate "first", "next", "edest",
1156 "eclosure", and "inveclosure". */
1158 static reg_errcode_t
1159 analyze (regex_t
*preg
)
1161 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1164 /* Allocate arrays. */
1165 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1166 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1167 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1168 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1169 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1170 || dfa
->eclosures
== NULL
, 0))
1173 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1174 if (dfa
->subexp_map
!= NULL
)
1177 for (i
= 0; i
< preg
->re_nsub
; i
++)
1178 dfa
->subexp_map
[i
] = i
;
1179 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1180 for (i
= 0; i
< preg
->re_nsub
; i
++)
1181 if (dfa
->subexp_map
[i
] != i
)
1183 if (i
== preg
->re_nsub
)
1185 free (dfa
->subexp_map
);
1186 dfa
->subexp_map
= NULL
;
1190 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1191 if (BE (ret
!= REG_NOERROR
, 0))
1193 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1194 if (BE (ret
!= REG_NOERROR
, 0))
1196 preorder (dfa
->str_tree
, calc_next
, dfa
);
1197 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1198 if (BE (ret
!= REG_NOERROR
, 0))
1200 ret
= calc_eclosure (dfa
);
1201 if (BE (ret
!= REG_NOERROR
, 0))
1204 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1205 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1206 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1209 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1210 if (BE (dfa
->inveclosures
== NULL
, 0))
1212 ret
= calc_inveclosure (dfa
);
1218 /* Our parse trees are very unbalanced, so we cannot use a stack to
1219 implement parse tree visits. Instead, we use parent pointers and
1220 some hairy code in these two functions. */
1221 static reg_errcode_t
1222 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1225 bin_tree_t
*node
, *prev
;
1227 for (node
= root
; ; )
1229 /* Descend down the tree, preferably to the left (or to the right
1230 if that's the only child). */
1231 while (node
->left
|| node
->right
)
1239 reg_errcode_t err
= fn (extra
, node
);
1240 if (BE (err
!= REG_NOERROR
, 0))
1242 if (node
->parent
== NULL
)
1245 node
= node
->parent
;
1247 /* Go up while we have a node that is reached from the right. */
1248 while (node
->right
== prev
|| node
->right
== NULL
);
1253 static reg_errcode_t
1254 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1259 for (node
= root
; ; )
1261 reg_errcode_t err
= fn (extra
, node
);
1262 if (BE (err
!= REG_NOERROR
, 0))
1265 /* Go to the left node, or up and to the right. */
1270 bin_tree_t
*prev
= NULL
;
1271 while (node
->right
== prev
|| node
->right
== NULL
)
1274 node
= node
->parent
;
1283 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1284 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1285 backreferences as well. Requires a preorder visit. */
1286 static reg_errcode_t
1287 optimize_subexps (void *extra
, bin_tree_t
*node
)
1289 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1291 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1293 int idx
= node
->token
.opr
.idx
;
1294 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1295 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1298 else if (node
->token
.type
== SUBEXP
1299 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1301 int other_idx
= node
->left
->token
.opr
.idx
;
1303 node
->left
= node
->left
->left
;
1305 node
->left
->parent
= node
;
1307 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1308 if (other_idx
< BITSET_WORD_BITS
)
1309 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1315 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1316 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1317 static reg_errcode_t
1318 lower_subexps (void *extra
, bin_tree_t
*node
)
1320 regex_t
*preg
= (regex_t
*) extra
;
1321 reg_errcode_t err
= REG_NOERROR
;
1323 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1325 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1327 node
->left
->parent
= node
;
1329 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1331 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1333 node
->right
->parent
= node
;
1340 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1342 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1343 bin_tree_t
*body
= node
->left
;
1344 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1347 /* We do not optimize empty subexpressions, because otherwise we may
1348 have bad CONCAT nodes with NULL children. This is obviously not
1349 very common, so we do not lose much. An example that triggers
1350 this case is the sed "script" /\(\)/x. */
1351 && node
->left
!= NULL
1352 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1353 || !(dfa
->used_bkref_map
1354 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1357 /* Convert the SUBEXP node to the concatenation of an
1358 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1359 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1360 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1361 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1362 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1363 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1369 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1370 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1374 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1375 nodes. Requires a postorder visit. */
1376 static reg_errcode_t
1377 calc_first (void *extra
, bin_tree_t
*node
)
1379 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1380 if (node
->token
.type
== CONCAT
)
1382 node
->first
= node
->left
->first
;
1383 node
->node_idx
= node
->left
->node_idx
;
1388 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1389 if (BE (node
->node_idx
== -1, 0))
1391 if (node
->token
.type
== ANCHOR
)
1392 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1397 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1398 static reg_errcode_t
1399 calc_next (void *extra
, bin_tree_t
*node
)
1401 switch (node
->token
.type
)
1403 case OP_DUP_ASTERISK
:
1404 node
->left
->next
= node
;
1407 node
->left
->next
= node
->right
->first
;
1408 node
->right
->next
= node
->next
;
1412 node
->left
->next
= node
->next
;
1414 node
->right
->next
= node
->next
;
1420 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1421 static reg_errcode_t
1422 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1424 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1425 int idx
= node
->node_idx
;
1426 reg_errcode_t err
= REG_NOERROR
;
1428 switch (node
->token
.type
)
1434 assert (node
->next
== NULL
);
1437 case OP_DUP_ASTERISK
:
1441 dfa
->has_plural_match
= 1;
1442 if (node
->left
!= NULL
)
1443 left
= node
->left
->first
->node_idx
;
1445 left
= node
->next
->node_idx
;
1446 if (node
->right
!= NULL
)
1447 right
= node
->right
->first
->node_idx
;
1449 right
= node
->next
->node_idx
;
1451 assert (right
> -1);
1452 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1457 case OP_OPEN_SUBEXP
:
1458 case OP_CLOSE_SUBEXP
:
1459 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1463 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1464 if (node
->token
.type
== OP_BACK_REF
)
1465 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1469 assert (!IS_EPSILON_NODE (node
->token
.type
));
1470 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1477 /* Duplicate the epsilon closure of the node ROOT_NODE.
1478 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1479 to their own constraint. */
1481 static reg_errcode_t
1483 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1484 int root_node
, unsigned int init_constraint
)
1486 int org_node
, clone_node
, ret
;
1487 unsigned int constraint
= init_constraint
;
1488 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1490 int org_dest
, clone_dest
;
1491 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1493 /* If the back reference epsilon-transit, its destination must
1494 also have the constraint. Then duplicate the epsilon closure
1495 of the destination of the back reference, and store it in
1496 edests of the back reference. */
1497 org_dest
= dfa
->nexts
[org_node
];
1498 re_node_set_empty (dfa
->edests
+ clone_node
);
1499 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1500 if (BE (clone_dest
== -1, 0))
1502 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1503 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1504 if (BE (ret
< 0, 0))
1507 else if (dfa
->edests
[org_node
].nelem
== 0)
1509 /* In case of the node can't epsilon-transit, don't duplicate the
1510 destination and store the original destination as the
1511 destination of the node. */
1512 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1515 else if (dfa
->edests
[org_node
].nelem
== 1)
1517 /* In case of the node can epsilon-transit, and it has only one
1519 org_dest
= dfa
->edests
[org_node
].elems
[0];
1520 re_node_set_empty (dfa
->edests
+ clone_node
);
1521 /* If the node is root_node itself, it means the epsilon clsoure
1522 has a loop. Then tie it to the destination of the root_node. */
1523 if (org_node
== root_node
&& clone_node
!= org_node
)
1525 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1526 if (BE (ret
< 0, 0))
1530 /* In case of the node has another constraint, add it. */
1531 constraint
|= dfa
->nodes
[org_node
].constraint
;
1532 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1533 if (BE (clone_dest
== -1, 0))
1535 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1536 if (BE (ret
< 0, 0))
1539 else /* dfa->edests[org_node].nelem == 2 */
1541 /* In case of the node can epsilon-transit, and it has two
1542 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1543 org_dest
= dfa
->edests
[org_node
].elems
[0];
1544 re_node_set_empty (dfa
->edests
+ clone_node
);
1545 /* Search for a duplicated node which satisfies the constraint. */
1546 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1547 if (clone_dest
== -1)
1549 /* There is no such duplicated node, create a new one. */
1551 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1552 if (BE (clone_dest
== -1, 0))
1554 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1555 if (BE (ret
< 0, 0))
1557 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1558 root_node
, constraint
);
1559 if (BE (err
!= REG_NOERROR
, 0))
1564 /* There is a duplicated node which satisfies the constraint,
1565 use it to avoid infinite loop. */
1566 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1567 if (BE (ret
< 0, 0))
1571 org_dest
= dfa
->edests
[org_node
].elems
[1];
1572 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1573 if (BE (clone_dest
== -1, 0))
1575 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1576 if (BE (ret
< 0, 0))
1579 org_node
= org_dest
;
1580 clone_node
= clone_dest
;
1585 /* Search for a node which is duplicated from the node ORG_NODE, and
1586 satisfies the constraint CONSTRAINT. */
1589 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1590 unsigned int constraint
)
1593 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1595 if (org_node
== dfa
->org_indices
[idx
]
1596 && constraint
== dfa
->nodes
[idx
].constraint
)
1597 return idx
; /* Found. */
1599 return -1; /* Not found. */
1602 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603 Return the index of the new node, or -1 if insufficient storage is
1607 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1609 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1610 if (BE (dup_idx
!= -1, 1))
1612 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1613 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1614 dfa
->nodes
[dup_idx
].duplicated
= 1;
1616 /* Store the index of the original node. */
1617 dfa
->org_indices
[dup_idx
] = org_idx
;
1622 static reg_errcode_t
1623 calc_inveclosure (re_dfa_t
*dfa
)
1626 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1627 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1629 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1631 int *elems
= dfa
->eclosures
[src
].elems
;
1632 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1634 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1635 if (BE (ret
== -1, 0))
1643 /* Calculate "eclosure" for all the node in DFA. */
1645 static reg_errcode_t
1646 calc_eclosure (re_dfa_t
*dfa
)
1648 int node_idx
, incomplete
;
1650 assert (dfa
->nodes_len
> 0);
1653 /* For each nodes, calculate epsilon closure. */
1654 for (node_idx
= 0; ; ++node_idx
)
1657 re_node_set eclosure_elem
;
1658 if (node_idx
== dfa
->nodes_len
)
1667 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1670 /* If we have already calculated, skip it. */
1671 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1673 /* Calculate epsilon closure of `node_idx'. */
1674 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1675 if (BE (err
!= REG_NOERROR
, 0))
1678 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1681 re_node_set_free (&eclosure_elem
);
1687 /* Calculate epsilon closure of NODE. */
1689 static reg_errcode_t
1690 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1694 re_node_set eclosure
;
1697 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1698 if (BE (err
!= REG_NOERROR
, 0))
1701 /* This indicates that we are calculating this node now.
1702 We reference this value to avoid infinite loop. */
1703 dfa
->eclosures
[node
].nelem
= -1;
1705 /* If the current node has constraints, duplicate all nodes
1706 since they must inherit the constraints. */
1707 if (dfa
->nodes
[node
].constraint
1708 && dfa
->edests
[node
].nelem
1709 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1711 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1712 dfa
->nodes
[node
].constraint
);
1713 if (BE (err
!= REG_NOERROR
, 0))
1717 /* Expand each epsilon destination nodes. */
1718 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1719 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1721 re_node_set eclosure_elem
;
1722 int edest
= dfa
->edests
[node
].elems
[i
];
1723 /* If calculating the epsilon closure of `edest' is in progress,
1724 return intermediate result. */
1725 if (dfa
->eclosures
[edest
].nelem
== -1)
1730 /* If we haven't calculated the epsilon closure of `edest' yet,
1731 calculate now. Otherwise use calculated epsilon closure. */
1732 if (dfa
->eclosures
[edest
].nelem
== 0)
1734 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1735 if (BE (err
!= REG_NOERROR
, 0))
1739 eclosure_elem
= dfa
->eclosures
[edest
];
1740 /* Merge the epsilon closure of `edest'. */
1741 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1742 if (BE (err
!= REG_NOERROR
, 0))
1744 /* If the epsilon closure of `edest' is incomplete,
1745 the epsilon closure of this node is also incomplete. */
1746 if (dfa
->eclosures
[edest
].nelem
== 0)
1749 re_node_set_free (&eclosure_elem
);
1753 /* An epsilon closure includes itself. */
1754 ret
= re_node_set_insert (&eclosure
, node
);
1755 if (BE (ret
< 0, 0))
1757 if (incomplete
&& !root
)
1758 dfa
->eclosures
[node
].nelem
= 0;
1760 dfa
->eclosures
[node
] = eclosure
;
1761 *new_set
= eclosure
;
1765 /* Functions for token which are used in the parser. */
1767 /* Fetch a token from INPUT.
1768 We must not use this function inside bracket expressions. */
1772 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1774 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1777 /* Peek a token from INPUT, and return the length of the token.
1778 We must not use this function inside bracket expressions. */
1782 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1786 if (re_string_eoi (input
))
1788 token
->type
= END_OF_RE
;
1792 c
= re_string_peek_byte (input
, 0);
1795 token
->word_char
= 0;
1796 #ifdef RE_ENABLE_I18N
1797 token
->mb_partial
= 0;
1798 if (input
->mb_cur_max
> 1 &&
1799 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1801 token
->type
= CHARACTER
;
1802 token
->mb_partial
= 1;
1809 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1811 token
->type
= BACK_SLASH
;
1815 c2
= re_string_peek_byte_case (input
, 1);
1817 token
->type
= CHARACTER
;
1818 #ifdef RE_ENABLE_I18N
1819 if (input
->mb_cur_max
> 1)
1821 wint_t wc
= re_string_wchar_at (input
,
1822 re_string_cur_idx (input
) + 1);
1823 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1827 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1832 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1833 token
->type
= OP_ALT
;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax
& RE_NO_BK_REFS
))
1839 token
->type
= OP_BACK_REF
;
1840 token
->opr
.idx
= c2
- '1';
1844 if (!(syntax
& RE_NO_GNU_OPS
))
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= WORD_FIRST
;
1851 if (!(syntax
& RE_NO_GNU_OPS
))
1853 token
->type
= ANCHOR
;
1854 token
->opr
.ctx_type
= WORD_LAST
;
1858 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= ANCHOR
;
1861 token
->opr
.ctx_type
= WORD_DELIM
;
1865 if (!(syntax
& RE_NO_GNU_OPS
))
1867 token
->type
= ANCHOR
;
1868 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= OP_WORD
;
1876 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= OP_NOTWORD
;
1880 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= OP_SPACE
;
1884 if (!(syntax
& RE_NO_GNU_OPS
))
1885 token
->type
= OP_NOTSPACE
;
1888 if (!(syntax
& RE_NO_GNU_OPS
))
1890 token
->type
= ANCHOR
;
1891 token
->opr
.ctx_type
= BUF_FIRST
;
1895 if (!(syntax
& RE_NO_GNU_OPS
))
1897 token
->type
= ANCHOR
;
1898 token
->opr
.ctx_type
= BUF_LAST
;
1902 if (!(syntax
& RE_NO_BK_PARENS
))
1903 token
->type
= OP_OPEN_SUBEXP
;
1906 if (!(syntax
& RE_NO_BK_PARENS
))
1907 token
->type
= OP_CLOSE_SUBEXP
;
1910 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1911 token
->type
= OP_DUP_PLUS
;
1914 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1915 token
->type
= OP_DUP_QUESTION
;
1918 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1919 token
->type
= OP_OPEN_DUP_NUM
;
1922 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1923 token
->type
= OP_CLOSE_DUP_NUM
;
1931 token
->type
= CHARACTER
;
1932 #ifdef RE_ENABLE_I18N
1933 if (input
->mb_cur_max
> 1)
1935 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1936 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1940 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1945 if (syntax
& RE_NEWLINE_ALT
)
1946 token
->type
= OP_ALT
;
1949 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1950 token
->type
= OP_ALT
;
1953 token
->type
= OP_DUP_ASTERISK
;
1956 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1957 token
->type
= OP_DUP_PLUS
;
1960 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1961 token
->type
= OP_DUP_QUESTION
;
1964 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1965 token
->type
= OP_OPEN_DUP_NUM
;
1968 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1969 token
->type
= OP_CLOSE_DUP_NUM
;
1972 if (syntax
& RE_NO_BK_PARENS
)
1973 token
->type
= OP_OPEN_SUBEXP
;
1976 if (syntax
& RE_NO_BK_PARENS
)
1977 token
->type
= OP_CLOSE_SUBEXP
;
1980 token
->type
= OP_OPEN_BRACKET
;
1983 token
->type
= OP_PERIOD
;
1986 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1987 re_string_cur_idx (input
) != 0)
1989 char prev
= re_string_peek_byte (input
, -1);
1990 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1993 token
->type
= ANCHOR
;
1994 token
->opr
.ctx_type
= LINE_FIRST
;
1997 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1998 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2001 re_string_skip_bytes (input
, 1);
2002 peek_token (&next
, input
, syntax
);
2003 re_string_skip_bytes (input
, -1);
2004 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2007 token
->type
= ANCHOR
;
2008 token
->opr
.ctx_type
= LINE_LAST
;
2016 /* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2021 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2024 if (re_string_eoi (input
))
2026 token
->type
= END_OF_RE
;
2029 c
= re_string_peek_byte (input
, 0);
2032 #ifdef RE_ENABLE_I18N
2033 if (input
->mb_cur_max
> 1 &&
2034 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2036 token
->type
= CHARACTER
;
2039 #endif /* RE_ENABLE_I18N */
2041 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2042 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2044 /* In this case, '\' escape a character. */
2046 re_string_skip_bytes (input
, 1);
2047 c2
= re_string_peek_byte (input
, 0);
2049 token
->type
= CHARACTER
;
2052 if (c
== '[') /* '[' is a special char in a bracket exps. */
2056 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2057 c2
= re_string_peek_byte (input
, 1);
2065 token
->type
= OP_OPEN_COLL_ELEM
;
2068 token
->type
= OP_OPEN_EQUIV_CLASS
;
2071 if (syntax
& RE_CHAR_CLASSES
)
2073 token
->type
= OP_OPEN_CHAR_CLASS
;
2076 /* else fall through. */
2078 token
->type
= CHARACTER
;
2088 token
->type
= OP_CHARSET_RANGE
;
2091 token
->type
= OP_CLOSE_BRACKET
;
2094 token
->type
= OP_NON_MATCH_LIST
;
2097 token
->type
= CHARACTER
;
2102 /* Functions for parser. */
2104 /* Entry point of the parser.
2105 Parse the regular expression REGEXP and return the structure tree.
2106 If an error is occured, ERR is set by error code, and return NULL.
2107 This function build the following tree, from regular expression <reg_exp>:
2113 CAT means concatenation.
2114 EOR means end of regular expression. */
2117 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2120 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2121 bin_tree_t
*tree
, *eor
, *root
;
2122 re_token_t current_token
;
2123 dfa
->syntax
= syntax
;
2124 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2125 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2126 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2128 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2130 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2133 if (BE (eor
== NULL
|| root
== NULL
, 0))
2141 /* This function build the following tree, from regular expression
2142 <branch1>|<branch2>:
2148 ALT means alternative, which represents the operator `|'. */
2151 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2152 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2154 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2155 bin_tree_t
*tree
, *branch
= NULL
;
2156 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2157 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2160 while (token
->type
== OP_ALT
)
2162 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2163 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2164 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2166 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2167 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2172 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2173 if (BE (tree
== NULL
, 0))
2182 /* This function build the following tree, from regular expression
2189 CAT means concatenation. */
2192 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2193 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2195 bin_tree_t
*tree
, *exp
;
2196 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2197 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2198 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2201 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2202 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2204 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2205 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2209 if (tree
!= NULL
&& exp
!= NULL
)
2211 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2218 else if (tree
== NULL
)
2220 /* Otherwise exp == NULL, we don't need to create new tree. */
2225 /* This function build the following tree, from regular expression a*:
2232 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2233 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2235 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2237 switch (token
->type
)
2240 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2241 if (BE (tree
== NULL
, 0))
2246 #ifdef RE_ENABLE_I18N
2247 if (dfa
->mb_cur_max
> 1)
2249 while (!re_string_eoi (regexp
)
2250 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2252 bin_tree_t
*mbc_remain
;
2253 fetch_token (token
, regexp
, syntax
);
2254 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2255 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2256 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2265 case OP_OPEN_SUBEXP
:
2266 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2267 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2270 case OP_OPEN_BRACKET
:
2271 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2272 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2276 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2281 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2282 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2283 if (BE (tree
== NULL
, 0))
2289 dfa
->has_mb_node
= 1;
2291 case OP_OPEN_DUP_NUM
:
2292 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2298 case OP_DUP_ASTERISK
:
2300 case OP_DUP_QUESTION
:
2301 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2306 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2308 fetch_token (token
, regexp
, syntax
);
2309 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2311 /* else fall through */
2312 case OP_CLOSE_SUBEXP
:
2313 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2314 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2319 /* else fall through */
2320 case OP_CLOSE_DUP_NUM
:
2321 /* We treat it as a normal character. */
2323 /* Then we can these characters as normal characters. */
2324 token
->type
= CHARACTER
;
2325 /* mb_partial and word_char bits should be initialized already
2327 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2328 if (BE (tree
== NULL
, 0))
2335 if ((token
->opr
.ctx_type
2336 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2337 && dfa
->word_ops_used
== 0)
2338 init_word_char (dfa
);
2339 if (token
->opr
.ctx_type
== WORD_DELIM
2340 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2342 bin_tree_t
*tree_first
, *tree_last
;
2343 if (token
->opr
.ctx_type
== WORD_DELIM
)
2345 token
->opr
.ctx_type
= WORD_FIRST
;
2346 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2347 token
->opr
.ctx_type
= WORD_LAST
;
2351 token
->opr
.ctx_type
= INSIDE_WORD
;
2352 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2353 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2355 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2356 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2357 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2365 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2366 if (BE (tree
== NULL
, 0))
2372 /* We must return here, since ANCHORs can't be followed
2373 by repetition operators.
2374 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2375 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2376 fetch_token (token
, regexp
, syntax
);
2379 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2380 if (BE (tree
== NULL
, 0))
2385 if (dfa
->mb_cur_max
> 1)
2386 dfa
->has_mb_node
= 1;
2390 tree
= build_charclass_op (dfa
, regexp
->trans
,
2393 token
->type
== OP_NOTWORD
, err
);
2394 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2399 tree
= build_charclass_op (dfa
, regexp
->trans
,
2402 token
->type
== OP_NOTSPACE
, err
);
2403 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2413 /* Must not happen? */
2419 fetch_token (token
, regexp
, syntax
);
2421 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2422 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2424 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2425 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2427 /* In BRE consecutive duplications are not allowed. */
2428 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2429 && (token
->type
== OP_DUP_ASTERISK
2430 || token
->type
== OP_OPEN_DUP_NUM
))
2440 /* This function build the following tree, from regular expression
2448 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2449 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2451 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2454 cur_nsub
= preg
->re_nsub
++;
2456 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2458 /* The subexpression may be a null string. */
2459 if (token
->type
== OP_CLOSE_SUBEXP
)
2463 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2464 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2466 if (BE (*err
!= REG_NOERROR
, 0))
2470 if (cur_nsub
<= '9' - '1')
2471 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2473 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2474 if (BE (tree
== NULL
, 0))
2479 tree
->token
.opr
.idx
= cur_nsub
;
2483 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2486 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2487 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2489 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2490 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2491 #ifndef RE_TOKEN_INIT_BUG
2492 re_token_t start_token
= *token
;
2494 re_token_t start_token
;
2496 memcpy ((void *) &start_token
, (void *) token
, sizeof start_token
);
2499 if (token
->type
== OP_OPEN_DUP_NUM
)
2502 start
= fetch_number (regexp
, token
, syntax
);
2505 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2506 start
= 0; /* We treat "{,m}" as "{0,m}". */
2509 *err
= REG_BADBR
; /* <re>{} is invalid. */
2513 if (BE (start
!= -2, 1))
2515 /* We treat "{n}" as "{n,n}". */
2516 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2517 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2518 ? fetch_number (regexp
, token
, syntax
) : -2));
2520 if (BE (start
== -2 || end
== -2, 0))
2522 /* Invalid sequence. */
2523 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2525 if (token
->type
== END_OF_RE
)
2533 /* If the syntax bit is set, rollback. */
2534 re_string_set_index (regexp
, start_idx
);
2535 *token
= start_token
;
2536 token
->type
= CHARACTER
;
2537 /* mb_partial and word_char bits should be already initialized by
2542 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2544 /* First number greater than second. */
2551 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2552 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2555 fetch_token (token
, regexp
, syntax
);
2557 if (BE (elem
== NULL
, 0))
2559 if (BE (start
== 0 && end
== 0, 0))
2561 postorder (elem
, free_tree
, NULL
);
2565 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2566 if (BE (start
> 0, 0))
2569 for (i
= 2; i
<= start
; ++i
)
2571 elem
= duplicate_tree (elem
, dfa
);
2572 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2573 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2574 goto parse_dup_op_espace
;
2580 /* Duplicate ELEM before it is marked optional. */
2581 elem
= duplicate_tree (elem
, dfa
);
2587 if (elem
->token
.type
== SUBEXP
)
2588 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2590 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2591 if (BE (tree
== NULL
, 0))
2592 goto parse_dup_op_espace
;
2594 /* This loop is actually executed only when end != -1,
2595 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2596 already created the start+1-th copy. */
2597 for (i
= start
+ 2; i
<= end
; ++i
)
2599 elem
= duplicate_tree (elem
, dfa
);
2600 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2601 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2602 goto parse_dup_op_espace
;
2604 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2605 if (BE (tree
== NULL
, 0))
2606 goto parse_dup_op_espace
;
2610 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2614 parse_dup_op_espace
:
2619 /* Size of the names for collating symbol/equivalence_class/character_class.
2620 I'm not sure, but maybe enough. */
2621 #define BRACKET_NAME_BUF_SIZE 32
2624 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2625 Build the range expression which starts from START_ELEM, and ends
2626 at END_ELEM. The result are written to MBCSET and SBCSET.
2627 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2628 mbcset->range_ends, is a pointer argument sinse we may
2631 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2635 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2636 # else /* not RE_ENABLE_I18N */
2637 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2638 bracket_elem_t
*end_elem
)
2639 # endif /* not RE_ENABLE_I18N */
2641 unsigned int start_ch
, end_ch
;
2642 /* Equivalence Classes and Character Classes can't be a range start/end. */
2643 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2644 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2648 /* We can handle no multi character collating elements without libc
2650 if (BE ((start_elem
->type
== COLL_SYM
2651 && strlen ((char *) start_elem
->opr
.name
) > 1)
2652 || (end_elem
->type
== COLL_SYM
2653 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2654 return REG_ECOLLATE
;
2656 # ifdef RE_ENABLE_I18N
2661 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2663 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2664 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2666 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2667 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2671 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2672 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2673 * unsigned, so we don't have sign extension problems.
2675 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2676 ? start_ch
: start_elem
->opr
.wch
);
2677 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2678 ? end_ch
: end_elem
->opr
.wch
);
2680 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2681 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2682 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2683 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2685 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2686 return REG_ECOLLATE
;
2687 cmp_buf
[0] = start_wc
;
2688 cmp_buf
[4] = end_wc
;
2689 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2692 /* Got valid collation sequence values, add them as a new entry.
2693 However, for !_LIBC we have no collation elements: if the
2694 character set is single byte, the single byte character set
2695 that we build below suffices. parse_bracket_exp passes
2696 no MBCSET if dfa->mb_cur_max == 1. */
2699 /* Check the space of the arrays. */
2700 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2702 /* There is not enough space, need realloc. */
2703 wchar_t *new_array_start
, *new_array_end
;
2706 /* +1 in case of mbcset->nranges is 0. */
2707 new_nranges
= 2 * mbcset
->nranges
+ 1;
2708 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2709 are NULL if *range_alloc == 0. */
2710 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2712 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2715 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2718 mbcset
->range_starts
= new_array_start
;
2719 mbcset
->range_ends
= new_array_end
;
2720 *range_alloc
= new_nranges
;
2723 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2724 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2727 /* Build the table for single byte characters. */
2728 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2731 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2732 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2733 bitset_set (sbcset
, wc
);
2736 # else /* not RE_ENABLE_I18N */
2739 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2740 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2742 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2743 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2745 if (start_ch
> end_ch
)
2747 /* Build the table for single byte characters. */
2748 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2749 if (start_ch
<= ch
&& ch
<= end_ch
)
2750 bitset_set (sbcset
, ch
);
2752 # endif /* not RE_ENABLE_I18N */
2755 #endif /* not _LIBC */
2758 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2759 Build the collating element which is represented by NAME.
2760 The result are written to MBCSET and SBCSET.
2761 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2762 pointer argument since we may update it. */
2764 static reg_errcode_t
2766 # ifdef RE_ENABLE_I18N
2767 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2768 int *coll_sym_alloc
, const unsigned char *name
)
2769 # else /* not RE_ENABLE_I18N */
2770 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2771 # endif /* not RE_ENABLE_I18N */
2773 size_t name_len
= strlen ((const char *) name
);
2774 if (BE (name_len
!= 1, 0))
2775 return REG_ECOLLATE
;
2778 bitset_set (sbcset
, name
[0]);
2782 #endif /* not _LIBC */
2784 /* This function parse bracket expression like "[abc]", "[a-c]",
2788 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2789 reg_syntax_t syntax
, reg_errcode_t
*err
)
2792 const unsigned char *collseqmb
;
2793 const char *collseqwc
;
2796 const int32_t *symb_table
;
2797 const unsigned char *extra
;
2799 /* Local function for parse_bracket_exp used in _LIBC environement.
2800 Seek the collating symbol entry correspondings to NAME.
2801 Return the index of the symbol in the SYMB_TABLE. */
2804 __attribute ((always_inline
))
2805 seek_collating_symbol_entry (name
, name_len
)
2806 const unsigned char *name
;
2809 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2810 int32_t elem
= hash
% table_size
;
2811 if (symb_table
[2 * elem
] != 0)
2813 int32_t second
= hash
% (table_size
- 2) + 1;
2817 /* First compare the hashing value. */
2818 if (symb_table
[2 * elem
] == hash
2819 /* Compare the length of the name. */
2820 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2821 /* Compare the name. */
2822 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2825 /* Yep, this is the entry. */
2832 while (symb_table
[2 * elem
] != 0);
2837 /* Local function for parse_bracket_exp used in _LIBC environment.
2838 Look up the collation sequence value of BR_ELEM.
2839 Return the value if succeeded, UINT_MAX otherwise. */
2841 auto inline unsigned int
2842 __attribute ((always_inline
))
2843 lookup_collation_sequence_value (br_elem
)
2844 bracket_elem_t
*br_elem
;
2846 if (br_elem
->type
== SB_CHAR
)
2849 if (MB_CUR_MAX == 1)
2852 return collseqmb
[br_elem
->opr
.ch
];
2855 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2856 return __collseq_table_lookup (collseqwc
, wc
);
2859 else if (br_elem
->type
== MB_CHAR
)
2862 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2864 else if (br_elem
->type
== COLL_SYM
)
2866 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2870 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2872 if (symb_table
[2 * elem
] != 0)
2874 /* We found the entry. */
2875 idx
= symb_table
[2 * elem
+ 1];
2876 /* Skip the name of collating element name. */
2877 idx
+= 1 + extra
[idx
];
2878 /* Skip the byte sequence of the collating element. */
2879 idx
+= 1 + extra
[idx
];
2880 /* Adjust for the alignment. */
2881 idx
= (idx
+ 3) & ~3;
2882 /* Skip the multibyte collation sequence value. */
2883 idx
+= sizeof (unsigned int);
2884 /* Skip the wide char sequence of the collating element. */
2885 idx
+= sizeof (unsigned int) *
2886 (1 + *(unsigned int *) (extra
+ idx
));
2887 /* Return the collation sequence value. */
2888 return *(unsigned int *) (extra
+ idx
);
2890 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2892 /* No valid character. Match it as a single byte
2894 return collseqmb
[br_elem
->opr
.name
[0]];
2897 else if (sym_name_len
== 1)
2898 return collseqmb
[br_elem
->opr
.name
[0]];
2903 /* Local function for parse_bracket_exp used in _LIBC environement.
2904 Build the range expression which starts from START_ELEM, and ends
2905 at END_ELEM. The result are written to MBCSET and SBCSET.
2906 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2907 mbcset->range_ends, is a pointer argument sinse we may
2910 auto inline reg_errcode_t
2911 __attribute ((always_inline
))
2912 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2913 re_charset_t
*mbcset
;
2916 bracket_elem_t
*start_elem
, *end_elem
;
2919 uint32_t start_collseq
;
2920 uint32_t end_collseq
;
2922 /* Equivalence Classes and Character Classes can't be a range
2924 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2925 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2929 start_collseq
= lookup_collation_sequence_value (start_elem
);
2930 end_collseq
= lookup_collation_sequence_value (end_elem
);
2931 /* Check start/end collation sequence values. */
2932 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2933 return REG_ECOLLATE
;
2934 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2937 /* Got valid collation sequence values, add them as a new entry.
2938 However, if we have no collation elements, and the character set
2939 is single byte, the single byte character set that we
2940 build below suffices. */
2941 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2943 /* Check the space of the arrays. */
2944 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2946 /* There is not enough space, need realloc. */
2947 uint32_t *new_array_start
;
2948 uint32_t *new_array_end
;
2951 /* +1 in case of mbcset->nranges is 0. */
2952 new_nranges
= 2 * mbcset
->nranges
+ 1;
2953 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2955 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2958 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2961 mbcset
->range_starts
= new_array_start
;
2962 mbcset
->range_ends
= new_array_end
;
2963 *range_alloc
= new_nranges
;
2966 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2967 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2970 /* Build the table for single byte characters. */
2971 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2973 uint32_t ch_collseq
;
2975 if (MB_CUR_MAX == 1)
2978 ch_collseq
= collseqmb
[ch
];
2980 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2981 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2982 bitset_set (sbcset
, ch
);
2987 /* Local function for parse_bracket_exp used in _LIBC environement.
2988 Build the collating element which is represented by NAME.
2989 The result are written to MBCSET and SBCSET.
2990 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2991 pointer argument sinse we may update it. */
2993 auto inline reg_errcode_t
2994 __attribute ((always_inline
))
2995 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2996 re_charset_t
*mbcset
;
2997 int *coll_sym_alloc
;
2999 const unsigned char *name
;
3002 size_t name_len
= strlen ((const char *) name
);
3005 elem
= seek_collating_symbol_entry (name
, name_len
);
3006 if (symb_table
[2 * elem
] != 0)
3008 /* We found the entry. */
3009 idx
= symb_table
[2 * elem
+ 1];
3010 /* Skip the name of collating element name. */
3011 idx
+= 1 + extra
[idx
];
3013 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3015 /* No valid character, treat it as a normal
3017 bitset_set (sbcset
, name
[0]);
3021 return REG_ECOLLATE
;
3023 /* Got valid collation sequence, add it as a new entry. */
3024 /* Check the space of the arrays. */
3025 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3027 /* Not enough, realloc it. */
3028 /* +1 in case of mbcset->ncoll_syms is 0. */
3029 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3030 /* Use realloc since mbcset->coll_syms is NULL
3032 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3033 new_coll_sym_alloc
);
3034 if (BE (new_coll_syms
== NULL
, 0))
3036 mbcset
->coll_syms
= new_coll_syms
;
3037 *coll_sym_alloc
= new_coll_sym_alloc
;
3039 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3044 if (BE (name_len
!= 1, 0))
3045 return REG_ECOLLATE
;
3048 bitset_set (sbcset
, name
[0]);
3055 re_token_t br_token
;
3056 re_bitset_ptr_t sbcset
;
3057 #ifdef RE_ENABLE_I18N
3058 re_charset_t
*mbcset
;
3059 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3060 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3061 #endif /* not RE_ENABLE_I18N */
3063 bin_tree_t
*work_tree
;
3065 int first_round
= 1;
3067 collseqmb
= (const unsigned char *)
3068 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3069 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3075 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3076 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3077 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3078 _NL_COLLATE_SYMB_TABLEMB
);
3079 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3080 _NL_COLLATE_SYMB_EXTRAMB
);
3083 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3084 #ifdef RE_ENABLE_I18N
3085 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3086 #endif /* RE_ENABLE_I18N */
3087 #ifdef RE_ENABLE_I18N
3088 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3090 if (BE (sbcset
== NULL
, 0))
3091 #endif /* RE_ENABLE_I18N */
3097 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3098 if (BE (token
->type
== END_OF_RE
, 0))
3101 goto parse_bracket_exp_free_return
;
3103 if (token
->type
== OP_NON_MATCH_LIST
)
3105 #ifdef RE_ENABLE_I18N
3106 mbcset
->non_match
= 1;
3107 #endif /* not RE_ENABLE_I18N */
3109 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3110 bitset_set (sbcset
, '\n');
3111 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3112 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3113 if (BE (token
->type
== END_OF_RE
, 0))
3116 goto parse_bracket_exp_free_return
;
3120 /* We treat the first ']' as a normal character. */
3121 if (token
->type
== OP_CLOSE_BRACKET
)
3122 token
->type
= CHARACTER
;
3126 bracket_elem_t start_elem
, end_elem
;
3127 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3128 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3130 int token_len2
= 0, is_range_exp
= 0;
3133 start_elem
.opr
.name
= start_name_buf
;
3134 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3135 syntax
, first_round
);
3136 if (BE (ret
!= REG_NOERROR
, 0))
3139 goto parse_bracket_exp_free_return
;
3143 /* Get information about the next token. We need it in any case. */
3144 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3146 /* Do not check for ranges if we know they are not allowed. */
3147 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3149 if (BE (token
->type
== END_OF_RE
, 0))
3152 goto parse_bracket_exp_free_return
;
3154 if (token
->type
== OP_CHARSET_RANGE
)
3156 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3157 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3158 if (BE (token2
.type
== END_OF_RE
, 0))
3161 goto parse_bracket_exp_free_return
;
3163 if (token2
.type
== OP_CLOSE_BRACKET
)
3165 /* We treat the last '-' as a normal character. */
3166 re_string_skip_bytes (regexp
, -token_len
);
3167 token
->type
= CHARACTER
;
3174 if (is_range_exp
== 1)
3176 end_elem
.opr
.name
= end_name_buf
;
3177 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3179 if (BE (ret
!= REG_NOERROR
, 0))
3182 goto parse_bracket_exp_free_return
;
3185 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3188 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3189 &start_elem
, &end_elem
);
3191 # ifdef RE_ENABLE_I18N
3192 *err
= build_range_exp (sbcset
,
3193 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3194 &range_alloc
, &start_elem
, &end_elem
);
3196 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3198 #endif /* RE_ENABLE_I18N */
3199 if (BE (*err
!= REG_NOERROR
, 0))
3200 goto parse_bracket_exp_free_return
;
3204 switch (start_elem
.type
)
3207 bitset_set (sbcset
, start_elem
.opr
.ch
);
3209 #ifdef RE_ENABLE_I18N
3211 /* Check whether the array has enough space. */
3212 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3214 wchar_t *new_mbchars
;
3215 /* Not enough, realloc it. */
3216 /* +1 in case of mbcset->nmbchars is 0. */
3217 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3218 /* Use realloc since array is NULL if *alloc == 0. */
3219 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3221 if (BE (new_mbchars
== NULL
, 0))
3222 goto parse_bracket_exp_espace
;
3223 mbcset
->mbchars
= new_mbchars
;
3225 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3227 #endif /* RE_ENABLE_I18N */
3229 *err
= build_equiv_class (sbcset
,
3230 #ifdef RE_ENABLE_I18N
3231 mbcset
, &equiv_class_alloc
,
3232 #endif /* RE_ENABLE_I18N */
3233 start_elem
.opr
.name
);
3234 if (BE (*err
!= REG_NOERROR
, 0))
3235 goto parse_bracket_exp_free_return
;
3238 *err
= build_collating_symbol (sbcset
,
3239 #ifdef RE_ENABLE_I18N
3240 mbcset
, &coll_sym_alloc
,
3241 #endif /* RE_ENABLE_I18N */
3242 start_elem
.opr
.name
);
3243 if (BE (*err
!= REG_NOERROR
, 0))
3244 goto parse_bracket_exp_free_return
;
3247 *err
= build_charclass (regexp
->trans
, sbcset
,
3248 #ifdef RE_ENABLE_I18N
3249 mbcset
, &char_class_alloc
,
3250 #endif /* RE_ENABLE_I18N */
3251 (const char *) start_elem
.opr
.name
, syntax
);
3252 if (BE (*err
!= REG_NOERROR
, 0))
3253 goto parse_bracket_exp_free_return
;
3260 if (BE (token
->type
== END_OF_RE
, 0))
3263 goto parse_bracket_exp_free_return
;
3265 if (token
->type
== OP_CLOSE_BRACKET
)
3269 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3271 /* If it is non-matching list. */
3273 bitset_not (sbcset
);
3275 #ifdef RE_ENABLE_I18N
3276 /* Ensure only single byte characters are set. */
3277 if (dfa
->mb_cur_max
> 1)
3278 bitset_mask (sbcset
, dfa
->sb_char
);
3280 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3281 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3282 || mbcset
->non_match
)))
3284 bin_tree_t
*mbc_tree
;
3286 /* Build a tree for complex bracket. */
3287 dfa
->has_mb_node
= 1;
3288 br_token
.type
= COMPLEX_BRACKET
;
3289 br_token
.opr
.mbcset
= mbcset
;
3290 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3291 if (BE (mbc_tree
== NULL
, 0))
3292 goto parse_bracket_exp_espace
;
3293 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3294 if (sbcset
[sbc_idx
])
3296 /* If there are no bits set in sbcset, there is no point
3297 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3298 if (sbc_idx
< BITSET_WORDS
)
3300 /* Build a tree for simple bracket. */
3301 br_token
.type
= SIMPLE_BRACKET
;
3302 br_token
.opr
.sbcset
= sbcset
;
3303 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3304 if (BE (work_tree
== NULL
, 0))
3305 goto parse_bracket_exp_espace
;
3307 /* Then join them by ALT node. */
3308 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3309 if (BE (work_tree
== NULL
, 0))
3310 goto parse_bracket_exp_espace
;
3315 work_tree
= mbc_tree
;
3319 #endif /* not RE_ENABLE_I18N */
3321 #ifdef RE_ENABLE_I18N
3322 free_charset (mbcset
);
3324 /* Build a tree for simple bracket. */
3325 br_token
.type
= SIMPLE_BRACKET
;
3326 br_token
.opr
.sbcset
= sbcset
;
3327 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3328 if (BE (work_tree
== NULL
, 0))
3329 goto parse_bracket_exp_espace
;
3333 parse_bracket_exp_espace
:
3335 parse_bracket_exp_free_return
:
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset
);
3339 #endif /* RE_ENABLE_I18N */
3343 /* Parse an element in the bracket expression. */
3345 static reg_errcode_t
3346 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3347 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3348 reg_syntax_t syntax
, int accept_hyphen
)
3350 #ifdef RE_ENABLE_I18N
3352 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3353 if (cur_char_size
> 1)
3355 elem
->type
= MB_CHAR
;
3356 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3357 re_string_skip_bytes (regexp
, cur_char_size
);
3360 #endif /* RE_ENABLE_I18N */
3361 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3362 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3363 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3364 return parse_bracket_symbol (elem
, regexp
, token
);
3365 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3367 /* A '-' must only appear as anything but a range indicator before
3368 the closing bracket. Everything else is an error. */
3370 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3371 if (token2
.type
!= OP_CLOSE_BRACKET
)
3372 /* The actual error value is not standardized since this whole
3373 case is undefined. But ERANGE makes good sense. */
3376 elem
->type
= SB_CHAR
;
3377 elem
->opr
.ch
= token
->opr
.c
;
3381 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3382 such as [:<character_class>:], [.<collating_element>.], and
3383 [=<equivalent_class>=]. */
3385 static reg_errcode_t
3386 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3389 unsigned char ch
, delim
= token
->opr
.c
;
3391 if (re_string_eoi(regexp
))
3395 if (i
>= BRACKET_NAME_BUF_SIZE
)
3397 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3398 ch
= re_string_fetch_byte_case (regexp
);
3400 ch
= re_string_fetch_byte (regexp
);
3401 if (re_string_eoi(regexp
))
3403 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3405 elem
->opr
.name
[i
] = ch
;
3407 re_string_skip_bytes (regexp
, 1);
3408 elem
->opr
.name
[i
] = '\0';
3409 switch (token
->type
)
3411 case OP_OPEN_COLL_ELEM
:
3412 elem
->type
= COLL_SYM
;
3414 case OP_OPEN_EQUIV_CLASS
:
3415 elem
->type
= EQUIV_CLASS
;
3417 case OP_OPEN_CHAR_CLASS
:
3418 elem
->type
= CHAR_CLASS
;
3426 /* Helper function for parse_bracket_exp.
3427 Build the equivalence class which is represented by NAME.
3428 The result are written to MBCSET and SBCSET.
3429 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3430 is a pointer argument sinse we may update it. */
3432 static reg_errcode_t
3433 #ifdef RE_ENABLE_I18N
3434 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3435 int *equiv_class_alloc
, const unsigned char *name
)
3436 #else /* not RE_ENABLE_I18N */
3437 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3438 #endif /* not RE_ENABLE_I18N */
3441 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3444 const int32_t *table
, *indirect
;
3445 const unsigned char *weights
, *extra
, *cp
;
3446 unsigned char char_buf
[2];
3450 /* This #include defines a local function! */
3451 # include <locale/weight.h>
3452 /* Calculate the index for equivalence class. */
3454 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3455 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3456 _NL_COLLATE_WEIGHTMB
);
3457 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3458 _NL_COLLATE_EXTRAMB
);
3459 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3460 _NL_COLLATE_INDIRECTMB
);
3461 idx1
= findidx (&cp
);
3462 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3463 /* This isn't a valid character. */
3464 return REG_ECOLLATE
;
3466 /* Build single byte matcing table for this equivalence class. */
3467 char_buf
[1] = (unsigned char) '\0';
3468 len
= weights
[idx1
& 0xffffff];
3469 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3473 idx2
= findidx (&cp
);
3478 /* This isn't a valid character. */
3480 /* Compare only if the length matches and the collation rule
3481 index is the same. */
3482 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3486 while (cnt
<= len
&&
3487 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3488 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3492 bitset_set (sbcset
, ch
);
3495 /* Check whether the array has enough space. */
3496 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3498 /* Not enough, realloc it. */
3499 /* +1 in case of mbcset->nequiv_classes is 0. */
3500 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3501 /* Use realloc since the array is NULL if *alloc == 0. */
3502 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3504 new_equiv_class_alloc
);
3505 if (BE (new_equiv_classes
== NULL
, 0))
3507 mbcset
->equiv_classes
= new_equiv_classes
;
3508 *equiv_class_alloc
= new_equiv_class_alloc
;
3510 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3515 if (BE (strlen ((const char *) name
) != 1, 0))
3516 return REG_ECOLLATE
;
3517 bitset_set (sbcset
, *name
);
3522 /* Helper function for parse_bracket_exp.
3523 Build the character class which is represented by NAME.
3524 The result are written to MBCSET and SBCSET.
3525 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3526 is a pointer argument sinse we may update it. */
3528 static reg_errcode_t
3529 #ifdef RE_ENABLE_I18N
3530 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3531 re_charset_t
*mbcset
, int *char_class_alloc
,
3532 const char *class_name
, reg_syntax_t syntax
)
3533 #else /* not RE_ENABLE_I18N */
3534 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3535 const char *class_name
, reg_syntax_t syntax
)
3536 #endif /* not RE_ENABLE_I18N */
3540 /* In case of REG_ICASE "upper" and "lower" match the both of
3541 upper and lower cases. */
3542 if ((syntax
& RE_ICASE
)
3543 && (strcmp (class_name
, "upper") == 0 || strcmp (class_name
, "lower") == 0))
3544 class_name
= "alpha";
3546 #ifdef RE_ENABLE_I18N
3547 /* Check the space of the arrays. */
3548 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nchar_classes is 0. */
3552 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3553 /* Use realloc since array is NULL if *alloc == 0. */
3554 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3555 new_char_class_alloc
);
3556 if (BE (new_char_classes
== NULL
, 0))
3558 mbcset
->char_classes
= new_char_classes
;
3559 *char_class_alloc
= new_char_class_alloc
;
3561 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (class_name
);
3562 #endif /* RE_ENABLE_I18N */
3564 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3566 if (BE (trans != NULL, 0)) \
3568 for (i = 0; i < SBC_MAX; ++i) \
3569 if (ctype_func (i)) \
3570 bitset_set (sbcset, trans[i]); \
3574 for (i = 0; i < SBC_MAX; ++i) \
3575 if (ctype_func (i)) \
3576 bitset_set (sbcset, i); \
3580 if (strcmp (class_name
, "alnum") == 0)
3581 BUILD_CHARCLASS_LOOP (isalnum
);
3582 else if (strcmp (class_name
, "cntrl") == 0)
3583 BUILD_CHARCLASS_LOOP (iscntrl
);
3584 else if (strcmp (class_name
, "lower") == 0)
3585 BUILD_CHARCLASS_LOOP (islower
);
3586 else if (strcmp (class_name
, "space") == 0)
3587 BUILD_CHARCLASS_LOOP (isspace
);
3588 else if (strcmp (class_name
, "alpha") == 0)
3589 BUILD_CHARCLASS_LOOP (isalpha
);
3590 else if (strcmp (class_name
, "digit") == 0)
3591 BUILD_CHARCLASS_LOOP (isdigit
);
3592 else if (strcmp (class_name
, "print") == 0)
3593 BUILD_CHARCLASS_LOOP (isprint
);
3594 else if (strcmp (class_name
, "upper") == 0)
3595 BUILD_CHARCLASS_LOOP (isupper
);
3596 else if (strcmp (class_name
, "blank") == 0)
3598 BUILD_CHARCLASS_LOOP (isblank
);
3600 /* see comments above */
3601 BUILD_CHARCLASS_LOOP (is_blank
);
3603 else if (strcmp (class_name
, "graph") == 0)
3604 BUILD_CHARCLASS_LOOP (isgraph
);
3605 else if (strcmp (class_name
, "punct") == 0)
3606 BUILD_CHARCLASS_LOOP (ispunct
);
3607 else if (strcmp (class_name
, "xdigit") == 0)
3608 BUILD_CHARCLASS_LOOP (isxdigit
);
3616 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3617 const char *class_name
,
3618 const char *extra
, int non_match
,
3621 re_bitset_ptr_t sbcset
;
3622 #ifdef RE_ENABLE_I18N
3623 re_charset_t
*mbcset
;
3625 #endif /* not RE_ENABLE_I18N */
3627 re_token_t br_token
;
3630 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3631 #ifdef RE_ENABLE_I18N
3632 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3633 #endif /* RE_ENABLE_I18N */
3635 #ifdef RE_ENABLE_I18N
3636 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3637 #else /* not RE_ENABLE_I18N */
3638 if (BE (sbcset
== NULL
, 0))
3639 #endif /* not RE_ENABLE_I18N */
3647 #ifdef RE_ENABLE_I18N
3648 mbcset
->non_match
= 1;
3649 #endif /* not RE_ENABLE_I18N */
3652 /* We don't care the syntax in this case. */
3653 ret
= build_charclass (trans
, sbcset
,
3654 #ifdef RE_ENABLE_I18N
3656 #endif /* RE_ENABLE_I18N */
3659 if (BE (ret
!= REG_NOERROR
, 0))
3662 #ifdef RE_ENABLE_I18N
3663 free_charset (mbcset
);
3664 #endif /* RE_ENABLE_I18N */
3668 /* \w match '_' also. */
3669 for (; *extra
; extra
++)
3670 bitset_set (sbcset
, *extra
);
3672 /* If it is non-matching list. */
3674 bitset_not (sbcset
);
3676 #ifdef RE_ENABLE_I18N
3677 /* Ensure only single byte characters are set. */
3678 if (dfa
->mb_cur_max
> 1)
3679 bitset_mask (sbcset
, dfa
->sb_char
);
3682 /* Build a tree for simple bracket. */
3683 br_token
.type
= SIMPLE_BRACKET
;
3684 br_token
.opr
.sbcset
= sbcset
;
3685 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3686 if (BE (tree
== NULL
, 0))
3687 goto build_word_op_espace
;
3689 #ifdef RE_ENABLE_I18N
3690 if (dfa
->mb_cur_max
> 1)
3692 bin_tree_t
*mbc_tree
;
3693 /* Build a tree for complex bracket. */
3694 br_token
.type
= COMPLEX_BRACKET
;
3695 br_token
.opr
.mbcset
= mbcset
;
3696 dfa
->has_mb_node
= 1;
3697 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3698 if (BE (mbc_tree
== NULL
, 0))
3699 goto build_word_op_espace
;
3700 /* Then join them by ALT node. */
3701 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3702 if (BE (mbc_tree
!= NULL
, 1))
3707 free_charset (mbcset
);
3710 #else /* not RE_ENABLE_I18N */
3712 #endif /* not RE_ENABLE_I18N */
3714 build_word_op_espace
:
3716 #ifdef RE_ENABLE_I18N
3717 free_charset (mbcset
);
3718 #endif /* RE_ENABLE_I18N */
3723 /* This is intended for the expressions like "a{1,3}".
3724 Fetch a number from `input', and return the number.
3725 Return -1, if the number field is empty like "{,1}".
3726 Return -2, If an error is occured. */
3729 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3735 fetch_token (token
, input
, syntax
);
3737 if (BE (token
->type
== END_OF_RE
, 0))
3739 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3741 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3742 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3743 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3748 #ifdef RE_ENABLE_I18N
3750 free_charset (re_charset_t
*cset
)
3752 re_free (cset
->mbchars
);
3754 re_free (cset
->coll_syms
);
3755 re_free (cset
->equiv_classes
);
3756 re_free (cset
->range_starts
);
3757 re_free (cset
->range_ends
);
3759 re_free (cset
->char_classes
);
3762 #endif /* RE_ENABLE_I18N */
3764 /* Functions for binary tree operation. */
3766 /* Create a tree node. */
3769 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3770 re_token_type_t type
)
3774 return create_token_tree (dfa
, left
, right
, &t
);
3778 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3779 const re_token_t
*token
)
3782 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3784 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3786 if (storage
== NULL
)
3788 storage
->next
= dfa
->str_tree_storage
;
3789 dfa
->str_tree_storage
= storage
;
3790 dfa
->str_tree_storage_idx
= 0;
3792 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3794 tree
->parent
= NULL
;
3796 tree
->right
= right
;
3797 tree
->token
= *token
;
3798 tree
->token
.duplicated
= 0;
3799 tree
->token
.opt_subexp
= 0;
3802 tree
->node_idx
= -1;
3805 left
->parent
= tree
;
3807 right
->parent
= tree
;
3811 /* Mark the tree SRC as an optional subexpression.
3812 To be called from preorder or postorder. */
3814 static reg_errcode_t
3815 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3817 int idx
= (int) (long) extra
;
3818 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3819 node
->token
.opt_subexp
= 1;
3824 /* Free the allocated memory inside NODE. */
3827 free_token (re_token_t
*node
)
3829 #ifdef RE_ENABLE_I18N
3830 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3831 free_charset (node
->opr
.mbcset
);
3833 #endif /* RE_ENABLE_I18N */
3834 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3835 re_free (node
->opr
.sbcset
);
3838 /* Worker function for tree walking. Free the allocated memory inside NODE
3839 and its children. */
3841 static reg_errcode_t
3842 free_tree (void *extra
, bin_tree_t
*node
)
3844 free_token (&node
->token
);
3849 /* Duplicate the node SRC, and return new node. This is a preorder
3850 visit similar to the one implemented by the generic visitor, but
3851 we need more infrastructure to maintain two parallel trees --- so,
3852 it's easier to duplicate. */
3855 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3857 const bin_tree_t
*node
;
3858 bin_tree_t
*dup_root
;
3859 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3861 for (node
= root
; ; )
3863 /* Create a new tree and link it back to the current parent. */
3864 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3867 (*p_new
)->parent
= dup_node
;
3868 (*p_new
)->token
.duplicated
= 1;
3871 /* Go to the left node, or up and to the right. */
3875 p_new
= &dup_node
->left
;
3879 const bin_tree_t
*prev
= NULL
;
3880 while (node
->right
== prev
|| node
->right
== NULL
)
3883 node
= node
->parent
;
3884 dup_node
= dup_node
->parent
;
3889 p_new
= &dup_node
->right
;