1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static reg_errcode_t
preorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
postorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
45 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
46 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
48 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
52 int top_clone_node
, int root_node
,
53 unsigned int constraint
);
54 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
55 unsigned int constraint
);
56 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
57 unsigned int constraint
);
58 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
59 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
61 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
62 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
64 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
66 static int peek_token (re_token_t
*token
, re_string_t
*input
,
68 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
70 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
71 reg_syntax_t syntax
, reg_errcode_t
*err
);
72 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
73 re_token_t
*token
, reg_syntax_t syntax
,
74 int nest
, reg_errcode_t
*err
);
75 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
76 re_token_t
*token
, reg_syntax_t syntax
,
77 int nest
, reg_errcode_t
*err
);
78 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
79 re_token_t
*token
, reg_syntax_t syntax
,
80 int nest
, reg_errcode_t
*err
);
81 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
82 re_token_t
*token
, reg_syntax_t syntax
,
83 int nest
, reg_errcode_t
*err
);
84 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
85 re_dfa_t
*dfa
, re_token_t
*token
,
86 reg_syntax_t syntax
, reg_errcode_t
*err
);
87 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
88 re_token_t
*token
, reg_syntax_t syntax
,
90 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
92 re_token_t
*token
, int token_len
,
96 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
102 re_charset_t
*mbcset
, int *range_alloc
,
103 bracket_elem_t
*start_elem
,
104 bracket_elem_t
*end_elem
);
105 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
106 re_charset_t
*mbcset
,
108 const unsigned char *name
);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
111 bracket_elem_t
*start_elem
,
112 bracket_elem_t
*end_elem
);
113 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
114 const unsigned char *name
);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
119 re_charset_t
*mbcset
,
120 int *equiv_class_alloc
,
121 const unsigned char *name
);
122 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
123 re_bitset_ptr_t sbcset
,
124 re_charset_t
*mbcset
,
125 int *char_class_alloc
,
126 const unsigned char *class_name
,
127 reg_syntax_t syntax
);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
130 const unsigned char *name
);
131 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
132 re_bitset_ptr_t sbcset
,
133 const unsigned char *class_name
,
134 reg_syntax_t syntax
);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
137 unsigned RE_TRANSLATE_TYPE trans
,
138 const unsigned char *class_name
,
139 const unsigned char *extra
,
140 int non_match
, reg_errcode_t
*err
);
141 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
142 bin_tree_t
*left
, bin_tree_t
*right
,
143 re_token_type_t type
);
144 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
145 bin_tree_t
*left
, bin_tree_t
*right
,
146 const re_token_t
*token
);
147 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
148 static void free_token (re_token_t
*node
);
149 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
150 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid
[] attribute_hidden
=
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx
[] attribute_hidden
=
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
242 re_compile_pattern (pattern
, length
, bufp
)
245 struct re_pattern_buffer
*bufp
;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
254 /* Match anchors at newline. */
255 bufp
->newline_anchor
= 1;
257 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
261 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
264 weak_alias (__re_compile_pattern
, re_compile_pattern
)
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options
;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
283 re_set_syntax (syntax
)
286 reg_syntax_t ret
= re_syntax_options
;
288 re_syntax_options
= syntax
;
292 weak_alias (__re_set_syntax
, re_set_syntax
)
296 re_compile_fastmap (bufp
)
297 struct re_pattern_buffer
*bufp
;
299 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
300 char *fastmap
= bufp
->fastmap
;
302 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
303 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_word
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_nl
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
308 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
309 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
310 bufp
->fastmap_accurate
= 1;
314 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
318 __attribute ((always_inline
))
319 re_set_fastmap (char *fastmap
, int icase
, int ch
)
323 fastmap
[tolower (ch
)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
330 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
332 const re_dfastate_t
*init_state
;
335 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
337 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
338 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
340 int node
= init_state
->nodes
.elems
[node_cnt
];
341 re_token_type_t type
= dfa
->nodes
[node
].type
;
343 if (type
== CHARACTER
)
345 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
349 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
354 *p
++ = dfa
->nodes
[node
].opr
.c
;
355 while (++node
< dfa
->nodes_len
356 && dfa
->nodes
[node
].type
== CHARACTER
357 && dfa
->nodes
[node
].mb_partial
)
358 *p
++ = dfa
->nodes
[node
].opr
.c
;
359 memset (&state
, 0, sizeof (state
));
360 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
362 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
363 re_set_fastmap (fastmap
, 0, buf
[0]);
367 else if (type
== SIMPLE_BRACKET
)
370 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
371 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
372 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
373 re_set_fastmap (fastmap
, icase
, ch
);
375 #ifdef RE_ENABLE_I18N
376 else if (type
== COMPLEX_BRACKET
)
379 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
380 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
381 || cset
->nranges
|| cset
->nchar_classes
)
384 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
393 const int32_t *table
= (const int32_t *)
394 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
395 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
396 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
398 re_set_fastmap (fastmap
, icase
, ch
);
401 if (dfa
->mb_cur_max
> 1)
402 for (i
= 0; i
< SBC_MAX
; ++i
)
403 if (__btowc (i
) == WEOF
)
404 re_set_fastmap (fastmap
, icase
, i
);
405 # endif /* not _LIBC */
407 for (i
= 0; i
< cset
->nmbchars
; ++i
)
411 memset (&state
, '\0', sizeof (state
));
412 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
413 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
414 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
416 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
417 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
421 #endif /* RE_ENABLE_I18N */
422 else if (type
== OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type
== OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type
== END_OF_RE
)
428 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
429 if (type
== END_OF_RE
)
430 bufp
->can_be_null
= 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg
, pattern
, cflags
)
474 regex_t
*__restrict preg
;
475 const char *__restrict pattern
;
479 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC
);
486 /* Try to allocate space for the fastmap. */
487 preg
->fastmap
= re_malloc (char, SBC_MAX
);
488 if (BE (preg
->fastmap
== NULL
, 0))
491 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags
& REG_NEWLINE
)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax
&= ~RE_DOT_NEWLINE
;
497 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
498 /* It also changes the matching behavior. */
499 preg
->newline_anchor
= 1;
502 preg
->newline_anchor
= 0;
503 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
504 preg
->translate
= NULL
;
506 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret
== REG_ERPAREN
)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret
== REG_NOERROR
, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg
);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg
->fastmap
);
522 preg
->fastmap
= NULL
;
528 weak_alias (__regcomp
, regcomp
)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode
, preg
, errbuf
, errbuf_size
)
545 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
546 / sizeof (__re_error_msgid_idx
[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
553 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
555 msg_size
= strlen (msg
) + 1; /* Includes the null. */
557 if (BE (errbuf_size
!= 0, 1))
559 if (BE (msg_size
> errbuf_size
, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
564 memcpy (errbuf
, msg
, errbuf_size
- 1);
565 errbuf
[errbuf_size
- 1] = 0;
569 memcpy (errbuf
, msg
, msg_size
);
575 weak_alias (__regerror
, regerror
)
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map
=
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590 # error "Add case for new unsigned int size"
597 free_dfa_content (re_dfa_t
*dfa
)
602 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 free_token (dfa
->nodes
+ i
);
604 re_free (dfa
->nexts
);
605 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
607 if (dfa
->eclosures
!= NULL
)
608 re_node_set_free (dfa
->eclosures
+ i
);
609 if (dfa
->inveclosures
!= NULL
)
610 re_node_set_free (dfa
->inveclosures
+ i
);
611 if (dfa
->edests
!= NULL
)
612 re_node_set_free (dfa
->edests
+ i
);
614 re_free (dfa
->edests
);
615 re_free (dfa
->eclosures
);
616 re_free (dfa
->inveclosures
);
617 re_free (dfa
->nodes
);
619 if (dfa
->state_table
)
620 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
622 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
623 for (j
= 0; j
< entry
->num
; ++j
)
625 re_dfastate_t
*state
= entry
->array
[j
];
628 re_free (entry
->array
);
630 re_free (dfa
->state_table
);
631 #ifdef RE_ENABLE_I18N
632 if (dfa
->sb_char
!= utf8_sb_map
)
633 re_free (dfa
->sb_char
);
635 re_free (dfa
->subexp_map
);
637 re_free (dfa
->re_str
);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
651 if (BE (dfa
!= NULL
, 1))
652 free_dfa_content (dfa
);
656 re_free (preg
->fastmap
);
657 preg
->fastmap
= NULL
;
659 re_free (preg
->translate
);
660 preg
->translate
= NULL
;
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf
.buffer
)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf
.buffer
)
696 fastmap
= re_comp_buf
.fastmap
;
697 re_comp_buf
.fastmap
= NULL
;
698 __regfree (&re_comp_buf
);
699 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
700 re_comp_buf
.fastmap
= fastmap
;
703 if (re_comp_buf
.fastmap
== NULL
)
705 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
706 if (re_comp_buf
.fastmap
== NULL
)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf
.newline_anchor
= 1;
717 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
727 libc_freeres_fn (free_mem
)
729 __regfree (&re_comp_buf
);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg
, pattern
, length
, syntax
)
742 const char * pattern
;
746 reg_errcode_t err
= REG_NOERROR
;
750 /* Initialize the pattern buffer. */
751 preg
->fastmap_accurate
= 0;
752 preg
->syntax
= syntax
;
753 preg
->not_bol
= preg
->not_eol
= 0;
756 preg
->can_be_null
= 0;
757 preg
->regs_allocated
= REGS_UNALLOCATED
;
759 /* Initialize the dfa. */
760 dfa
= (re_dfa_t
*) preg
->buffer
;
761 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
770 preg
->allocated
= sizeof (re_dfa_t
);
771 preg
->buffer
= (unsigned char *) dfa
;
773 preg
->used
= sizeof (re_dfa_t
);
775 err
= init_dfa (dfa
, length
);
776 if (BE (err
!= REG_NOERROR
, 0))
778 free_dfa_content (dfa
);
784 dfa
->re_str
= re_malloc (char, length
+ 1);
785 strncpy (dfa
->re_str
, pattern
, length
+ 1);
788 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
789 syntax
& RE_ICASE
, dfa
);
790 if (BE (err
!= REG_NOERROR
, 0))
792 re_compile_internal_free_return
:
793 free_workarea_compile (preg
);
794 re_string_destruct (®exp
);
795 free_dfa_content (dfa
);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
804 if (BE (dfa
->str_tree
== NULL
, 0))
805 goto re_compile_internal_free_return
;
807 /* Analyze the tree and create the nfa. */
808 err
= analyze (preg
);
809 if (BE (err
!= REG_NOERROR
, 0))
810 goto re_compile_internal_free_return
;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
818 /* Then create the initial state of the dfa. */
819 err
= create_initial_state (dfa
);
821 /* Release work areas. */
822 free_workarea_compile (preg
);
823 re_string_destruct (®exp
);
825 if (BE (err
!= REG_NOERROR
, 0))
827 free_dfa_content (dfa
);
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
839 init_dfa (dfa
, pat_len
)
848 memset (dfa
, '\0', sizeof (re_dfa_t
));
850 /* Force allocation of str_tree_storage the first time. */
851 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
853 dfa
->nodes_alloc
= pat_len
+ 1;
854 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
856 dfa
->states_alloc
= pat_len
+ 1;
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
860 if (table_size
> pat_len
)
863 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
864 dfa
->state_hash_mask
= table_size
- 1;
866 dfa
->mb_cur_max
= MB_CUR_MAX
;
868 if (dfa
->mb_cur_max
== 6
869 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
871 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name
= nl_langinfo (CODESET
);
877 codeset_name
= getenv ("LC_ALL");
878 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
879 codeset_name
= getenv ("LC_CTYPE");
880 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
881 codeset_name
= getenv ("LANG");
882 if (codeset_name
== NULL
)
884 else if (strchr (codeset_name
, '.') != NULL
)
885 codeset_name
= strchr (codeset_name
, '.') + 1;
888 if (strcasecmp (codeset_name
, "UTF-8") == 0
889 || strcasecmp (codeset_name
, "UTF8") == 0)
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa
->map_notascii
= 0;
897 #ifdef RE_ENABLE_I18N
898 if (dfa
->mb_cur_max
> 1)
901 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
906 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
907 if (BE (dfa
->sb_char
== NULL
, 0))
910 /* Clear all bits by, then set those corresponding to single
912 bitset_empty (dfa
->sb_char
);
914 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
915 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
917 wchar_t wch
= __btowc (ch
);
919 dfa
->sb_char
[i
] |= 1 << j
;
921 if (isascii (ch
) && wch
!= (wchar_t) ch
)
922 dfa
->map_notascii
= 1;
929 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
943 dfa
->word_ops_used
= 1;
944 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
945 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
946 if (isalnum (ch
) || ch
== '_')
947 dfa
->word_char
[i
] |= 1 << j
;
950 /* Free the work area which are only used while compiling. */
953 free_workarea_compile (preg
)
956 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
957 bin_tree_storage_t
*storage
, *next
;
958 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
960 next
= storage
->next
;
963 dfa
->str_tree_storage
= NULL
;
964 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
965 dfa
->str_tree
= NULL
;
966 re_free (dfa
->org_indices
);
967 dfa
->org_indices
= NULL
;
970 /* Create initial states for all contexts. */
973 create_initial_state (dfa
)
978 re_node_set init_nodes
;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first
= dfa
->str_tree
->first
->node_idx
;
983 dfa
->init_node
= first
;
984 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
985 if (BE (err
!= REG_NOERROR
, 0))
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa
->nbackref
> 0)
993 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
995 int node_idx
= init_nodes
.elems
[i
];
996 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
999 if (type
!= OP_BACK_REF
)
1001 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1003 re_token_t
*clexp_node
;
1004 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1005 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1006 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1009 if (clexp_idx
== init_nodes
.nelem
)
1012 if (type
== OP_BACK_REF
)
1014 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1015 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1017 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa
->init_state
== NULL
, 0))
1028 if (dfa
->init_state
->has_constraint
)
1030 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1032 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1034 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1038 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1039 || dfa
->init_state_begbuf
== NULL
, 0))
1043 dfa
->init_state_word
= dfa
->init_state_nl
1044 = dfa
->init_state_begbuf
= dfa
->init_state
;
1046 re_node_set_free (&init_nodes
);
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1059 int node
, i
, mb_chars
= 0, has_period
= 0;
1061 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1062 switch (dfa
->nodes
[node
].type
)
1065 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1069 switch (dfa
->nodes
[node
].opr
.idx
)
1077 /* Word anchors etc. cannot be handled. */
1087 case OP_DUP_ASTERISK
:
1088 case OP_OPEN_SUBEXP
:
1089 case OP_CLOSE_SUBEXP
:
1091 case COMPLEX_BRACKET
:
1093 case SIMPLE_BRACKET
:
1094 /* Just double check. */
1095 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1096 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1103 if (mb_chars
|| has_period
)
1104 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1106 if (dfa
->nodes
[node
].type
== CHARACTER
1107 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1108 dfa
->nodes
[node
].mb_partial
= 0;
1109 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1110 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1113 /* The search can be in single byte locale. */
1114 dfa
->mb_cur_max
= 1;
1116 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1123 static reg_errcode_t
1127 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1130 /* Allocate arrays. */
1131 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1132 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1133 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1134 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1135 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1136 || dfa
->eclosures
== NULL
, 0))
1139 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1140 if (dfa
->subexp_map
!= NULL
)
1143 for (i
= 0; i
< preg
->re_nsub
; i
++)
1144 dfa
->subexp_map
[i
] = i
;
1145 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1146 for (i
= 0; i
< preg
->re_nsub
; i
++)
1147 if (dfa
->subexp_map
[i
] != i
)
1149 if (i
== preg
->re_nsub
)
1151 free (dfa
->subexp_map
);
1152 dfa
->subexp_map
= NULL
;
1156 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1157 if (BE (ret
!= REG_NOERROR
, 0))
1159 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1160 if (BE (ret
!= REG_NOERROR
, 0))
1162 preorder (dfa
->str_tree
, calc_next
, dfa
);
1163 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1164 if (BE (ret
!= REG_NOERROR
, 0))
1166 ret
= calc_eclosure (dfa
);
1167 if (BE (ret
!= REG_NOERROR
, 0))
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1175 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1176 if (BE (dfa
->inveclosures
== NULL
, 0))
1178 ret
= calc_inveclosure (dfa
);
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (root
, fn
, extra
)
1190 reg_errcode_t (fn (void *, bin_tree_t
*));
1193 bin_tree_t
*node
, *prev
;
1195 for (node
= root
; ; )
1197 /* Descend down the tree, preferably to the left (or to the right
1198 if that's the only child). */
1199 while (node
->left
|| node
->right
)
1207 reg_errcode_t err
= fn (extra
, node
);
1208 if (BE (err
!= REG_NOERROR
, 0))
1210 if (node
->parent
== NULL
)
1213 node
= node
->parent
;
1215 /* Go up while we have a node that is reached from the right. */
1216 while (node
->right
== prev
|| node
->right
== NULL
);
1221 static reg_errcode_t
1222 preorder (root
, fn
, extra
)
1224 reg_errcode_t (fn (void *, bin_tree_t
*));
1229 for (node
= root
; ; )
1231 reg_errcode_t err
= fn (extra
, node
);
1232 if (BE (err
!= REG_NOERROR
, 0))
1235 /* Go to the left node, or up and to the right. */
1240 bin_tree_t
*prev
= NULL
;
1241 while (node
->right
== prev
|| node
->right
== NULL
)
1244 node
= node
->parent
;
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1255 backreferences as well. Requires a preorder visit. */
1256 static reg_errcode_t
1257 optimize_subexps (extra
, node
)
1261 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1263 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1265 int idx
= node
->token
.opr
.idx
;
1266 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1267 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1270 else if (node
->token
.type
== SUBEXP
1271 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1273 int other_idx
= node
->left
->token
.opr
.idx
;
1275 node
->left
= node
->left
->left
;
1277 node
->left
->parent
= node
;
1279 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1280 if (other_idx
< 8 * sizeof (dfa
->used_bkref_map
))
1281 dfa
->used_bkref_map
&= ~(1 << other_idx
);
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1289 static reg_errcode_t
1290 lower_subexps (extra
, node
)
1294 regex_t
*preg
= (regex_t
*) extra
;
1295 reg_errcode_t err
= REG_NOERROR
;
1297 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1299 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1301 node
->left
->parent
= node
;
1303 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1305 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1307 node
->right
->parent
= node
;
1314 lower_subexp (err
, preg
, node
)
1319 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1320 bin_tree_t
*body
= node
->left
;
1321 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1324 /* We do not optimize empty subexpressions, because otherwise we may
1325 have bad CONCAT nodes with NULL children. This is obviously not
1326 very common, so we do not lose much. An example that triggers
1327 this case is the sed "script" /\(\)/x. */
1328 && node
->left
!= NULL
1329 && (node
->token
.opr
.idx
>= 8 * sizeof (dfa
->used_bkref_map
)
1330 || !(dfa
->used_bkref_map
& (1 << node
->token
.opr
.idx
))))
1333 /* Convert the SUBEXP node to the concatenation of an
1334 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1335 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1336 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1337 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1338 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1339 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1345 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1346 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1350 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1351 nodes. Requires a postorder visit. */
1352 static reg_errcode_t
1353 calc_first (extra
, node
)
1357 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1358 if (node
->token
.type
== CONCAT
)
1360 node
->first
= node
->left
->first
;
1361 node
->node_idx
= node
->left
->node_idx
;
1366 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1367 if (BE (node
->node_idx
== -1, 0))
1373 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1374 static reg_errcode_t
1375 calc_next (extra
, node
)
1379 switch (node
->token
.type
)
1381 case OP_DUP_ASTERISK
:
1382 node
->left
->next
= node
;
1385 node
->left
->next
= node
->right
->first
;
1386 node
->right
->next
= node
->next
;
1390 node
->left
->next
= node
->next
;
1392 node
->right
->next
= node
->next
;
1398 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1399 static reg_errcode_t
1400 link_nfa_nodes (extra
, node
)
1404 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1405 int idx
= node
->node_idx
;
1406 reg_errcode_t err
= REG_NOERROR
;
1408 switch (node
->token
.type
)
1414 assert (node
->next
== NULL
);
1417 case OP_DUP_ASTERISK
:
1421 dfa
->has_plural_match
= 1;
1422 if (node
->left
!= NULL
)
1423 left
= node
->left
->first
->node_idx
;
1425 left
= node
->next
->node_idx
;
1426 if (node
->right
!= NULL
)
1427 right
= node
->right
->first
->node_idx
;
1429 right
= node
->next
->node_idx
;
1431 assert (right
> -1);
1432 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1437 case OP_OPEN_SUBEXP
:
1438 case OP_CLOSE_SUBEXP
:
1439 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1443 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1444 if (node
->token
.type
== OP_BACK_REF
)
1445 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1449 assert (!IS_EPSILON_NODE (node
->token
.type
));
1450 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1457 /* Duplicate the epsilon closure of the node ROOT_NODE.
1458 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1459 to their own constraint. */
1461 static reg_errcode_t
1462 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1465 int top_org_node
, top_clone_node
, root_node
;
1466 unsigned int init_constraint
;
1469 int org_node
, clone_node
, ret
;
1470 unsigned int constraint
= init_constraint
;
1471 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1473 int org_dest
, clone_dest
;
1474 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1476 /* If the back reference epsilon-transit, its destination must
1477 also have the constraint. Then duplicate the epsilon closure
1478 of the destination of the back reference, and store it in
1479 edests of the back reference. */
1480 org_dest
= dfa
->nexts
[org_node
];
1481 re_node_set_empty (dfa
->edests
+ clone_node
);
1482 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1483 if (BE (err
!= REG_NOERROR
, 0))
1485 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1486 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1487 if (BE (ret
< 0, 0))
1490 else if (dfa
->edests
[org_node
].nelem
== 0)
1492 /* In case of the node can't epsilon-transit, don't duplicate the
1493 destination and store the original destination as the
1494 destination of the node. */
1495 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1498 else if (dfa
->edests
[org_node
].nelem
== 1)
1500 /* In case of the node can epsilon-transit, and it has only one
1502 org_dest
= dfa
->edests
[org_node
].elems
[0];
1503 re_node_set_empty (dfa
->edests
+ clone_node
);
1504 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1506 /* In case of the node has another constraint, append it. */
1507 if (org_node
== root_node
&& clone_node
!= org_node
)
1509 /* ...but if the node is root_node itself, it means the
1510 epsilon closure have a loop, then tie it to the
1511 destination of the root_node. */
1512 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1514 if (BE (ret
< 0, 0))
1518 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1520 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1521 if (BE (err
!= REG_NOERROR
, 0))
1523 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1524 if (BE (ret
< 0, 0))
1527 else /* dfa->edests[org_node].nelem == 2 */
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest
= dfa
->edests
[org_node
].elems
[0];
1532 re_node_set_empty (dfa
->edests
+ clone_node
);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1535 if (clone_dest
== -1)
1537 /* There are no such a duplicated node, create a new one. */
1538 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1539 if (BE (err
!= REG_NOERROR
, 0))
1541 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1542 if (BE (ret
< 0, 0))
1544 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1545 root_node
, constraint
);
1546 if (BE (err
!= REG_NOERROR
, 0))
1551 /* There are a duplicated node which satisfy the constraint,
1552 use it to avoid infinite loop. */
1553 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1554 if (BE (ret
< 0, 0))
1558 org_dest
= dfa
->edests
[org_node
].elems
[1];
1559 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1560 if (BE (err
!= REG_NOERROR
, 0))
1562 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1563 if (BE (ret
< 0, 0))
1566 org_node
= org_dest
;
1567 clone_node
= clone_dest
;
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1576 search_duplicated_node (dfa
, org_node
, constraint
)
1579 unsigned int constraint
;
1582 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1584 if (org_node
== dfa
->org_indices
[idx
]
1585 && constraint
== dfa
->nodes
[idx
].constraint
)
1586 return idx
; /* Found. */
1588 return -1; /* Not found. */
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1593 otherwise return the error code. */
1595 static reg_errcode_t
1596 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1598 int *new_idx
, org_idx
;
1599 unsigned int constraint
;
1601 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1602 if (BE (dup_idx
== -1, 0))
1604 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1605 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1606 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1607 dfa
->nodes
[dup_idx
].duplicated
= 1;
1609 /* Store the index of the original node. */
1610 dfa
->org_indices
[dup_idx
] = org_idx
;
1615 static reg_errcode_t
1616 calc_inveclosure (dfa
)
1620 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1621 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1623 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1625 int *elems
= dfa
->eclosures
[src
].elems
;
1626 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1628 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1629 if (BE (ret
== -1, 0))
1637 /* Calculate "eclosure" for all the node in DFA. */
1639 static reg_errcode_t
1643 int node_idx
, incomplete
;
1645 assert (dfa
->nodes_len
> 0);
1648 /* For each nodes, calculate epsilon closure. */
1649 for (node_idx
= 0; ; ++node_idx
)
1652 re_node_set eclosure_elem
;
1653 if (node_idx
== dfa
->nodes_len
)
1662 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1665 /* If we have already calculated, skip it. */
1666 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1668 /* Calculate epsilon closure of `node_idx'. */
1669 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1670 if (BE (err
!= REG_NOERROR
, 0))
1673 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1676 re_node_set_free (&eclosure_elem
);
1682 /* Calculate epsilon closure of NODE. */
1684 static reg_errcode_t
1685 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1686 re_node_set
*new_set
;
1691 unsigned int constraint
;
1693 re_node_set eclosure
;
1695 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1696 if (BE (err
!= REG_NOERROR
, 0))
1699 /* This indicates that we are calculating this node now.
1700 We reference this value to avoid infinite loop. */
1701 dfa
->eclosures
[node
].nelem
= -1;
1703 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1704 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1705 /* If the current node has constraints, duplicate all nodes.
1706 Since they must inherit the constraints. */
1708 && dfa
->edests
[node
].nelem
1709 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1711 int org_node
, cur_node
;
1712 org_node
= cur_node
= node
;
1713 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1714 if (BE (err
!= REG_NOERROR
, 0))
1718 /* Expand each epsilon destination nodes. */
1719 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1720 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1722 re_node_set eclosure_elem
;
1723 int edest
= dfa
->edests
[node
].elems
[i
];
1724 /* If calculating the epsilon closure of `edest' is in progress,
1725 return intermediate result. */
1726 if (dfa
->eclosures
[edest
].nelem
== -1)
1731 /* If we haven't calculated the epsilon closure of `edest' yet,
1732 calculate now. Otherwise use calculated epsilon closure. */
1733 if (dfa
->eclosures
[edest
].nelem
== 0)
1735 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1736 if (BE (err
!= REG_NOERROR
, 0))
1740 eclosure_elem
= dfa
->eclosures
[edest
];
1741 /* Merge the epsilon closure of `edest'. */
1742 re_node_set_merge (&eclosure
, &eclosure_elem
);
1743 /* If the epsilon closure of `edest' is incomplete,
1744 the epsilon closure of this node is also incomplete. */
1745 if (dfa
->eclosures
[edest
].nelem
== 0)
1748 re_node_set_free (&eclosure_elem
);
1752 /* Epsilon closures include itself. */
1753 re_node_set_insert (&eclosure
, node
);
1754 if (incomplete
&& !root
)
1755 dfa
->eclosures
[node
].nelem
= 0;
1757 dfa
->eclosures
[node
] = eclosure
;
1758 *new_set
= eclosure
;
1762 /* Functions for token which are used in the parser. */
1764 /* Fetch a token from INPUT.
1765 We must not use this function inside bracket expressions. */
1768 fetch_token (result
, input
, syntax
)
1771 reg_syntax_t syntax
;
1773 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1776 /* Peek a token from INPUT, and return the length of the token.
1777 We must not use this function inside bracket expressions. */
1780 peek_token (token
, input
, syntax
)
1783 reg_syntax_t syntax
;
1787 if (re_string_eoi (input
))
1789 token
->type
= END_OF_RE
;
1793 c
= re_string_peek_byte (input
, 0);
1796 token
->word_char
= 0;
1797 #ifdef RE_ENABLE_I18N
1798 token
->mb_partial
= 0;
1799 if (input
->mb_cur_max
> 1 &&
1800 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1802 token
->type
= CHARACTER
;
1803 token
->mb_partial
= 1;
1810 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1812 token
->type
= BACK_SLASH
;
1816 c2
= re_string_peek_byte_case (input
, 1);
1818 token
->type
= CHARACTER
;
1819 #ifdef RE_ENABLE_I18N
1820 if (input
->mb_cur_max
> 1)
1822 wint_t wc
= re_string_wchar_at (input
,
1823 re_string_cur_idx (input
) + 1);
1824 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1828 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1833 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1834 token
->type
= OP_ALT
;
1836 case '1': case '2': case '3': case '4': case '5':
1837 case '6': case '7': case '8': case '9':
1838 if (!(syntax
& RE_NO_BK_REFS
))
1840 token
->type
= OP_BACK_REF
;
1841 token
->opr
.idx
= c2
- '1';
1845 if (!(syntax
& RE_NO_GNU_OPS
))
1847 token
->type
= ANCHOR
;
1848 token
->opr
.ctx_type
= WORD_FIRST
;
1852 if (!(syntax
& RE_NO_GNU_OPS
))
1854 token
->type
= ANCHOR
;
1855 token
->opr
.ctx_type
= WORD_LAST
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1861 token
->type
= ANCHOR
;
1862 token
->opr
.ctx_type
= WORD_DELIM
;
1866 if (!(syntax
& RE_NO_GNU_OPS
))
1868 token
->type
= ANCHOR
;
1869 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1873 if (!(syntax
& RE_NO_GNU_OPS
))
1874 token
->type
= OP_WORD
;
1877 if (!(syntax
& RE_NO_GNU_OPS
))
1878 token
->type
= OP_NOTWORD
;
1881 if (!(syntax
& RE_NO_GNU_OPS
))
1882 token
->type
= OP_SPACE
;
1885 if (!(syntax
& RE_NO_GNU_OPS
))
1886 token
->type
= OP_NOTSPACE
;
1889 if (!(syntax
& RE_NO_GNU_OPS
))
1891 token
->type
= ANCHOR
;
1892 token
->opr
.ctx_type
= BUF_FIRST
;
1896 if (!(syntax
& RE_NO_GNU_OPS
))
1898 token
->type
= ANCHOR
;
1899 token
->opr
.ctx_type
= BUF_LAST
;
1903 if (!(syntax
& RE_NO_BK_PARENS
))
1904 token
->type
= OP_OPEN_SUBEXP
;
1907 if (!(syntax
& RE_NO_BK_PARENS
))
1908 token
->type
= OP_CLOSE_SUBEXP
;
1911 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1912 token
->type
= OP_DUP_PLUS
;
1915 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1916 token
->type
= OP_DUP_QUESTION
;
1919 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1920 token
->type
= OP_OPEN_DUP_NUM
;
1923 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1924 token
->type
= OP_CLOSE_DUP_NUM
;
1932 token
->type
= CHARACTER
;
1933 #ifdef RE_ENABLE_I18N
1934 if (input
->mb_cur_max
> 1)
1936 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1937 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1941 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1946 if (syntax
& RE_NEWLINE_ALT
)
1947 token
->type
= OP_ALT
;
1950 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1951 token
->type
= OP_ALT
;
1954 token
->type
= OP_DUP_ASTERISK
;
1957 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1958 token
->type
= OP_DUP_PLUS
;
1961 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1962 token
->type
= OP_DUP_QUESTION
;
1965 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1966 token
->type
= OP_OPEN_DUP_NUM
;
1969 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1970 token
->type
= OP_CLOSE_DUP_NUM
;
1973 if (syntax
& RE_NO_BK_PARENS
)
1974 token
->type
= OP_OPEN_SUBEXP
;
1977 if (syntax
& RE_NO_BK_PARENS
)
1978 token
->type
= OP_CLOSE_SUBEXP
;
1981 token
->type
= OP_OPEN_BRACKET
;
1984 token
->type
= OP_PERIOD
;
1987 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1988 re_string_cur_idx (input
) != 0)
1990 char prev
= re_string_peek_byte (input
, -1);
1991 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1994 token
->type
= ANCHOR
;
1995 token
->opr
.ctx_type
= LINE_FIRST
;
1998 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1999 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2002 re_string_skip_bytes (input
, 1);
2003 peek_token (&next
, input
, syntax
);
2004 re_string_skip_bytes (input
, -1);
2005 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2008 token
->type
= ANCHOR
;
2009 token
->opr
.ctx_type
= LINE_LAST
;
2017 /* Peek a token from INPUT, and return the length of the token.
2018 We must not use this function out of bracket expressions. */
2021 peek_token_bracket (token
, input
, syntax
)
2024 reg_syntax_t syntax
;
2027 if (re_string_eoi (input
))
2029 token
->type
= END_OF_RE
;
2032 c
= re_string_peek_byte (input
, 0);
2035 #ifdef RE_ENABLE_I18N
2036 if (input
->mb_cur_max
> 1 &&
2037 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2039 token
->type
= CHARACTER
;
2042 #endif /* RE_ENABLE_I18N */
2044 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2045 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2047 /* In this case, '\' escape a character. */
2049 re_string_skip_bytes (input
, 1);
2050 c2
= re_string_peek_byte (input
, 0);
2052 token
->type
= CHARACTER
;
2055 if (c
== '[') /* '[' is a special char in a bracket exps. */
2059 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2060 c2
= re_string_peek_byte (input
, 1);
2068 token
->type
= OP_OPEN_COLL_ELEM
;
2071 token
->type
= OP_OPEN_EQUIV_CLASS
;
2074 if (syntax
& RE_CHAR_CLASSES
)
2076 token
->type
= OP_OPEN_CHAR_CLASS
;
2079 /* else fall through. */
2081 token
->type
= CHARACTER
;
2091 token
->type
= OP_CHARSET_RANGE
;
2094 token
->type
= OP_CLOSE_BRACKET
;
2097 token
->type
= OP_NON_MATCH_LIST
;
2100 token
->type
= CHARACTER
;
2105 /* Functions for parser. */
2107 /* Entry point of the parser.
2108 Parse the regular expression REGEXP and return the structure tree.
2109 If an error is occured, ERR is set by error code, and return NULL.
2110 This function build the following tree, from regular expression <reg_exp>:
2116 CAT means concatenation.
2117 EOR means end of regular expression. */
2120 parse (regexp
, preg
, syntax
, err
)
2121 re_string_t
*regexp
;
2123 reg_syntax_t syntax
;
2126 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2127 bin_tree_t
*tree
, *eor
, *root
;
2128 re_token_t current_token
;
2129 dfa
->syntax
= syntax
;
2130 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2131 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2132 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2134 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2136 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2139 if (BE (eor
== NULL
|| root
== NULL
, 0))
2147 /* This function build the following tree, from regular expression
2148 <branch1>|<branch2>:
2154 ALT means alternative, which represents the operator `|'. */
2157 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2158 re_string_t
*regexp
;
2161 reg_syntax_t syntax
;
2165 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2166 bin_tree_t
*tree
, *branch
= NULL
;
2167 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2168 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2171 while (token
->type
== OP_ALT
)
2173 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2174 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2175 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2177 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2178 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2183 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2184 if (BE (tree
== NULL
, 0))
2193 /* This function build the following tree, from regular expression
2200 CAT means concatenation. */
2203 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2204 re_string_t
*regexp
;
2207 reg_syntax_t syntax
;
2211 bin_tree_t
*tree
, *exp
;
2212 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2213 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2214 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2217 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2218 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2220 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2221 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2225 if (tree
!= NULL
&& exp
!= NULL
)
2227 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2234 else if (tree
== NULL
)
2236 /* Otherwise exp == NULL, we don't need to create new tree. */
2241 /* This function build the following tree, from regular expression a*:
2248 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2249 re_string_t
*regexp
;
2252 reg_syntax_t syntax
;
2256 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2258 switch (token
->type
)
2261 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2262 if (BE (tree
== NULL
, 0))
2267 #ifdef RE_ENABLE_I18N
2268 if (dfa
->mb_cur_max
> 1)
2270 while (!re_string_eoi (regexp
)
2271 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2273 bin_tree_t
*mbc_remain
;
2274 fetch_token (token
, regexp
, syntax
);
2275 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2276 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2277 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2286 case OP_OPEN_SUBEXP
:
2287 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2288 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2291 case OP_OPEN_BRACKET
:
2292 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2293 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2297 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2302 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2303 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2304 if (BE (tree
== NULL
, 0))
2310 dfa
->has_mb_node
= 1;
2312 case OP_OPEN_DUP_NUM
:
2313 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2319 case OP_DUP_ASTERISK
:
2321 case OP_DUP_QUESTION
:
2322 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2327 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2329 fetch_token (token
, regexp
, syntax
);
2330 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2332 /* else fall through */
2333 case OP_CLOSE_SUBEXP
:
2334 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2335 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2340 /* else fall through */
2341 case OP_CLOSE_DUP_NUM
:
2342 /* We treat it as a normal character. */
2344 /* Then we can these characters as normal characters. */
2345 token
->type
= CHARACTER
;
2346 /* mb_partial and word_char bits should be initialized already
2348 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2349 if (BE (tree
== NULL
, 0))
2356 if ((token
->opr
.ctx_type
2357 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2358 && dfa
->word_ops_used
== 0)
2359 init_word_char (dfa
);
2360 if (token
->opr
.ctx_type
== WORD_DELIM
2361 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2363 bin_tree_t
*tree_first
, *tree_last
;
2364 if (token
->opr
.ctx_type
== WORD_DELIM
)
2366 token
->opr
.ctx_type
= WORD_FIRST
;
2367 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2368 token
->opr
.ctx_type
= WORD_LAST
;
2372 token
->opr
.ctx_type
= INSIDE_WORD
;
2373 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2374 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2376 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2377 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2378 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2386 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2387 if (BE (tree
== NULL
, 0))
2393 /* We must return here, since ANCHORs can't be followed
2394 by repetition operators.
2395 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2396 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2397 fetch_token (token
, regexp
, syntax
);
2400 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2401 if (BE (tree
== NULL
, 0))
2406 if (dfa
->mb_cur_max
> 1)
2407 dfa
->has_mb_node
= 1;
2411 tree
= build_charclass_op (dfa
, regexp
->trans
,
2412 (const unsigned char *) "alnum",
2413 (const unsigned char *) "_",
2414 token
->type
== OP_NOTWORD
, err
);
2415 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2420 tree
= build_charclass_op (dfa
, regexp
->trans
,
2421 (const unsigned char *) "space",
2422 (const unsigned char *) "",
2423 token
->type
== OP_NOTSPACE
, err
);
2424 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2434 /* Must not happen? */
2440 fetch_token (token
, regexp
, syntax
);
2442 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2443 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2445 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2446 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2448 /* In BRE consecutive duplications are not allowed. */
2449 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2450 && (token
->type
== OP_DUP_ASTERISK
2451 || token
->type
== OP_OPEN_DUP_NUM
))
2461 /* This function build the following tree, from regular expression
2469 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2470 re_string_t
*regexp
;
2473 reg_syntax_t syntax
;
2477 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2480 cur_nsub
= preg
->re_nsub
++;
2482 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2484 /* The subexpression may be a null string. */
2485 if (token
->type
== OP_CLOSE_SUBEXP
)
2489 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2490 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2492 if (BE (*err
!= REG_NOERROR
, 0))
2495 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2497 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2498 if (BE (tree
== NULL
, 0))
2503 tree
->token
.opr
.idx
= cur_nsub
;
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2510 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2512 re_string_t
*regexp
;
2515 reg_syntax_t syntax
;
2518 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2519 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2520 re_token_t start_token
= *token
;
2522 if (token
->type
== OP_OPEN_DUP_NUM
)
2525 start
= fetch_number (regexp
, token
, syntax
);
2528 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2529 start
= 0; /* We treat "{,m}" as "{0,m}". */
2532 *err
= REG_BADBR
; /* <re>{} is invalid. */
2536 if (BE (start
!= -2, 1))
2538 /* We treat "{n}" as "{n,n}". */
2539 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2540 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2541 ? fetch_number (regexp
, token
, syntax
) : -2));
2543 if (BE (start
== -2 || end
== -2, 0))
2545 /* Invalid sequence. */
2546 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2548 if (token
->type
== END_OF_RE
)
2556 /* If the syntax bit is set, rollback. */
2557 re_string_set_index (regexp
, start_idx
);
2558 *token
= start_token
;
2559 token
->type
= CHARACTER
;
2560 /* mb_partial and word_char bits should be already initialized by
2565 if (BE (end
!= -1 && start
> end
, 0))
2567 /* First number greater than second. */
2574 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2575 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2578 fetch_token (token
, regexp
, syntax
);
2580 if (BE (elem
== NULL
, 0))
2582 if (BE (start
== 0 && end
== 0, 0))
2584 postorder (elem
, free_tree
, NULL
);
2588 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2589 if (BE (start
> 0, 0))
2592 for (i
= 2; i
<= start
; ++i
)
2594 elem
= duplicate_tree (elem
, dfa
);
2595 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2596 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2597 goto parse_dup_op_espace
;
2603 /* Duplicate ELEM before it is marked optional. */
2604 elem
= duplicate_tree (elem
, dfa
);
2610 if (elem
->token
.type
== SUBEXP
)
2611 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2613 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2614 if (BE (tree
== NULL
, 0))
2615 goto parse_dup_op_espace
;
2617 /* This loop is actually executed only when end != -1,
2618 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2619 already created the start+1-th copy. */
2620 for (i
= start
+ 2; i
<= end
; ++i
)
2622 elem
= duplicate_tree (elem
, dfa
);
2623 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2624 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2625 goto parse_dup_op_espace
;
2627 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2628 if (BE (tree
== NULL
, 0))
2629 goto parse_dup_op_espace
;
2633 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2637 parse_dup_op_espace
:
2642 /* Size of the names for collating symbol/equivalence_class/character_class.
2643 I'm not sure, but maybe enough. */
2644 #define BRACKET_NAME_BUF_SIZE 32
2647 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2648 Build the range expression which starts from START_ELEM, and ends
2649 at END_ELEM. The result are written to MBCSET and SBCSET.
2650 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651 mbcset->range_ends, is a pointer argument sinse we may
2654 static reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2657 re_charset_t
*mbcset
;
2659 # else /* not RE_ENABLE_I18N */
2660 build_range_exp (sbcset
, start_elem
, end_elem
)
2661 # endif /* not RE_ENABLE_I18N */
2662 re_bitset_ptr_t sbcset
;
2663 bracket_elem_t
*start_elem
, *end_elem
;
2665 unsigned int start_ch
, end_ch
;
2666 /* Equivalence Classes and Character Classes can't be a range start/end. */
2667 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2668 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2672 /* We can handle no multi character collating elements without libc
2674 if (BE ((start_elem
->type
== COLL_SYM
2675 && strlen ((char *) start_elem
->opr
.name
) > 1)
2676 || (end_elem
->type
== COLL_SYM
2677 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2678 return REG_ECOLLATE
;
2680 # ifdef RE_ENABLE_I18N
2682 wchar_t wc
, start_wc
, end_wc
;
2683 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2685 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2686 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2688 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2689 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2691 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2692 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2693 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2694 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2695 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2696 return REG_ECOLLATE
;
2697 cmp_buf
[0] = start_wc
;
2698 cmp_buf
[4] = end_wc
;
2699 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2702 /* Got valid collation sequence values, add them as a new entry.
2703 However, for !_LIBC we have no collation elements: if the
2704 character set is single byte, the single byte character set
2705 that we build below suffices. parse_bracket_exp passes
2706 no MBCSET if dfa->mb_cur_max == 1. */
2709 /* Check the space of the arrays. */
2710 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2712 /* There is not enough space, need realloc. */
2713 wchar_t *new_array_start
, *new_array_end
;
2716 /* +1 in case of mbcset->nranges is 0. */
2717 new_nranges
= 2 * mbcset
->nranges
+ 1;
2718 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2719 are NULL if *range_alloc == 0. */
2720 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2722 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2725 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2728 mbcset
->range_starts
= new_array_start
;
2729 mbcset
->range_ends
= new_array_end
;
2730 *range_alloc
= new_nranges
;
2733 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2734 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2737 /* Build the table for single byte characters. */
2738 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2741 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2742 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2743 bitset_set (sbcset
, wc
);
2746 # else /* not RE_ENABLE_I18N */
2749 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2750 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2752 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2753 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2755 if (start_ch
> end_ch
)
2757 /* Build the table for single byte characters. */
2758 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2759 if (start_ch
<= ch
&& ch
<= end_ch
)
2760 bitset_set (sbcset
, ch
);
2762 # endif /* not RE_ENABLE_I18N */
2765 #endif /* not _LIBC */
2768 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2769 Build the collating element which is represented by NAME.
2770 The result are written to MBCSET and SBCSET.
2771 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2772 pointer argument since we may update it. */
2774 static reg_errcode_t
2775 # ifdef RE_ENABLE_I18N
2776 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2777 re_charset_t
*mbcset
;
2778 int *coll_sym_alloc
;
2779 # else /* not RE_ENABLE_I18N */
2780 build_collating_symbol (sbcset
, name
)
2781 # endif /* not RE_ENABLE_I18N */
2782 re_bitset_ptr_t sbcset
;
2783 const unsigned char *name
;
2785 size_t name_len
= strlen ((const char *) name
);
2786 if (BE (name_len
!= 1, 0))
2787 return REG_ECOLLATE
;
2790 bitset_set (sbcset
, name
[0]);
2794 #endif /* not _LIBC */
2796 /* This function parse bracket expression like "[abc]", "[a-c]",
2800 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2801 re_string_t
*regexp
;
2804 reg_syntax_t syntax
;
2808 const unsigned char *collseqmb
;
2809 const char *collseqwc
;
2812 const int32_t *symb_table
;
2813 const unsigned char *extra
;
2815 /* Local function for parse_bracket_exp used in _LIBC environement.
2816 Seek the collating symbol entry correspondings to NAME.
2817 Return the index of the symbol in the SYMB_TABLE. */
2820 __attribute ((always_inline
))
2821 seek_collating_symbol_entry (name
, name_len
)
2822 const unsigned char *name
;
2825 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2826 int32_t elem
= hash
% table_size
;
2827 int32_t second
= hash
% (table_size
- 2);
2828 while (symb_table
[2 * elem
] != 0)
2830 /* First compare the hashing value. */
2831 if (symb_table
[2 * elem
] == hash
2832 /* Compare the length of the name. */
2833 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2834 /* Compare the name. */
2835 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2838 /* Yep, this is the entry. */
2848 /* Local function for parse_bracket_exp used in _LIBC environement.
2849 Look up the collation sequence value of BR_ELEM.
2850 Return the value if succeeded, UINT_MAX otherwise. */
2852 auto inline unsigned int
2853 __attribute ((always_inline
))
2854 lookup_collation_sequence_value (br_elem
)
2855 bracket_elem_t
*br_elem
;
2857 if (br_elem
->type
== SB_CHAR
)
2860 if (MB_CUR_MAX == 1)
2863 return collseqmb
[br_elem
->opr
.ch
];
2866 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2867 return __collseq_table_lookup (collseqwc
, wc
);
2870 else if (br_elem
->type
== MB_CHAR
)
2872 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2874 else if (br_elem
->type
== COLL_SYM
)
2876 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2880 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2882 if (symb_table
[2 * elem
] != 0)
2884 /* We found the entry. */
2885 idx
= symb_table
[2 * elem
+ 1];
2886 /* Skip the name of collating element name. */
2887 idx
+= 1 + extra
[idx
];
2888 /* Skip the byte sequence of the collating element. */
2889 idx
+= 1 + extra
[idx
];
2890 /* Adjust for the alignment. */
2891 idx
= (idx
+ 3) & ~3;
2892 /* Skip the multibyte collation sequence value. */
2893 idx
+= sizeof (unsigned int);
2894 /* Skip the wide char sequence of the collating element. */
2895 idx
+= sizeof (unsigned int) *
2896 (1 + *(unsigned int *) (extra
+ idx
));
2897 /* Return the collation sequence value. */
2898 return *(unsigned int *) (extra
+ idx
);
2900 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2902 /* No valid character. Match it as a single byte
2904 return collseqmb
[br_elem
->opr
.name
[0]];
2907 else if (sym_name_len
== 1)
2908 return collseqmb
[br_elem
->opr
.name
[0]];
2913 /* Local function for parse_bracket_exp used in _LIBC environement.
2914 Build the range expression which starts from START_ELEM, and ends
2915 at END_ELEM. The result are written to MBCSET and SBCSET.
2916 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2917 mbcset->range_ends, is a pointer argument sinse we may
2920 auto inline reg_errcode_t
2921 __attribute ((always_inline
))
2922 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2923 re_charset_t
*mbcset
;
2925 re_bitset_ptr_t sbcset
;
2926 bracket_elem_t
*start_elem
, *end_elem
;
2929 uint32_t start_collseq
;
2930 uint32_t end_collseq
;
2932 /* Equivalence Classes and Character Classes can't be a range
2934 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2935 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2939 start_collseq
= lookup_collation_sequence_value (start_elem
);
2940 end_collseq
= lookup_collation_sequence_value (end_elem
);
2941 /* Check start/end collation sequence values. */
2942 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2943 return REG_ECOLLATE
;
2944 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2947 /* Got valid collation sequence values, add them as a new entry.
2948 However, if we have no collation elements, and the character set
2949 is single byte, the single byte character set that we
2950 build below suffices. */
2951 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2953 /* Check the space of the arrays. */
2954 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2956 /* There is not enough space, need realloc. */
2957 uint32_t *new_array_start
;
2958 uint32_t *new_array_end
;
2961 /* +1 in case of mbcset->nranges is 0. */
2962 new_nranges
= 2 * mbcset
->nranges
+ 1;
2963 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2965 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2968 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2971 mbcset
->range_starts
= new_array_start
;
2972 mbcset
->range_ends
= new_array_end
;
2973 *range_alloc
= new_nranges
;
2976 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2977 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2980 /* Build the table for single byte characters. */
2981 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2983 uint32_t ch_collseq
;
2985 if (MB_CUR_MAX == 1)
2988 ch_collseq
= collseqmb
[ch
];
2990 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2991 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2992 bitset_set (sbcset
, ch
);
2997 /* Local function for parse_bracket_exp used in _LIBC environement.
2998 Build the collating element which is represented by NAME.
2999 The result are written to MBCSET and SBCSET.
3000 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3001 pointer argument sinse we may update it. */
3003 auto inline reg_errcode_t
3004 __attribute ((always_inline
))
3005 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3006 re_charset_t
*mbcset
;
3007 int *coll_sym_alloc
;
3008 re_bitset_ptr_t sbcset
;
3009 const unsigned char *name
;
3012 size_t name_len
= strlen ((const char *) name
);
3015 elem
= seek_collating_symbol_entry (name
, name_len
);
3016 if (symb_table
[2 * elem
] != 0)
3018 /* We found the entry. */
3019 idx
= symb_table
[2 * elem
+ 1];
3020 /* Skip the name of collating element name. */
3021 idx
+= 1 + extra
[idx
];
3023 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3025 /* No valid character, treat it as a normal
3027 bitset_set (sbcset
, name
[0]);
3031 return REG_ECOLLATE
;
3033 /* Got valid collation sequence, add it as a new entry. */
3034 /* Check the space of the arrays. */
3035 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3037 /* Not enough, realloc it. */
3038 /* +1 in case of mbcset->ncoll_syms is 0. */
3039 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3040 /* Use realloc since mbcset->coll_syms is NULL
3042 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3043 new_coll_sym_alloc
);
3044 if (BE (new_coll_syms
== NULL
, 0))
3046 mbcset
->coll_syms
= new_coll_syms
;
3047 *coll_sym_alloc
= new_coll_sym_alloc
;
3049 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3054 if (BE (name_len
!= 1, 0))
3055 return REG_ECOLLATE
;
3058 bitset_set (sbcset
, name
[0]);
3065 re_token_t br_token
;
3066 re_bitset_ptr_t sbcset
;
3067 #ifdef RE_ENABLE_I18N
3068 re_charset_t
*mbcset
;
3069 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3070 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3071 #endif /* not RE_ENABLE_I18N */
3073 bin_tree_t
*work_tree
;
3075 int first_round
= 1;
3077 collseqmb
= (const unsigned char *)
3078 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3079 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3085 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3086 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3087 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3088 _NL_COLLATE_SYMB_TABLEMB
);
3089 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3090 _NL_COLLATE_SYMB_EXTRAMB
);
3093 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3094 #ifdef RE_ENABLE_I18N
3095 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3096 #endif /* RE_ENABLE_I18N */
3097 #ifdef RE_ENABLE_I18N
3098 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3100 if (BE (sbcset
== NULL
, 0))
3101 #endif /* RE_ENABLE_I18N */
3107 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3108 if (BE (token
->type
== END_OF_RE
, 0))
3111 goto parse_bracket_exp_free_return
;
3113 if (token
->type
== OP_NON_MATCH_LIST
)
3115 #ifdef RE_ENABLE_I18N
3116 mbcset
->non_match
= 1;
3117 #endif /* not RE_ENABLE_I18N */
3119 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3120 bitset_set (sbcset
, '\0');
3121 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3122 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3123 if (BE (token
->type
== END_OF_RE
, 0))
3126 goto parse_bracket_exp_free_return
;
3130 /* We treat the first ']' as a normal character. */
3131 if (token
->type
== OP_CLOSE_BRACKET
)
3132 token
->type
= CHARACTER
;
3136 bracket_elem_t start_elem
, end_elem
;
3137 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3138 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3140 int token_len2
= 0, is_range_exp
= 0;
3143 start_elem
.opr
.name
= start_name_buf
;
3144 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3145 syntax
, first_round
);
3146 if (BE (ret
!= REG_NOERROR
, 0))
3149 goto parse_bracket_exp_free_return
;
3153 /* Get information about the next token. We need it in any case. */
3154 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3156 /* Do not check for ranges if we know they are not allowed. */
3157 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3159 if (BE (token
->type
== END_OF_RE
, 0))
3162 goto parse_bracket_exp_free_return
;
3164 if (token
->type
== OP_CHARSET_RANGE
)
3166 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3167 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3168 if (BE (token2
.type
== END_OF_RE
, 0))
3171 goto parse_bracket_exp_free_return
;
3173 if (token2
.type
== OP_CLOSE_BRACKET
)
3175 /* We treat the last '-' as a normal character. */
3176 re_string_skip_bytes (regexp
, -token_len
);
3177 token
->type
= CHARACTER
;
3184 if (is_range_exp
== 1)
3186 end_elem
.opr
.name
= end_name_buf
;
3187 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3189 if (BE (ret
!= REG_NOERROR
, 0))
3192 goto parse_bracket_exp_free_return
;
3195 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3198 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3199 &start_elem
, &end_elem
);
3201 # ifdef RE_ENABLE_I18N
3202 *err
= build_range_exp (sbcset
,
3203 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3204 &range_alloc
, &start_elem
, &end_elem
);
3206 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3208 #endif /* RE_ENABLE_I18N */
3209 if (BE (*err
!= REG_NOERROR
, 0))
3210 goto parse_bracket_exp_free_return
;
3214 switch (start_elem
.type
)
3217 bitset_set (sbcset
, start_elem
.opr
.ch
);
3219 #ifdef RE_ENABLE_I18N
3221 /* Check whether the array has enough space. */
3222 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3224 wchar_t *new_mbchars
;
3225 /* Not enough, realloc it. */
3226 /* +1 in case of mbcset->nmbchars is 0. */
3227 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3228 /* Use realloc since array is NULL if *alloc == 0. */
3229 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3231 if (BE (new_mbchars
== NULL
, 0))
3232 goto parse_bracket_exp_espace
;
3233 mbcset
->mbchars
= new_mbchars
;
3235 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3237 #endif /* RE_ENABLE_I18N */
3239 *err
= build_equiv_class (sbcset
,
3240 #ifdef RE_ENABLE_I18N
3241 mbcset
, &equiv_class_alloc
,
3242 #endif /* RE_ENABLE_I18N */
3243 start_elem
.opr
.name
);
3244 if (BE (*err
!= REG_NOERROR
, 0))
3245 goto parse_bracket_exp_free_return
;
3248 *err
= build_collating_symbol (sbcset
,
3249 #ifdef RE_ENABLE_I18N
3250 mbcset
, &coll_sym_alloc
,
3251 #endif /* RE_ENABLE_I18N */
3252 start_elem
.opr
.name
);
3253 if (BE (*err
!= REG_NOERROR
, 0))
3254 goto parse_bracket_exp_free_return
;
3257 *err
= build_charclass (regexp
->trans
, sbcset
,
3258 #ifdef RE_ENABLE_I18N
3259 mbcset
, &char_class_alloc
,
3260 #endif /* RE_ENABLE_I18N */
3261 start_elem
.opr
.name
, syntax
);
3262 if (BE (*err
!= REG_NOERROR
, 0))
3263 goto parse_bracket_exp_free_return
;
3270 if (BE (token
->type
== END_OF_RE
, 0))
3273 goto parse_bracket_exp_free_return
;
3275 if (token
->type
== OP_CLOSE_BRACKET
)
3279 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3281 /* If it is non-matching list. */
3283 bitset_not (sbcset
);
3285 #ifdef RE_ENABLE_I18N
3286 /* Ensure only single byte characters are set. */
3287 if (dfa
->mb_cur_max
> 1)
3288 bitset_mask (sbcset
, dfa
->sb_char
);
3290 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3291 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3292 || mbcset
->non_match
)))
3294 bin_tree_t
*mbc_tree
;
3296 /* Build a tree for complex bracket. */
3297 dfa
->has_mb_node
= 1;
3298 br_token
.type
= COMPLEX_BRACKET
;
3299 br_token
.opr
.mbcset
= mbcset
;
3300 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3301 if (BE (mbc_tree
== NULL
, 0))
3302 goto parse_bracket_exp_espace
;
3303 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3304 if (sbcset
[sbc_idx
])
3306 /* If there are no bits set in sbcset, there is no point
3307 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3308 if (sbc_idx
< BITSET_UINTS
)
3310 /* Build a tree for simple bracket. */
3311 br_token
.type
= SIMPLE_BRACKET
;
3312 br_token
.opr
.sbcset
= sbcset
;
3313 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3314 if (BE (work_tree
== NULL
, 0))
3315 goto parse_bracket_exp_espace
;
3317 /* Then join them by ALT node. */
3318 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3319 if (BE (work_tree
== NULL
, 0))
3320 goto parse_bracket_exp_espace
;
3325 work_tree
= mbc_tree
;
3329 #endif /* not RE_ENABLE_I18N */
3331 #ifdef RE_ENABLE_I18N
3332 free_charset (mbcset
);
3334 /* Build a tree for simple bracket. */
3335 br_token
.type
= SIMPLE_BRACKET
;
3336 br_token
.opr
.sbcset
= sbcset
;
3337 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3338 if (BE (work_tree
== NULL
, 0))
3339 goto parse_bracket_exp_espace
;
3343 parse_bracket_exp_espace
:
3345 parse_bracket_exp_free_return
:
3347 #ifdef RE_ENABLE_I18N
3348 free_charset (mbcset
);
3349 #endif /* RE_ENABLE_I18N */
3353 /* Parse an element in the bracket expression. */
3355 static reg_errcode_t
3356 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3358 bracket_elem_t
*elem
;
3359 re_string_t
*regexp
;
3363 reg_syntax_t syntax
;
3366 #ifdef RE_ENABLE_I18N
3368 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3369 if (cur_char_size
> 1)
3371 elem
->type
= MB_CHAR
;
3372 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3373 re_string_skip_bytes (regexp
, cur_char_size
);
3376 #endif /* RE_ENABLE_I18N */
3377 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3378 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3379 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3380 return parse_bracket_symbol (elem
, regexp
, token
);
3381 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3383 /* A '-' must only appear as anything but a range indicator before
3384 the closing bracket. Everything else is an error. */
3386 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3387 if (token2
.type
!= OP_CLOSE_BRACKET
)
3388 /* The actual error value is not standardized since this whole
3389 case is undefined. But ERANGE makes good sense. */
3392 elem
->type
= SB_CHAR
;
3393 elem
->opr
.ch
= token
->opr
.c
;
3397 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3398 such as [:<character_class>:], [.<collating_element>.], and
3399 [=<equivalent_class>=]. */
3401 static reg_errcode_t
3402 parse_bracket_symbol (elem
, regexp
, token
)
3403 bracket_elem_t
*elem
;
3404 re_string_t
*regexp
;
3407 unsigned char ch
, delim
= token
->opr
.c
;
3409 if (re_string_eoi(regexp
))
3413 if (i
>= BRACKET_NAME_BUF_SIZE
)
3415 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3416 ch
= re_string_fetch_byte_case (regexp
);
3418 ch
= re_string_fetch_byte (regexp
);
3419 if (re_string_eoi(regexp
))
3421 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3423 elem
->opr
.name
[i
] = ch
;
3425 re_string_skip_bytes (regexp
, 1);
3426 elem
->opr
.name
[i
] = '\0';
3427 switch (token
->type
)
3429 case OP_OPEN_COLL_ELEM
:
3430 elem
->type
= COLL_SYM
;
3432 case OP_OPEN_EQUIV_CLASS
:
3433 elem
->type
= EQUIV_CLASS
;
3435 case OP_OPEN_CHAR_CLASS
:
3436 elem
->type
= CHAR_CLASS
;
3444 /* Helper function for parse_bracket_exp.
3445 Build the equivalence class which is represented by NAME.
3446 The result are written to MBCSET and SBCSET.
3447 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3448 is a pointer argument sinse we may update it. */
3450 static reg_errcode_t
3451 #ifdef RE_ENABLE_I18N
3452 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3453 re_charset_t
*mbcset
;
3454 int *equiv_class_alloc
;
3455 #else /* not RE_ENABLE_I18N */
3456 build_equiv_class (sbcset
, name
)
3457 #endif /* not RE_ENABLE_I18N */
3458 re_bitset_ptr_t sbcset
;
3459 const unsigned char *name
;
3462 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3465 const int32_t *table
, *indirect
;
3466 const unsigned char *weights
, *extra
, *cp
;
3467 unsigned char char_buf
[2];
3471 /* This #include defines a local function! */
3472 # include <locale/weight.h>
3473 /* Calculate the index for equivalence class. */
3475 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3476 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3477 _NL_COLLATE_WEIGHTMB
);
3478 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3479 _NL_COLLATE_EXTRAMB
);
3480 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3481 _NL_COLLATE_INDIRECTMB
);
3482 idx1
= findidx (&cp
);
3483 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3484 /* This isn't a valid character. */
3485 return REG_ECOLLATE
;
3487 /* Build single byte matcing table for this equivalence class. */
3488 char_buf
[1] = (unsigned char) '\0';
3489 len
= weights
[idx1
];
3490 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3494 idx2
= findidx (&cp
);
3499 /* This isn't a valid character. */
3501 if (len
== weights
[idx2
])
3504 while (cnt
<= len
&&
3505 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3509 bitset_set (sbcset
, ch
);
3512 /* Check whether the array has enough space. */
3513 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3515 /* Not enough, realloc it. */
3516 /* +1 in case of mbcset->nequiv_classes is 0. */
3517 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3518 /* Use realloc since the array is NULL if *alloc == 0. */
3519 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3521 new_equiv_class_alloc
);
3522 if (BE (new_equiv_classes
== NULL
, 0))
3524 mbcset
->equiv_classes
= new_equiv_classes
;
3525 *equiv_class_alloc
= new_equiv_class_alloc
;
3527 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3532 if (BE (strlen ((const char *) name
) != 1, 0))
3533 return REG_ECOLLATE
;
3534 bitset_set (sbcset
, *name
);
3539 /* Helper function for parse_bracket_exp.
3540 Build the character class which is represented by NAME.
3541 The result are written to MBCSET and SBCSET.
3542 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3543 is a pointer argument sinse we may update it. */
3545 static reg_errcode_t
3546 #ifdef RE_ENABLE_I18N
3547 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3548 re_charset_t
*mbcset
;
3549 int *char_class_alloc
;
3550 #else /* not RE_ENABLE_I18N */
3551 build_charclass (trans
, sbcset
, class_name
, syntax
)
3552 #endif /* not RE_ENABLE_I18N */
3553 unsigned RE_TRANSLATE_TYPE trans
;
3554 re_bitset_ptr_t sbcset
;
3555 const unsigned char *class_name
;
3556 reg_syntax_t syntax
;
3559 const char *name
= (const char *) class_name
;
3561 /* In case of REG_ICASE "upper" and "lower" match the both of
3562 upper and lower cases. */
3563 if ((syntax
& RE_ICASE
)
3564 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3567 #ifdef RE_ENABLE_I18N
3568 /* Check the space of the arrays. */
3569 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3571 /* Not enough, realloc it. */
3572 /* +1 in case of mbcset->nchar_classes is 0. */
3573 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3574 /* Use realloc since array is NULL if *alloc == 0. */
3575 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3576 new_char_class_alloc
);
3577 if (BE (new_char_classes
== NULL
, 0))
3579 mbcset
->char_classes
= new_char_classes
;
3580 *char_class_alloc
= new_char_class_alloc
;
3582 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3583 #endif /* RE_ENABLE_I18N */
3585 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3586 for (i = 0; i < SBC_MAX; ++i) \
3588 if (ctype_func (i)) \
3590 int ch = trans ? trans[i] : i; \
3591 bitset_set (sbcset, ch); \
3595 if (strcmp (name
, "alnum") == 0)
3596 BUILD_CHARCLASS_LOOP (isalnum
)
3597 else if (strcmp (name
, "cntrl") == 0)
3598 BUILD_CHARCLASS_LOOP (iscntrl
)
3599 else if (strcmp (name
, "lower") == 0)
3600 BUILD_CHARCLASS_LOOP (islower
)
3601 else if (strcmp (name
, "space") == 0)
3602 BUILD_CHARCLASS_LOOP (isspace
)
3603 else if (strcmp (name
, "alpha") == 0)
3604 BUILD_CHARCLASS_LOOP (isalpha
)
3605 else if (strcmp (name
, "digit") == 0)
3606 BUILD_CHARCLASS_LOOP (isdigit
)
3607 else if (strcmp (name
, "print") == 0)
3608 BUILD_CHARCLASS_LOOP (isprint
)
3609 else if (strcmp (name
, "upper") == 0)
3610 BUILD_CHARCLASS_LOOP (isupper
)
3611 else if (strcmp (name
, "blank") == 0)
3612 BUILD_CHARCLASS_LOOP (isblank
)
3613 else if (strcmp (name
, "graph") == 0)
3614 BUILD_CHARCLASS_LOOP (isgraph
)
3615 else if (strcmp (name
, "punct") == 0)
3616 BUILD_CHARCLASS_LOOP (ispunct
)
3617 else if (strcmp (name
, "xdigit") == 0)
3618 BUILD_CHARCLASS_LOOP (isxdigit
)
3626 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3628 unsigned RE_TRANSLATE_TYPE trans
;
3629 const unsigned char *class_name
;
3630 const unsigned char *extra
;
3634 re_bitset_ptr_t sbcset
;
3635 #ifdef RE_ENABLE_I18N
3636 re_charset_t
*mbcset
;
3638 #endif /* not RE_ENABLE_I18N */
3640 re_token_t br_token
;
3643 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3644 #ifdef RE_ENABLE_I18N
3645 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3646 #endif /* RE_ENABLE_I18N */
3648 #ifdef RE_ENABLE_I18N
3649 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3650 #else /* not RE_ENABLE_I18N */
3651 if (BE (sbcset
== NULL
, 0))
3652 #endif /* not RE_ENABLE_I18N */
3660 #ifdef RE_ENABLE_I18N
3662 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3663 bitset_set(cset->sbcset, '\0');
3665 mbcset
->non_match
= 1;
3666 #endif /* not RE_ENABLE_I18N */
3669 /* We don't care the syntax in this case. */
3670 ret
= build_charclass (trans
, sbcset
,
3671 #ifdef RE_ENABLE_I18N
3673 #endif /* RE_ENABLE_I18N */
3676 if (BE (ret
!= REG_NOERROR
, 0))
3679 #ifdef RE_ENABLE_I18N
3680 free_charset (mbcset
);
3681 #endif /* RE_ENABLE_I18N */
3685 /* \w match '_' also. */
3686 for (; *extra
; extra
++)
3687 bitset_set (sbcset
, *extra
);
3689 /* If it is non-matching list. */
3691 bitset_not (sbcset
);
3693 #ifdef RE_ENABLE_I18N
3694 /* Ensure only single byte characters are set. */
3695 if (dfa
->mb_cur_max
> 1)
3696 bitset_mask (sbcset
, dfa
->sb_char
);
3699 /* Build a tree for simple bracket. */
3700 br_token
.type
= SIMPLE_BRACKET
;
3701 br_token
.opr
.sbcset
= sbcset
;
3702 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3703 if (BE (tree
== NULL
, 0))
3704 goto build_word_op_espace
;
3706 #ifdef RE_ENABLE_I18N
3707 if (dfa
->mb_cur_max
> 1)
3709 bin_tree_t
*mbc_tree
;
3710 /* Build a tree for complex bracket. */
3711 br_token
.type
= COMPLEX_BRACKET
;
3712 br_token
.opr
.mbcset
= mbcset
;
3713 dfa
->has_mb_node
= 1;
3714 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3715 if (BE (mbc_tree
== NULL
, 0))
3716 goto build_word_op_espace
;
3717 /* Then join them by ALT node. */
3718 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3719 if (BE (mbc_tree
!= NULL
, 1))
3724 free_charset (mbcset
);
3727 #else /* not RE_ENABLE_I18N */
3729 #endif /* not RE_ENABLE_I18N */
3731 build_word_op_espace
:
3733 #ifdef RE_ENABLE_I18N
3734 free_charset (mbcset
);
3735 #endif /* RE_ENABLE_I18N */
3740 /* This is intended for the expressions like "a{1,3}".
3741 Fetch a number from `input', and return the number.
3742 Return -1, if the number field is empty like "{,1}".
3743 Return -2, If an error is occured. */
3746 fetch_number (input
, token
, syntax
)
3749 reg_syntax_t syntax
;
3755 fetch_token (token
, input
, syntax
);
3757 if (BE (token
->type
== END_OF_RE
, 0))
3759 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3761 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3762 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3763 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3768 #ifdef RE_ENABLE_I18N
3770 free_charset (re_charset_t
*cset
)
3772 re_free (cset
->mbchars
);
3774 re_free (cset
->coll_syms
);
3775 re_free (cset
->equiv_classes
);
3776 re_free (cset
->range_starts
);
3777 re_free (cset
->range_ends
);
3779 re_free (cset
->char_classes
);
3782 #endif /* RE_ENABLE_I18N */
3784 /* Functions for binary tree operation. */
3786 /* Create a tree node. */
3789 create_tree (dfa
, left
, right
, type
)
3793 re_token_type_t type
;
3797 return create_token_tree (dfa
, left
, right
, &t
);
3801 create_token_tree (dfa
, left
, right
, token
)
3805 const re_token_t
*token
;
3808 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3810 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3812 if (storage
== NULL
)
3814 storage
->next
= dfa
->str_tree_storage
;
3815 dfa
->str_tree_storage
= storage
;
3816 dfa
->str_tree_storage_idx
= 0;
3818 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3820 tree
->parent
= NULL
;
3822 tree
->right
= right
;
3823 tree
->token
= *token
;
3824 tree
->token
.duplicated
= 0;
3825 tree
->token
.opt_subexp
= 0;
3828 tree
->node_idx
= -1;
3831 left
->parent
= tree
;
3833 right
->parent
= tree
;
3837 /* Mark the tree SRC as an optional subexpression.
3838 To be called from preorder or postorder. */
3840 static reg_errcode_t
3841 mark_opt_subexp (extra
, node
)
3845 int idx
= (int) (long) extra
;
3846 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3847 node
->token
.opt_subexp
= 1;
3852 /* Free the allocated memory inside NODE. */
3855 free_token (re_token_t
*node
)
3857 #ifdef RE_ENABLE_I18N
3858 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3859 free_charset (node
->opr
.mbcset
);
3861 #endif /* RE_ENABLE_I18N */
3862 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3863 re_free (node
->opr
.sbcset
);
3866 /* Worker function for tree walking. Free the allocated memory inside NODE
3867 and its children. */
3869 static reg_errcode_t
3870 free_tree (void *extra
, bin_tree_t
*node
)
3872 free_token (&node
->token
);
3877 /* Duplicate the node SRC, and return new node. This is a preorder
3878 visit similar to the one implemented by the generic visitor, but
3879 we need more infrastructure to maintain two parallel trees --- so,
3880 it's easier to duplicate. */
3883 duplicate_tree (root
, dfa
)
3884 const bin_tree_t
*root
;
3887 const bin_tree_t
*node
;
3888 bin_tree_t
*dup_root
;
3889 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3891 for (node
= root
; ; )
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3897 (*p_new
)->parent
= dup_node
;
3898 (*p_new
)->token
.duplicated
= 1;
3901 /* Go to the left node, or up and to the right. */
3905 p_new
= &dup_node
->left
;
3909 const bin_tree_t
*prev
= NULL
;
3910 while (node
->right
== prev
|| node
->right
== NULL
)
3913 node
= node
->parent
;
3914 dup_node
= dup_node
->parent
;
3919 p_new
= &dup_node
->right
;