1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
37 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
38 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
39 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
40 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
41 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
42 int top_clone_node
, int root_node
,
43 unsigned int constraint
);
44 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
45 unsigned int constraint
);
46 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
47 unsigned int constraint
);
48 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
49 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
51 static void calc_inveclosure (re_dfa_t
*dfa
);
52 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
54 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
56 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
92 re_charset_t
*mbcset
, int *range_alloc
,
93 bracket_elem_t
*start_elem
,
94 bracket_elem_t
*end_elem
);
95 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
98 const unsigned char *name
);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 const unsigned char *name
);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
109 re_charset_t
*mbcset
,
110 int *equiv_class_alloc
,
111 const unsigned char *name
);
112 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
113 re_bitset_ptr_t sbcset
,
114 re_charset_t
*mbcset
,
115 int *char_class_alloc
,
116 const unsigned char *class_name
,
117 reg_syntax_t syntax
);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
120 const unsigned char *name
);
121 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
122 re_bitset_ptr_t sbcset
,
123 const unsigned char *class_name
,
124 reg_syntax_t syntax
);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
127 unsigned RE_TRANSLATE_TYPE trans
,
128 const unsigned char *class_name
,
129 const unsigned char *extra
,
130 int non_match
, reg_errcode_t
*err
);
131 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
132 bin_tree_t
*left
, bin_tree_t
*right
,
133 re_token_type_t type
, int index
);
134 static bin_tree_t
*re_dfa_add_tree_node (re_dfa_t
*dfa
,
135 bin_tree_t
*left
, bin_tree_t
*right
,
136 const re_token_t
*token
)
137 __attribute ((noinline
));
138 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
139 static void mark_opt_subexp (const bin_tree_t
*src
, re_dfa_t
*dfa
);
140 static void mark_opt_subexp_iter (const bin_tree_t
*src
, re_dfa_t
*dfa
, int idx
);
142 /* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
147 const char __re_error_msgid
[] attribute_hidden
=
149 #define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
155 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
161 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
170 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
173 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
176 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
179 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
182 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
185 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
188 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
191 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
194 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
201 const size_t __re_error_msgid_idx
[] attribute_hidden
=
222 /* Entry points for GNU code. */
224 /* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
232 re_compile_pattern (pattern
, length
, bufp
)
235 struct re_pattern_buffer
*bufp
;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
244 /* Match anchors at newline. */
245 bufp
->newline_anchor
= 1;
247 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
251 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
254 weak_alias (__re_compile_pattern
, re_compile_pattern
)
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options
;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
273 re_set_syntax (syntax
)
276 reg_syntax_t ret
= re_syntax_options
;
278 re_syntax_options
= syntax
;
282 weak_alias (__re_set_syntax
, re_set_syntax
)
286 re_compile_fastmap (bufp
)
287 struct re_pattern_buffer
*bufp
;
289 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
290 char *fastmap
= bufp
->fastmap
;
292 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
293 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
294 if (dfa
->init_state
!= dfa
->init_state_word
)
295 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
296 if (dfa
->init_state
!= dfa
->init_state_nl
)
297 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
298 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
299 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
300 bufp
->fastmap_accurate
= 1;
304 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
308 __attribute ((always_inline
))
309 re_set_fastmap (char *fastmap
, int icase
, int ch
)
313 fastmap
[tolower (ch
)] = 1;
316 /* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
320 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
322 const re_dfastate_t
*init_state
;
325 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
327 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
328 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
330 int node
= init_state
->nodes
.elems
[node_cnt
];
331 re_token_type_t type
= dfa
->nodes
[node
].type
;
333 if (type
== CHARACTER
)
335 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
339 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
344 *p
++ = dfa
->nodes
[node
].opr
.c
;
345 while (++node
< dfa
->nodes_len
346 && dfa
->nodes
[node
].type
== CHARACTER
347 && dfa
->nodes
[node
].mb_partial
)
348 *p
++ = dfa
->nodes
[node
].opr
.c
;
349 memset (&state
, 0, sizeof (state
));
350 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
352 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
353 re_set_fastmap (fastmap
, 0, buf
[0]);
357 else if (type
== SIMPLE_BRACKET
)
360 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
361 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
362 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
363 re_set_fastmap (fastmap
, icase
, ch
);
365 #ifdef RE_ENABLE_I18N
366 else if (type
== COMPLEX_BRACKET
)
369 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
370 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
371 || cset
->nranges
|| cset
->nchar_classes
)
374 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
383 const int32_t *table
= (const int32_t *)
384 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
385 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
386 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
388 re_set_fastmap (fastmap
, icase
, ch
);
391 if (dfa
->mb_cur_max
> 1)
392 for (i
= 0; i
< SBC_MAX
; ++i
)
393 if (__btowc (i
) == WEOF
)
394 re_set_fastmap (fastmap
, icase
, i
);
395 # endif /* not _LIBC */
397 for (i
= 0; i
< cset
->nmbchars
; ++i
)
401 memset (&state
, '\0', sizeof (state
));
402 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
403 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
404 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
406 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
407 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
411 #endif /* RE_ENABLE_I18N */
412 else if (type
== OP_PERIOD
413 #ifdef RE_ENABLE_I18N
414 || type
== OP_UTF8_PERIOD
415 #endif /* RE_ENABLE_I18N */
416 || type
== END_OF_RE
)
418 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
419 if (type
== END_OF_RE
)
420 bufp
->can_be_null
= 1;
426 /* Entry point for POSIX code. */
427 /* regcomp takes a regular expression as a string and compiles it.
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
442 PATTERN is the address of the pattern string.
444 CFLAGS is a series of bits which affect compilation.
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
463 regcomp (preg
, pattern
, cflags
)
464 regex_t
*__restrict preg
;
465 const char *__restrict pattern
;
469 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC
);
476 /* Try to allocate space for the fastmap. */
477 preg
->fastmap
= re_malloc (char, SBC_MAX
);
478 if (BE (preg
->fastmap
== NULL
, 0))
481 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags
& REG_NEWLINE
)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax
&= ~RE_DOT_NEWLINE
;
487 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
488 /* It also changes the matching behavior. */
489 preg
->newline_anchor
= 1;
492 preg
->newline_anchor
= 0;
493 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
494 preg
->translate
= NULL
;
496 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret
== REG_ERPAREN
)
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret
== REG_NOERROR
, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg
);
510 /* Some error occurred while compiling the expression. */
511 re_free (preg
->fastmap
);
512 preg
->fastmap
= NULL
;
518 weak_alias (__regcomp
, regcomp
)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (errcode
, preg
, errbuf
, errbuf_size
)
535 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
536 / sizeof (__re_error_msgid_idx
[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
545 msg_size
= strlen (msg
) + 1; /* Includes the null. */
547 if (BE (errbuf_size
!= 0, 1))
549 if (BE (msg_size
> errbuf_size
, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
554 memcpy (errbuf
, msg
, errbuf_size
- 1);
555 errbuf
[errbuf_size
- 1] = 0;
559 memcpy (errbuf
, msg
, msg_size
);
565 weak_alias (__regerror
, regerror
)
570 free_dfa_content (re_dfa_t
*dfa
)
574 re_free (dfa
->subexps
);
577 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
579 re_token_t
*node
= dfa
->nodes
+ i
;
580 #ifdef RE_ENABLE_I18N
581 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
582 free_charset (node
->opr
.mbcset
);
584 #endif /* RE_ENABLE_I18N */
585 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
586 re_free (node
->opr
.sbcset
);
588 re_free (dfa
->nexts
);
589 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
591 if (dfa
->eclosures
!= NULL
)
592 re_node_set_free (dfa
->eclosures
+ i
);
593 if (dfa
->inveclosures
!= NULL
)
594 re_node_set_free (dfa
->inveclosures
+ i
);
595 if (dfa
->edests
!= NULL
)
596 re_node_set_free (dfa
->edests
+ i
);
598 re_free (dfa
->edests
);
599 re_free (dfa
->eclosures
);
600 re_free (dfa
->inveclosures
);
601 re_free (dfa
->nodes
);
603 if (dfa
->state_table
)
604 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
606 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
607 for (j
= 0; j
< entry
->num
; ++j
)
609 re_dfastate_t
*state
= entry
->array
[j
];
612 re_free (entry
->array
);
614 re_free (dfa
->state_table
);
615 #ifdef RE_ENABLE_I18N
616 re_free (dfa
->sb_char
);
619 re_free (dfa
->re_str
);
626 /* Free dynamically allocated space used by PREG. */
632 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
633 if (BE (dfa
!= NULL
, 1))
634 free_dfa_content (dfa
);
638 re_free (preg
->fastmap
);
639 preg
->fastmap
= NULL
;
641 re_free (preg
->translate
);
642 preg
->translate
= NULL
;
645 weak_alias (__regfree
, regfree
)
648 /* Entry points compatible with 4.2 BSD regex library. We don't define
649 them unless specifically requested. */
651 #if defined _REGEX_RE_COMP || defined _LIBC
653 /* BSD has one and only one pattern buffer. */
654 static struct re_pattern_buffer re_comp_buf
;
658 /* Make these definitions weak in libc, so POSIX programs can redefine
659 these names if they don't use our functions, and still use
660 regcomp/regexec above without link errors. */
671 if (!re_comp_buf
.buffer
)
672 return gettext ("No previous regular expression");
676 if (re_comp_buf
.buffer
)
678 fastmap
= re_comp_buf
.fastmap
;
679 re_comp_buf
.fastmap
= NULL
;
680 __regfree (&re_comp_buf
);
681 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
682 re_comp_buf
.fastmap
= fastmap
;
685 if (re_comp_buf
.fastmap
== NULL
)
687 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
688 if (re_comp_buf
.fastmap
== NULL
)
689 return (char *) gettext (__re_error_msgid
690 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
693 /* Since `re_exec' always passes NULL for the `regs' argument, we
694 don't need to initialize the pattern buffer fields which affect it. */
696 /* Match anchors at newlines. */
697 re_comp_buf
.newline_anchor
= 1;
699 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
704 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
705 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
709 libc_freeres_fn (free_mem
)
711 __regfree (&re_comp_buf
);
715 #endif /* _REGEX_RE_COMP */
717 /* Internal entry point.
718 Compile the regular expression PATTERN, whose length is LENGTH.
719 SYNTAX indicate regular expression's syntax. */
722 re_compile_internal (preg
, pattern
, length
, syntax
)
724 const char * pattern
;
728 reg_errcode_t err
= REG_NOERROR
;
732 /* Initialize the pattern buffer. */
733 preg
->fastmap_accurate
= 0;
734 preg
->syntax
= syntax
;
735 preg
->not_bol
= preg
->not_eol
= 0;
738 preg
->can_be_null
= 0;
739 preg
->regs_allocated
= REGS_UNALLOCATED
;
741 /* Initialize the dfa. */
742 dfa
= (re_dfa_t
*) preg
->buffer
;
743 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
752 preg
->allocated
= sizeof (re_dfa_t
);
753 preg
->buffer
= (unsigned char *) dfa
;
755 preg
->used
= sizeof (re_dfa_t
);
757 err
= init_dfa (dfa
, length
);
758 if (BE (err
!= REG_NOERROR
, 0))
760 free_dfa_content (dfa
);
766 dfa
->re_str
= re_malloc (char, length
+ 1);
767 strncpy (dfa
->re_str
, pattern
, length
+ 1);
770 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
771 syntax
& RE_ICASE
, dfa
);
772 if (BE (err
!= REG_NOERROR
, 0))
774 re_compile_internal_free_return
:
775 free_workarea_compile (preg
);
776 re_string_destruct (®exp
);
777 free_dfa_content (dfa
);
783 /* Parse the regular expression, and build a structure tree. */
785 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
786 if (BE (dfa
->str_tree
== NULL
, 0))
787 goto re_compile_internal_free_return
;
789 #ifdef RE_ENABLE_I18N
790 /* If possible, do searching in single byte encoding to speed things up. */
791 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
795 /* Analyze the tree and collect information which is necessary to
798 if (BE (err
!= REG_NOERROR
, 0))
799 goto re_compile_internal_free_return
;
801 /* Then create the initial state of the dfa. */
802 err
= create_initial_state (dfa
);
804 /* Release work areas. */
805 free_workarea_compile (preg
);
806 re_string_destruct (®exp
);
808 if (BE (err
!= REG_NOERROR
, 0))
810 free_dfa_content (dfa
);
818 /* Initialize DFA. We use the length of the regular expression PAT_LEN
819 as the initial length of some arrays. */
822 init_dfa (dfa
, pat_len
)
828 memset (dfa
, '\0', sizeof (re_dfa_t
));
830 /* Force allocation of str_tree_storage the first time. */
831 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
833 dfa
->nodes_alloc
= pat_len
+ 1;
834 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
836 dfa
->states_alloc
= pat_len
+ 1;
838 /* table_size = 2 ^ ceil(log pat_len) */
839 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
840 if (table_size
> pat_len
)
843 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
844 dfa
->state_hash_mask
= table_size
- 1;
846 dfa
->subexps_alloc
= 1;
847 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
849 dfa
->mb_cur_max
= MB_CUR_MAX
;
851 if (dfa
->mb_cur_max
== 6
852 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
854 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
857 #ifdef RE_ENABLE_I18N
858 if (dfa
->mb_cur_max
> 1)
862 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
863 if (BE (dfa
->sb_char
== NULL
, 0))
866 memset (dfa
->sb_char
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
868 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
869 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
870 if (__btowc (ch
) != WEOF
)
871 dfa
->sb_char
[i
] |= 1 << j
;
875 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
876 || dfa
->subexps
== NULL
, 0))
881 /* Initialize WORD_CHAR table, which indicate which character is
882 "word". In this case "word" means that it is the word construction
883 character used by some operators like "\<", "\>", etc. */
890 dfa
->word_ops_used
= 1;
891 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
892 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
893 if (isalnum (ch
) || ch
== '_')
894 dfa
->word_char
[i
] |= 1 << j
;
897 /* Free the work area which are only used while compiling. */
900 free_workarea_compile (preg
)
903 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
904 bin_tree_storage_t
*storage
, *next
;
905 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
907 next
= storage
->next
;
910 dfa
->str_tree_storage
= NULL
;
911 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
912 dfa
->str_tree
= NULL
;
913 re_free (dfa
->org_indices
);
914 dfa
->org_indices
= NULL
;
917 /* Create initial states for all contexts. */
920 create_initial_state (dfa
)
925 re_node_set init_nodes
;
927 /* Initial states have the epsilon closure of the node which is
928 the first node of the regular expression. */
929 first
= dfa
->str_tree
->first
;
930 dfa
->init_node
= first
;
931 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
932 if (BE (err
!= REG_NOERROR
, 0))
935 /* The back-references which are in initial states can epsilon transit,
936 since in this case all of the subexpressions can be null.
937 Then we add epsilon closures of the nodes which are the next nodes of
938 the back-references. */
939 if (dfa
->nbackref
> 0)
940 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
942 int node_idx
= init_nodes
.elems
[i
];
943 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
946 if (type
!= OP_BACK_REF
)
948 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
950 re_token_t
*clexp_node
;
951 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
952 if (clexp_node
->type
== OP_CLOSE_SUBEXP
953 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
956 if (clexp_idx
== init_nodes
.nelem
)
959 if (type
== OP_BACK_REF
)
961 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
962 if (!re_node_set_contains (&init_nodes
, dest_idx
))
964 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
970 /* It must be the first time to invoke acquire_state. */
971 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
972 /* We don't check ERR here, since the initial state must not be NULL. */
973 if (BE (dfa
->init_state
== NULL
, 0))
975 if (dfa
->init_state
->has_constraint
)
977 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
979 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
981 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
985 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
986 || dfa
->init_state_begbuf
== NULL
, 0))
990 dfa
->init_state_word
= dfa
->init_state_nl
991 = dfa
->init_state_begbuf
= dfa
->init_state
;
993 re_node_set_free (&init_nodes
);
997 #ifdef RE_ENABLE_I18N
998 /* If it is possible to do searching in single byte encoding instead of UTF-8
999 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1000 DFA nodes where needed. */
1006 int node
, i
, mb_chars
= 0, has_period
= 0;
1008 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1009 switch (dfa
->nodes
[node
].type
)
1012 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1016 switch (dfa
->nodes
[node
].opr
.idx
)
1024 /* Word anchors etc. cannot be handled. */
1034 case OP_DUP_ASTERISK
:
1035 case OP_DUP_QUESTION
:
1036 case OP_OPEN_SUBEXP
:
1037 case OP_CLOSE_SUBEXP
:
1039 case SIMPLE_BRACKET
:
1040 /* Just double check. */
1041 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1042 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1049 if (mb_chars
|| has_period
)
1050 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1052 if (dfa
->nodes
[node
].type
== CHARACTER
1053 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1054 dfa
->nodes
[node
].mb_partial
= 0;
1055 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1056 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1059 /* The search can be in single byte locale. */
1060 dfa
->mb_cur_max
= 1;
1062 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1066 /* Analyze the structure tree, and calculate "first", "next", "edest",
1067 "eclosure", and "inveclosure". */
1069 static reg_errcode_t
1076 /* Allocate arrays. */
1077 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1078 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1079 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1080 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1081 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1082 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1083 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1085 /* Initialize them. */
1086 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1089 re_node_set_init_empty (dfa
->edests
+ i
);
1090 re_node_set_init_empty (dfa
->eclosures
+ i
);
1091 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1094 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1095 if (BE (ret
== REG_NOERROR
, 1))
1097 ret
= calc_eclosure (dfa
);
1098 if (ret
== REG_NOERROR
)
1099 calc_inveclosure (dfa
);
1104 /* Helper functions for analyze.
1105 This function calculate "first", "next", and "edest" for the subtree
1106 whose root is NODE. */
1108 static reg_errcode_t
1109 analyze_tree (dfa
, node
)
1114 if (node
->first
== -1)
1115 calc_first (dfa
, node
);
1116 if (node
->next
== -1)
1117 calc_next (dfa
, node
);
1118 if (node
->eclosure
.nelem
== 0)
1119 calc_epsdest (dfa
, node
);
1120 /* Calculate "first" etc. for the left child. */
1121 if (node
->left
!= NULL
)
1123 ret
= analyze_tree (dfa
, node
->left
);
1124 if (BE (ret
!= REG_NOERROR
, 0))
1127 /* Calculate "first" etc. for the right child. */
1128 if (node
->right
!= NULL
)
1130 ret
= analyze_tree (dfa
, node
->right
);
1131 if (BE (ret
!= REG_NOERROR
, 0))
1137 /* Calculate "first" for the node NODE. */
1139 calc_first (dfa
, node
)
1144 idx
= node
->node_idx
;
1145 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1150 case OP_OPEN_BRACKET
:
1151 case OP_CLOSE_BRACKET
:
1152 case OP_OPEN_DUP_NUM
:
1153 case OP_CLOSE_DUP_NUM
:
1155 case OP_NON_MATCH_LIST
:
1156 case OP_OPEN_COLL_ELEM
:
1157 case OP_CLOSE_COLL_ELEM
:
1158 case OP_OPEN_EQUIV_CLASS
:
1159 case OP_CLOSE_EQUIV_CLASS
:
1160 case OP_OPEN_CHAR_CLASS
:
1161 case OP_CLOSE_CHAR_CLASS
:
1162 /* These must not appear here. */
1168 case OP_DUP_ASTERISK
:
1169 case OP_DUP_QUESTION
:
1170 #ifdef RE_ENABLE_I18N
1171 case OP_UTF8_PERIOD
:
1172 case COMPLEX_BRACKET
:
1173 #endif /* RE_ENABLE_I18N */
1174 case SIMPLE_BRACKET
:
1177 case OP_OPEN_SUBEXP
:
1178 case OP_CLOSE_SUBEXP
:
1184 /* else fall through */
1187 assert (node
->left
!= NULL
);
1189 if (node
->left
->first
== -1)
1190 calc_first (dfa
, node
->left
);
1191 node
->first
= node
->left
->first
;
1196 /* Calculate "next" for the node NODE. */
1199 calc_next (dfa
, node
)
1204 bin_tree_t
*parent
= node
->parent
;
1208 idx
= node
->node_idx
;
1209 if (node
->type
== 0)
1210 dfa
->nexts
[idx
] = node
->next
;
1214 idx
= parent
->node_idx
;
1215 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1219 case OP_DUP_ASTERISK
:
1223 if (parent
->left
== node
)
1225 if (parent
->right
->first
== -1)
1226 calc_first (dfa
, parent
->right
);
1227 node
->next
= parent
->right
->first
;
1230 /* else fall through */
1232 if (parent
->next
== -1)
1233 calc_next (dfa
, parent
);
1234 node
->next
= parent
->next
;
1237 idx
= node
->node_idx
;
1238 if (node
->type
== 0)
1239 dfa
->nexts
[idx
] = node
->next
;
1242 /* Calculate "edest" for the node NODE. */
1245 calc_epsdest (dfa
, node
)
1250 idx
= node
->node_idx
;
1251 if (node
->type
== 0)
1253 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1254 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1256 if (node
->left
->first
== -1)
1257 calc_first (dfa
, node
->left
);
1258 if (node
->next
== -1)
1259 calc_next (dfa
, node
);
1260 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1263 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1266 if (node
->left
!= NULL
)
1268 if (node
->left
->first
== -1)
1269 calc_first (dfa
, node
->left
);
1270 left
= node
->left
->first
;
1274 if (node
->next
== -1)
1275 calc_next (dfa
, node
);
1278 if (node
->right
!= NULL
)
1280 if (node
->right
->first
== -1)
1281 calc_first (dfa
, node
->right
);
1282 right
= node
->right
->first
;
1286 if (node
->next
== -1)
1287 calc_next (dfa
, node
);
1290 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1292 else if (dfa
->nodes
[idx
].type
== ANCHOR
1293 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1294 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1295 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1296 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1298 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1302 /* Duplicate the epsilon closure of the node ROOT_NODE.
1303 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1304 to their own constraint. */
1306 static reg_errcode_t
1307 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1310 int top_org_node
, top_clone_node
, root_node
;
1311 unsigned int init_constraint
;
1314 int org_node
, clone_node
, ret
;
1315 unsigned int constraint
= init_constraint
;
1316 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1318 int org_dest
, clone_dest
;
1319 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1321 /* If the back reference epsilon-transit, its destination must
1322 also have the constraint. Then duplicate the epsilon closure
1323 of the destination of the back reference, and store it in
1324 edests of the back reference. */
1325 org_dest
= dfa
->nexts
[org_node
];
1326 re_node_set_empty (dfa
->edests
+ clone_node
);
1327 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1328 if (BE (err
!= REG_NOERROR
, 0))
1330 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1331 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1332 if (BE (ret
< 0, 0))
1335 else if (dfa
->edests
[org_node
].nelem
== 0)
1337 /* In case of the node can't epsilon-transit, don't duplicate the
1338 destination and store the original destination as the
1339 destination of the node. */
1340 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1343 else if (dfa
->edests
[org_node
].nelem
== 1)
1345 /* In case of the node can epsilon-transit, and it has only one
1347 org_dest
= dfa
->edests
[org_node
].elems
[0];
1348 re_node_set_empty (dfa
->edests
+ clone_node
);
1349 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1351 /* In case of the node has another constraint, append it. */
1352 if (org_node
== root_node
&& clone_node
!= org_node
)
1354 /* ...but if the node is root_node itself, it means the
1355 epsilon closure have a loop, then tie it to the
1356 destination of the root_node. */
1357 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1359 if (BE (ret
< 0, 0))
1363 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1365 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1366 if (BE (err
!= REG_NOERROR
, 0))
1368 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1369 if (BE (ret
< 0, 0))
1372 else /* dfa->edests[org_node].nelem == 2 */
1374 /* In case of the node can epsilon-transit, and it has two
1375 destinations. E.g. '|', '*', '+', '?'. */
1376 org_dest
= dfa
->edests
[org_node
].elems
[0];
1377 re_node_set_empty (dfa
->edests
+ clone_node
);
1378 /* Search for a duplicated node which satisfies the constraint. */
1379 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1380 if (clone_dest
== -1)
1382 /* There are no such a duplicated node, create a new one. */
1383 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1384 if (BE (err
!= REG_NOERROR
, 0))
1386 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1387 if (BE (ret
< 0, 0))
1389 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1390 root_node
, constraint
);
1391 if (BE (err
!= REG_NOERROR
, 0))
1396 /* There are a duplicated node which satisfy the constraint,
1397 use it to avoid infinite loop. */
1398 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1399 if (BE (ret
< 0, 0))
1403 org_dest
= dfa
->edests
[org_node
].elems
[1];
1404 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1405 if (BE (err
!= REG_NOERROR
, 0))
1407 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1408 if (BE (ret
< 0, 0))
1411 org_node
= org_dest
;
1412 clone_node
= clone_dest
;
1417 /* Search for a node which is duplicated from the node ORG_NODE, and
1418 satisfies the constraint CONSTRAINT. */
1421 search_duplicated_node (dfa
, org_node
, constraint
)
1424 unsigned int constraint
;
1427 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1429 if (org_node
== dfa
->org_indices
[idx
]
1430 && constraint
== dfa
->nodes
[idx
].constraint
)
1431 return idx
; /* Found. */
1433 return -1; /* Not found. */
1436 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1437 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1438 otherwise return the error code. */
1440 static reg_errcode_t
1441 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1443 int *new_idx
, org_idx
;
1444 unsigned int constraint
;
1446 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
], 1);
1447 if (BE (dup_idx
== -1, 0))
1449 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1450 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1451 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1452 dfa
->nodes
[dup_idx
].duplicated
= 1;
1453 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1454 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1455 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1457 /* Store the index of the original node. */
1458 dfa
->org_indices
[dup_idx
] = org_idx
;
1464 calc_inveclosure (dfa
)
1468 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1470 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1472 dest
= dfa
->eclosures
[src
].elems
[idx
];
1473 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1478 /* Calculate "eclosure" for all the node in DFA. */
1480 static reg_errcode_t
1484 int node_idx
, incomplete
;
1486 assert (dfa
->nodes_len
> 0);
1489 /* For each nodes, calculate epsilon closure. */
1490 for (node_idx
= 0; ; ++node_idx
)
1493 re_node_set eclosure_elem
;
1494 if (node_idx
== dfa
->nodes_len
)
1503 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1505 /* If we have already calculated, skip it. */
1506 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1508 /* Calculate epsilon closure of `node_idx'. */
1509 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1510 if (BE (err
!= REG_NOERROR
, 0))
1513 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1516 re_node_set_free (&eclosure_elem
);
1522 /* Calculate epsilon closure of NODE. */
1524 static reg_errcode_t
1525 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1526 re_node_set
*new_set
;
1531 unsigned int constraint
;
1533 re_node_set eclosure
;
1535 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1536 if (BE (err
!= REG_NOERROR
, 0))
1539 /* This indicates that we are calculating this node now.
1540 We reference this value to avoid infinite loop. */
1541 dfa
->eclosures
[node
].nelem
= -1;
1543 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1544 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1545 /* If the current node has constraints, duplicate all nodes.
1546 Since they must inherit the constraints. */
1547 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1549 int org_node
, cur_node
;
1550 org_node
= cur_node
= node
;
1551 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1552 if (BE (err
!= REG_NOERROR
, 0))
1556 /* Expand each epsilon destination nodes. */
1557 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1558 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1560 re_node_set eclosure_elem
;
1561 int edest
= dfa
->edests
[node
].elems
[i
];
1562 /* If calculating the epsilon closure of `edest' is in progress,
1563 return intermediate result. */
1564 if (dfa
->eclosures
[edest
].nelem
== -1)
1569 /* If we haven't calculated the epsilon closure of `edest' yet,
1570 calculate now. Otherwise use calculated epsilon closure. */
1571 if (dfa
->eclosures
[edest
].nelem
== 0)
1573 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1574 if (BE (err
!= REG_NOERROR
, 0))
1578 eclosure_elem
= dfa
->eclosures
[edest
];
1579 /* Merge the epsilon closure of `edest'. */
1580 re_node_set_merge (&eclosure
, &eclosure_elem
);
1581 /* If the epsilon closure of `edest' is incomplete,
1582 the epsilon closure of this node is also incomplete. */
1583 if (dfa
->eclosures
[edest
].nelem
== 0)
1586 re_node_set_free (&eclosure_elem
);
1590 /* Epsilon closures include itself. */
1591 re_node_set_insert (&eclosure
, node
);
1592 if (incomplete
&& !root
)
1593 dfa
->eclosures
[node
].nelem
= 0;
1595 dfa
->eclosures
[node
] = eclosure
;
1596 *new_set
= eclosure
;
1600 /* Functions for token which are used in the parser. */
1602 /* Fetch a token from INPUT.
1603 We must not use this function inside bracket expressions. */
1606 fetch_token (result
, input
, syntax
)
1609 reg_syntax_t syntax
;
1611 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1614 /* Peek a token from INPUT, and return the length of the token.
1615 We must not use this function inside bracket expressions. */
1618 peek_token (token
, input
, syntax
)
1621 reg_syntax_t syntax
;
1625 if (re_string_eoi (input
))
1627 token
->type
= END_OF_RE
;
1631 c
= re_string_peek_byte (input
, 0);
1634 token
->word_char
= 0;
1635 #ifdef RE_ENABLE_I18N
1636 token
->mb_partial
= 0;
1637 if (input
->mb_cur_max
> 1 &&
1638 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1640 token
->type
= CHARACTER
;
1641 token
->mb_partial
= 1;
1648 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1650 token
->type
= BACK_SLASH
;
1654 c2
= re_string_peek_byte_case (input
, 1);
1656 token
->type
= CHARACTER
;
1657 #ifdef RE_ENABLE_I18N
1658 if (input
->mb_cur_max
> 1)
1660 wint_t wc
= re_string_wchar_at (input
,
1661 re_string_cur_idx (input
) + 1);
1662 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1666 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1671 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1672 token
->type
= OP_ALT
;
1674 case '1': case '2': case '3': case '4': case '5':
1675 case '6': case '7': case '8': case '9':
1676 if (!(syntax
& RE_NO_BK_REFS
))
1678 token
->type
= OP_BACK_REF
;
1679 token
->opr
.idx
= c2
- '0';
1683 if (!(syntax
& RE_NO_GNU_OPS
))
1685 token
->type
= ANCHOR
;
1686 token
->opr
.ctx_type
= WORD_FIRST
;
1690 if (!(syntax
& RE_NO_GNU_OPS
))
1692 token
->type
= ANCHOR
;
1693 token
->opr
.ctx_type
= WORD_LAST
;
1697 if (!(syntax
& RE_NO_GNU_OPS
))
1699 token
->type
= ANCHOR
;
1700 token
->opr
.ctx_type
= WORD_DELIM
;
1704 if (!(syntax
& RE_NO_GNU_OPS
))
1706 token
->type
= ANCHOR
;
1707 token
->opr
.ctx_type
= INSIDE_WORD
;
1711 if (!(syntax
& RE_NO_GNU_OPS
))
1712 token
->type
= OP_WORD
;
1715 if (!(syntax
& RE_NO_GNU_OPS
))
1716 token
->type
= OP_NOTWORD
;
1719 if (!(syntax
& RE_NO_GNU_OPS
))
1720 token
->type
= OP_SPACE
;
1723 if (!(syntax
& RE_NO_GNU_OPS
))
1724 token
->type
= OP_NOTSPACE
;
1727 if (!(syntax
& RE_NO_GNU_OPS
))
1729 token
->type
= ANCHOR
;
1730 token
->opr
.ctx_type
= BUF_FIRST
;
1734 if (!(syntax
& RE_NO_GNU_OPS
))
1736 token
->type
= ANCHOR
;
1737 token
->opr
.ctx_type
= BUF_LAST
;
1741 if (!(syntax
& RE_NO_BK_PARENS
))
1742 token
->type
= OP_OPEN_SUBEXP
;
1745 if (!(syntax
& RE_NO_BK_PARENS
))
1746 token
->type
= OP_CLOSE_SUBEXP
;
1749 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1750 token
->type
= OP_DUP_PLUS
;
1753 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1754 token
->type
= OP_DUP_QUESTION
;
1757 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1758 token
->type
= OP_OPEN_DUP_NUM
;
1761 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1762 token
->type
= OP_CLOSE_DUP_NUM
;
1770 token
->type
= CHARACTER
;
1771 #ifdef RE_ENABLE_I18N
1772 if (input
->mb_cur_max
> 1)
1774 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1775 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1779 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1784 if (syntax
& RE_NEWLINE_ALT
)
1785 token
->type
= OP_ALT
;
1788 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1789 token
->type
= OP_ALT
;
1792 token
->type
= OP_DUP_ASTERISK
;
1795 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1796 token
->type
= OP_DUP_PLUS
;
1799 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1800 token
->type
= OP_DUP_QUESTION
;
1803 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1804 token
->type
= OP_OPEN_DUP_NUM
;
1807 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1808 token
->type
= OP_CLOSE_DUP_NUM
;
1811 if (syntax
& RE_NO_BK_PARENS
)
1812 token
->type
= OP_OPEN_SUBEXP
;
1815 if (syntax
& RE_NO_BK_PARENS
)
1816 token
->type
= OP_CLOSE_SUBEXP
;
1819 token
->type
= OP_OPEN_BRACKET
;
1822 token
->type
= OP_PERIOD
;
1825 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1826 re_string_cur_idx (input
) != 0)
1828 char prev
= re_string_peek_byte (input
, -1);
1829 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1832 token
->type
= ANCHOR
;
1833 token
->opr
.ctx_type
= LINE_FIRST
;
1836 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1837 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1840 re_string_skip_bytes (input
, 1);
1841 peek_token (&next
, input
, syntax
);
1842 re_string_skip_bytes (input
, -1);
1843 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= LINE_LAST
;
1855 /* Peek a token from INPUT, and return the length of the token.
1856 We must not use this function out of bracket expressions. */
1859 peek_token_bracket (token
, input
, syntax
)
1862 reg_syntax_t syntax
;
1865 if (re_string_eoi (input
))
1867 token
->type
= END_OF_RE
;
1870 c
= re_string_peek_byte (input
, 0);
1873 #ifdef RE_ENABLE_I18N
1874 if (input
->mb_cur_max
> 1 &&
1875 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1877 token
->type
= CHARACTER
;
1880 #endif /* RE_ENABLE_I18N */
1882 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
1883 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
1885 /* In this case, '\' escape a character. */
1887 re_string_skip_bytes (input
, 1);
1888 c2
= re_string_peek_byte (input
, 0);
1890 token
->type
= CHARACTER
;
1893 if (c
== '[') /* '[' is a special char in a bracket exps. */
1897 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
1898 c2
= re_string_peek_byte (input
, 1);
1906 token
->type
= OP_OPEN_COLL_ELEM
;
1909 token
->type
= OP_OPEN_EQUIV_CLASS
;
1912 if (syntax
& RE_CHAR_CLASSES
)
1914 token
->type
= OP_OPEN_CHAR_CLASS
;
1917 /* else fall through. */
1919 token
->type
= CHARACTER
;
1929 token
->type
= OP_CHARSET_RANGE
;
1932 token
->type
= OP_CLOSE_BRACKET
;
1935 token
->type
= OP_NON_MATCH_LIST
;
1938 token
->type
= CHARACTER
;
1943 /* Functions for parser. */
1945 /* Entry point of the parser.
1946 Parse the regular expression REGEXP and return the structure tree.
1947 If an error is occured, ERR is set by error code, and return NULL.
1948 This function build the following tree, from regular expression <reg_exp>:
1954 CAT means concatenation.
1955 EOR means end of regular expression. */
1958 parse (regexp
, preg
, syntax
, err
)
1959 re_string_t
*regexp
;
1961 reg_syntax_t syntax
;
1964 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1965 bin_tree_t
*tree
, *eor
, *root
;
1966 re_token_t current_token
;
1967 dfa
->syntax
= syntax
;
1968 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1969 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1970 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1972 eor
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, ¤t_token
);
1974 root
= create_tree (dfa
, tree
, eor
, CONCAT
, 0);
1977 if (BE (eor
== NULL
|| root
== NULL
, 0))
1985 /* This function build the following tree, from regular expression
1986 <branch1>|<branch2>:
1992 ALT means alternative, which represents the operator `|'. */
1995 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1996 re_string_t
*regexp
;
1999 reg_syntax_t syntax
;
2003 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2004 bin_tree_t
*tree
, *branch
= NULL
;
2005 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2006 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2009 while (token
->type
== OP_ALT
)
2011 re_token_t alt_token
= *token
;
2012 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2013 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2014 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2016 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2017 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2022 tree
= re_dfa_add_tree_node (dfa
, tree
, branch
, &alt_token
);
2023 if (BE (tree
== NULL
, 0))
2028 dfa
->has_plural_match
= 1;
2033 /* This function build the following tree, from regular expression
2040 CAT means concatenation. */
2043 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2044 re_string_t
*regexp
;
2047 reg_syntax_t syntax
;
2051 bin_tree_t
*tree
, *exp
;
2052 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2053 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2054 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2057 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2058 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2060 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2061 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2065 if (tree
!= NULL
&& exp
!= NULL
)
2067 tree
= create_tree (dfa
, tree
, exp
, CONCAT
, 0);
2074 else if (tree
== NULL
)
2076 /* Otherwise exp == NULL, we don't need to create new tree. */
2081 /* This function build the following tree, from regular expression a*:
2088 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2089 re_string_t
*regexp
;
2092 reg_syntax_t syntax
;
2096 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2098 switch (token
->type
)
2101 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2102 if (BE (tree
== NULL
, 0))
2107 #ifdef RE_ENABLE_I18N
2108 if (dfa
->mb_cur_max
> 1)
2110 while (!re_string_eoi (regexp
)
2111 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2113 bin_tree_t
*mbc_remain
;
2114 fetch_token (token
, regexp
, syntax
);
2115 mbc_remain
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2116 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
, 0);
2117 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2126 case OP_OPEN_SUBEXP
:
2127 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2128 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2131 case OP_OPEN_BRACKET
:
2132 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2133 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2137 if (BE (preg
->re_nsub
< token
->opr
.idx
2138 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
2143 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
2144 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2145 if (BE (tree
== NULL
, 0))
2151 dfa
->has_mb_node
= 1;
2153 case OP_OPEN_DUP_NUM
:
2154 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2160 case OP_DUP_ASTERISK
:
2162 case OP_DUP_QUESTION
:
2163 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2168 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2170 fetch_token (token
, regexp
, syntax
);
2171 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2173 /* else fall through */
2174 case OP_CLOSE_SUBEXP
:
2175 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2176 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2181 /* else fall through */
2182 case OP_CLOSE_DUP_NUM
:
2183 /* We treat it as a normal character. */
2185 /* Then we can these characters as normal characters. */
2186 token
->type
= CHARACTER
;
2187 /* mb_partial and word_char bits should be initialized already
2189 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2190 if (BE (tree
== NULL
, 0))
2197 if ((token
->opr
.ctx_type
2198 & (WORD_DELIM
| INSIDE_WORD
| WORD_FIRST
| WORD_LAST
))
2199 && dfa
->word_ops_used
== 0)
2200 init_word_char (dfa
);
2201 if (token
->opr
.ctx_type
== WORD_DELIM
)
2203 bin_tree_t
*tree_first
, *tree_last
;
2204 token
->opr
.ctx_type
= WORD_FIRST
;
2205 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2206 token
->opr
.ctx_type
= WORD_LAST
;
2207 tree_last
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2208 token
->type
= OP_ALT
;
2209 tree
= re_dfa_add_tree_node (dfa
, tree_first
, tree_last
, token
);
2210 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2218 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2219 if (BE (tree
== NULL
, 0))
2225 /* We must return here, since ANCHORs can't be followed
2226 by repetition operators.
2227 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2228 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2229 fetch_token (token
, regexp
, syntax
);
2232 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2233 if (BE (tree
== NULL
, 0))
2238 if (dfa
->mb_cur_max
> 1)
2239 dfa
->has_mb_node
= 1;
2243 tree
= build_charclass_op (dfa
, regexp
->trans
,
2244 (const unsigned char *) "alnum",
2245 (const unsigned char *) "_",
2246 token
->type
== OP_NOTWORD
, err
);
2247 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2252 tree
= build_charclass_op (dfa
, regexp
->trans
,
2253 (const unsigned char *) "space",
2254 (const unsigned char *) "",
2255 token
->type
== OP_NOTSPACE
, err
);
2256 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2266 /* Must not happen? */
2272 fetch_token (token
, regexp
, syntax
);
2274 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2275 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2277 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2278 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2280 /* In BRE consecutive duplications are not allowed. */
2281 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2282 && (token
->type
== OP_DUP_ASTERISK
2283 || token
->type
== OP_OPEN_DUP_NUM
))
2288 dfa
->has_plural_match
= 1;
2294 /* This function build the following tree, from regular expression
2302 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2303 re_string_t
*regexp
;
2306 reg_syntax_t syntax
;
2310 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2311 bin_tree_t
*tree
, *left_par
, *right_par
;
2313 cur_nsub
= preg
->re_nsub
++;
2314 if (BE (dfa
->subexps_alloc
< preg
->re_nsub
, 0))
2316 re_subexp_t
*new_array
;
2317 dfa
->subexps_alloc
*= 2;
2318 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2319 if (BE (new_array
== NULL
, 0))
2321 dfa
->subexps_alloc
/= 2;
2325 dfa
->subexps
= new_array
;
2327 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2328 dfa
->subexps
[cur_nsub
].end
= -1;
2330 left_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2331 if (BE (left_par
== NULL
, 0))
2336 dfa
->nodes
[left_par
->node_idx
].opr
.idx
= cur_nsub
;
2337 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2339 /* The subexpression may be a null string. */
2340 if (token
->type
== OP_CLOSE_SUBEXP
)
2344 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2345 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2348 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2353 right_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2354 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2355 tree
= ((tree
== NULL
) ? right_par
2356 : create_tree (dfa
, tree
, right_par
, CONCAT
, 0));
2357 tree
= create_tree (dfa
, left_par
, tree
, CONCAT
, 0);
2358 if (BE (right_par
== NULL
|| tree
== NULL
, 0))
2363 dfa
->nodes
[right_par
->node_idx
].opr
.idx
= cur_nsub
;
2368 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2371 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2373 re_string_t
*regexp
;
2376 reg_syntax_t syntax
;
2379 re_token_t dup_token
;
2380 bin_tree_t
*tree
= NULL
;
2381 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2382 re_token_t start_token
= *token
;
2384 if (token
->type
== OP_OPEN_DUP_NUM
)
2387 start
= fetch_number (regexp
, token
, syntax
);
2390 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2391 start
= 0; /* We treat "{,m}" as "{0,m}". */
2394 *err
= REG_BADBR
; /* <re>{} is invalid. */
2398 if (BE (start
!= -2, 1))
2400 /* We treat "{n}" as "{n,n}". */
2401 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2402 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2403 ? fetch_number (regexp
, token
, syntax
) : -2));
2405 if (BE (start
== -2 || end
== -2, 0))
2407 /* Invalid sequence. */
2408 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2410 if (token
->type
== END_OF_RE
)
2418 /* If the syntax bit is set, rollback. */
2419 re_string_set_index (regexp
, start_idx
);
2420 *token
= start_token
;
2421 token
->type
= CHARACTER
;
2422 /* mb_partial and word_char bits should be already initialized by
2427 if (BE (end
!= -1 && start
> end
, 0))
2429 /* First number greater than second. */
2436 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2437 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2440 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2441 if (BE (elem
== NULL
, 0))
2444 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2445 else if (BE (start
> 0, 0))
2448 for (i
= 2; i
<= start
; ++i
)
2450 elem
= duplicate_tree (elem
, dfa
);
2451 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2452 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2453 goto parse_dup_op_espace
;
2457 if (BE (end
!= start
, 1))
2459 dup_token
.type
= (end
== -1 ? OP_DUP_ASTERISK
: OP_DUP_QUESTION
);
2460 if (BE (start
> 0, 0))
2462 elem
= duplicate_tree (elem
, dfa
);
2463 if (BE (elem
== NULL
, 0))
2464 goto parse_dup_op_espace
;
2466 /* This subexpression will be marked as optional, so that
2467 empty matches do not touch the registers. */
2468 mark_opt_subexp (elem
, dfa
);
2470 /* Prepare the tree with the modifier. */
2471 elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2472 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2476 /* We do not need to duplicate the tree because we have not
2478 mark_opt_subexp (elem
, dfa
);
2479 tree
= elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2482 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2483 goto parse_dup_op_espace
;
2485 /* This loop is actually executed only when end != -1,
2486 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2487 already created the start+1-th copy. */
2488 for (i
= start
+ 2; i
<= end
; ++i
)
2490 elem
= duplicate_tree (elem
, dfa
);
2491 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2492 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2500 fetch_token (token
, regexp
, syntax
);
2503 parse_dup_op_espace
:
2508 /* Size of the names for collating symbol/equivalence_class/character_class.
2509 I'm not sure, but maybe enough. */
2510 #define BRACKET_NAME_BUF_SIZE 32
2513 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2514 Build the range expression which starts from START_ELEM, and ends
2515 at END_ELEM. The result are written to MBCSET and SBCSET.
2516 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2517 mbcset->range_ends, is a pointer argument sinse we may
2520 static reg_errcode_t
2521 # ifdef RE_ENABLE_I18N
2522 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2523 re_charset_t
*mbcset
;
2525 # else /* not RE_ENABLE_I18N */
2526 build_range_exp (sbcset
, start_elem
, end_elem
)
2527 # endif /* not RE_ENABLE_I18N */
2528 re_bitset_ptr_t sbcset
;
2529 bracket_elem_t
*start_elem
, *end_elem
;
2531 unsigned int start_ch
, end_ch
;
2532 /* Equivalence Classes and Character Classes can't be a range start/end. */
2533 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2534 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2538 /* We can handle no multi character collating elements without libc
2540 if (BE ((start_elem
->type
== COLL_SYM
2541 && strlen ((char *) start_elem
->opr
.name
) > 1)
2542 || (end_elem
->type
== COLL_SYM
2543 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2544 return REG_ECOLLATE
;
2546 # ifdef RE_ENABLE_I18N
2548 wchar_t wc
, start_wc
, end_wc
;
2549 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2551 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2552 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2554 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2555 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2557 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2558 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2559 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2560 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2561 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2562 return REG_ECOLLATE
;
2563 cmp_buf
[0] = start_wc
;
2564 cmp_buf
[4] = end_wc
;
2565 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2568 /* Got valid collation sequence values, add them as a new entry.
2569 However, for !_LIBC we have no collation elements: if the
2570 character set is single byte, the single byte character set
2571 that we build below suffices. parse_bracket_exp passes
2572 no MBCSET if dfa->mb_cur_max == 1. */
2575 /* Check the space of the arrays. */
2576 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2578 /* There is not enough space, need realloc. */
2579 wchar_t *new_array_start
, *new_array_end
;
2582 /* +1 in case of mbcset->nranges is 0. */
2583 new_nranges
= 2 * mbcset
->nranges
+ 1;
2584 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2585 are NULL if *range_alloc == 0. */
2586 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2588 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2591 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2594 mbcset
->range_starts
= new_array_start
;
2595 mbcset
->range_ends
= new_array_end
;
2596 *range_alloc
= new_nranges
;
2599 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2600 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2603 /* Build the table for single byte characters. */
2604 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2607 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2608 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2609 bitset_set (sbcset
, wc
);
2612 # else /* not RE_ENABLE_I18N */
2615 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2616 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2618 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2619 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2621 if (start_ch
> end_ch
)
2623 /* Build the table for single byte characters. */
2624 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2625 if (start_ch
<= ch
&& ch
<= end_ch
)
2626 bitset_set (sbcset
, ch
);
2628 # endif /* not RE_ENABLE_I18N */
2631 #endif /* not _LIBC */
2634 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2635 Build the collating element which is represented by NAME.
2636 The result are written to MBCSET and SBCSET.
2637 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2638 pointer argument since we may update it. */
2640 static reg_errcode_t
2641 # ifdef RE_ENABLE_I18N
2642 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2643 re_charset_t
*mbcset
;
2644 int *coll_sym_alloc
;
2645 # else /* not RE_ENABLE_I18N */
2646 build_collating_symbol (sbcset
, name
)
2647 # endif /* not RE_ENABLE_I18N */
2648 re_bitset_ptr_t sbcset
;
2649 const unsigned char *name
;
2651 size_t name_len
= strlen ((const char *) name
);
2652 if (BE (name_len
!= 1, 0))
2653 return REG_ECOLLATE
;
2656 bitset_set (sbcset
, name
[0]);
2660 #endif /* not _LIBC */
2662 /* This function parse bracket expression like "[abc]", "[a-c]",
2666 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2667 re_string_t
*regexp
;
2670 reg_syntax_t syntax
;
2674 const unsigned char *collseqmb
;
2675 const char *collseqwc
;
2678 const int32_t *symb_table
;
2679 const unsigned char *extra
;
2681 /* Local function for parse_bracket_exp used in _LIBC environement.
2682 Seek the collating symbol entry correspondings to NAME.
2683 Return the index of the symbol in the SYMB_TABLE. */
2685 static inline int32_t
2686 __attribute ((always_inline
))
2687 seek_collating_symbol_entry (name
, name_len
)
2688 const unsigned char *name
;
2691 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2692 int32_t elem
= hash
% table_size
;
2693 int32_t second
= hash
% (table_size
- 2);
2694 while (symb_table
[2 * elem
] != 0)
2696 /* First compare the hashing value. */
2697 if (symb_table
[2 * elem
] == hash
2698 /* Compare the length of the name. */
2699 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2700 /* Compare the name. */
2701 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2704 /* Yep, this is the entry. */
2714 /* Local function for parse_bracket_exp used in _LIBC environement.
2715 Look up the collation sequence value of BR_ELEM.
2716 Return the value if succeeded, UINT_MAX otherwise. */
2718 static inline unsigned int
2719 __attribute ((always_inline
))
2720 lookup_collation_sequence_value (br_elem
)
2721 bracket_elem_t
*br_elem
;
2723 if (br_elem
->type
== SB_CHAR
)
2726 if (MB_CUR_MAX == 1)
2729 return collseqmb
[br_elem
->opr
.ch
];
2732 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2733 return __collseq_table_lookup (collseqwc
, wc
);
2736 else if (br_elem
->type
== MB_CHAR
)
2738 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2740 else if (br_elem
->type
== COLL_SYM
)
2742 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2746 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2748 if (symb_table
[2 * elem
] != 0)
2750 /* We found the entry. */
2751 idx
= symb_table
[2 * elem
+ 1];
2752 /* Skip the name of collating element name. */
2753 idx
+= 1 + extra
[idx
];
2754 /* Skip the byte sequence of the collating element. */
2755 idx
+= 1 + extra
[idx
];
2756 /* Adjust for the alignment. */
2757 idx
= (idx
+ 3) & ~3;
2758 /* Skip the multibyte collation sequence value. */
2759 idx
+= sizeof (unsigned int);
2760 /* Skip the wide char sequence of the collating element. */
2761 idx
+= sizeof (unsigned int) *
2762 (1 + *(unsigned int *) (extra
+ idx
));
2763 /* Return the collation sequence value. */
2764 return *(unsigned int *) (extra
+ idx
);
2766 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2768 /* No valid character. Match it as a single byte
2770 return collseqmb
[br_elem
->opr
.name
[0]];
2773 else if (sym_name_len
== 1)
2774 return collseqmb
[br_elem
->opr
.name
[0]];
2779 /* Local function for parse_bracket_exp used in _LIBC environement.
2780 Build the range expression which starts from START_ELEM, and ends
2781 at END_ELEM. The result are written to MBCSET and SBCSET.
2782 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2783 mbcset->range_ends, is a pointer argument sinse we may
2786 static inline reg_errcode_t
2787 __attribute ((always_inline
))
2788 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2789 re_charset_t
*mbcset
;
2791 re_bitset_ptr_t sbcset
;
2792 bracket_elem_t
*start_elem
, *end_elem
;
2795 uint32_t start_collseq
;
2796 uint32_t end_collseq
;
2798 /* Equivalence Classes and Character Classes can't be a range
2800 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2801 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2805 start_collseq
= lookup_collation_sequence_value (start_elem
);
2806 end_collseq
= lookup_collation_sequence_value (end_elem
);
2807 /* Check start/end collation sequence values. */
2808 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2809 return REG_ECOLLATE
;
2810 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2813 /* Got valid collation sequence values, add them as a new entry.
2814 However, if we have no collation elements, and the character set
2815 is single byte, the single byte character set that we
2816 build below suffices. */
2817 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2819 /* Check the space of the arrays. */
2820 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2822 /* There is not enough space, need realloc. */
2823 uint32_t *new_array_start
;
2824 uint32_t *new_array_end
;
2827 /* +1 in case of mbcset->nranges is 0. */
2828 new_nranges
= 2 * mbcset
->nranges
+ 1;
2829 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2831 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2834 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2837 mbcset
->range_starts
= new_array_start
;
2838 mbcset
->range_ends
= new_array_end
;
2839 *range_alloc
= new_nranges
;
2842 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2843 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2846 /* Build the table for single byte characters. */
2847 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2849 uint32_t ch_collseq
;
2851 if (MB_CUR_MAX == 1)
2854 ch_collseq
= collseqmb
[ch
];
2856 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2857 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2858 bitset_set (sbcset
, ch
);
2863 /* Local function for parse_bracket_exp used in _LIBC environement.
2864 Build the collating element which is represented by NAME.
2865 The result are written to MBCSET and SBCSET.
2866 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2867 pointer argument sinse we may update it. */
2869 static inline reg_errcode_t
2870 __attribute ((always_inline
))
2871 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2872 re_charset_t
*mbcset
;
2873 int *coll_sym_alloc
;
2874 re_bitset_ptr_t sbcset
;
2875 const unsigned char *name
;
2878 size_t name_len
= strlen ((const char *) name
);
2881 elem
= seek_collating_symbol_entry (name
, name_len
);
2882 if (symb_table
[2 * elem
] != 0)
2884 /* We found the entry. */
2885 idx
= symb_table
[2 * elem
+ 1];
2886 /* Skip the name of collating element name. */
2887 idx
+= 1 + extra
[idx
];
2889 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2891 /* No valid character, treat it as a normal
2893 bitset_set (sbcset
, name
[0]);
2897 return REG_ECOLLATE
;
2899 /* Got valid collation sequence, add it as a new entry. */
2900 /* Check the space of the arrays. */
2901 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2903 /* Not enough, realloc it. */
2904 /* +1 in case of mbcset->ncoll_syms is 0. */
2905 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2906 /* Use realloc since mbcset->coll_syms is NULL
2908 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2909 new_coll_sym_alloc
);
2910 if (BE (new_coll_syms
== NULL
, 0))
2912 mbcset
->coll_syms
= new_coll_syms
;
2913 *coll_sym_alloc
= new_coll_sym_alloc
;
2915 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2920 if (BE (name_len
!= 1, 0))
2921 return REG_ECOLLATE
;
2924 bitset_set (sbcset
, name
[0]);
2931 re_token_t br_token
;
2932 re_bitset_ptr_t sbcset
;
2933 #ifdef RE_ENABLE_I18N
2934 re_charset_t
*mbcset
;
2935 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2936 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2937 #endif /* not RE_ENABLE_I18N */
2939 bin_tree_t
*work_tree
;
2941 int first_round
= 1;
2943 collseqmb
= (const unsigned char *)
2944 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2945 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2951 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2952 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2953 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2954 _NL_COLLATE_SYMB_TABLEMB
);
2955 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2956 _NL_COLLATE_SYMB_EXTRAMB
);
2959 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2960 #ifdef RE_ENABLE_I18N
2961 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2966 if (BE (sbcset
== NULL
, 0))
2967 #endif /* RE_ENABLE_I18N */
2973 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2974 if (BE (token
->type
== END_OF_RE
, 0))
2977 goto parse_bracket_exp_free_return
;
2979 if (token
->type
== OP_NON_MATCH_LIST
)
2981 #ifdef RE_ENABLE_I18N
2982 mbcset
->non_match
= 1;
2983 #endif /* not RE_ENABLE_I18N */
2985 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2986 bitset_set (sbcset
, '\0');
2987 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2988 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2989 if (BE (token
->type
== END_OF_RE
, 0))
2992 goto parse_bracket_exp_free_return
;
2996 /* We treat the first ']' as a normal character. */
2997 if (token
->type
== OP_CLOSE_BRACKET
)
2998 token
->type
= CHARACTER
;
3002 bracket_elem_t start_elem
, end_elem
;
3003 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3004 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3006 int token_len2
= 0, is_range_exp
= 0;
3009 start_elem
.opr
.name
= start_name_buf
;
3010 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3011 syntax
, first_round
);
3012 if (BE (ret
!= REG_NOERROR
, 0))
3015 goto parse_bracket_exp_free_return
;
3019 /* Get information about the next token. We need it in any case. */
3020 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3022 /* Do not check for ranges if we know they are not allowed. */
3023 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3025 if (BE (token
->type
== END_OF_RE
, 0))
3028 goto parse_bracket_exp_free_return
;
3030 if (token
->type
== OP_CHARSET_RANGE
)
3032 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3033 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3034 if (BE (token2
.type
== END_OF_RE
, 0))
3037 goto parse_bracket_exp_free_return
;
3039 if (token2
.type
== OP_CLOSE_BRACKET
)
3041 /* We treat the last '-' as a normal character. */
3042 re_string_skip_bytes (regexp
, -token_len
);
3043 token
->type
= CHARACTER
;
3050 if (is_range_exp
== 1)
3052 end_elem
.opr
.name
= end_name_buf
;
3053 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3055 if (BE (ret
!= REG_NOERROR
, 0))
3058 goto parse_bracket_exp_free_return
;
3061 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3064 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3065 &start_elem
, &end_elem
);
3067 # ifdef RE_ENABLE_I18N
3068 *err
= build_range_exp (sbcset
,
3069 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3070 &range_alloc
, &start_elem
, &end_elem
);
3072 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3074 #endif /* RE_ENABLE_I18N */
3075 if (BE (*err
!= REG_NOERROR
, 0))
3076 goto parse_bracket_exp_free_return
;
3080 switch (start_elem
.type
)
3083 bitset_set (sbcset
, start_elem
.opr
.ch
);
3085 #ifdef RE_ENABLE_I18N
3087 /* Check whether the array has enough space. */
3088 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3090 wchar_t *new_mbchars
;
3091 /* Not enough, realloc it. */
3092 /* +1 in case of mbcset->nmbchars is 0. */
3093 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3094 /* Use realloc since array is NULL if *alloc == 0. */
3095 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3097 if (BE (new_mbchars
== NULL
, 0))
3098 goto parse_bracket_exp_espace
;
3099 mbcset
->mbchars
= new_mbchars
;
3101 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3103 #endif /* RE_ENABLE_I18N */
3105 *err
= build_equiv_class (sbcset
,
3106 #ifdef RE_ENABLE_I18N
3107 mbcset
, &equiv_class_alloc
,
3108 #endif /* RE_ENABLE_I18N */
3109 start_elem
.opr
.name
);
3110 if (BE (*err
!= REG_NOERROR
, 0))
3111 goto parse_bracket_exp_free_return
;
3114 *err
= build_collating_symbol (sbcset
,
3115 #ifdef RE_ENABLE_I18N
3116 mbcset
, &coll_sym_alloc
,
3117 #endif /* RE_ENABLE_I18N */
3118 start_elem
.opr
.name
);
3119 if (BE (*err
!= REG_NOERROR
, 0))
3120 goto parse_bracket_exp_free_return
;
3123 *err
= build_charclass (regexp
->trans
, sbcset
,
3124 #ifdef RE_ENABLE_I18N
3125 mbcset
, &char_class_alloc
,
3126 #endif /* RE_ENABLE_I18N */
3127 start_elem
.opr
.name
, syntax
);
3128 if (BE (*err
!= REG_NOERROR
, 0))
3129 goto parse_bracket_exp_free_return
;
3136 if (BE (token
->type
== END_OF_RE
, 0))
3139 goto parse_bracket_exp_free_return
;
3141 if (token
->type
== OP_CLOSE_BRACKET
)
3145 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3147 /* If it is non-matching list. */
3149 bitset_not (sbcset
);
3151 #ifdef RE_ENABLE_I18N
3152 /* Ensure only single byte characters are set. */
3153 if (dfa
->mb_cur_max
> 1)
3154 bitset_mask (sbcset
, dfa
->sb_char
);
3155 #endif /* RE_ENABLE_I18N */
3157 /* Build a tree for simple bracket. */
3158 br_token
.type
= SIMPLE_BRACKET
;
3159 br_token
.opr
.sbcset
= sbcset
;
3160 work_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3161 if (BE (work_tree
== NULL
, 0))
3162 goto parse_bracket_exp_espace
;
3164 #ifdef RE_ENABLE_I18N
3165 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3166 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3167 || mbcset
->non_match
)))
3169 re_token_t alt_token
;
3170 bin_tree_t
*mbc_tree
;
3172 /* Build a tree for complex bracket. */
3173 dfa
->has_mb_node
= 1;
3174 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3175 if (sbcset
[sbc_idx
])
3177 /* If there are no bits set in sbcset, there is no point
3178 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3179 if (sbc_idx
== BITSET_UINTS
)
3182 dfa
->nodes
[work_tree
->node_idx
].type
= COMPLEX_BRACKET
;
3183 dfa
->nodes
[work_tree
->node_idx
].opr
.mbcset
= mbcset
;
3186 br_token
.type
= COMPLEX_BRACKET
;
3187 br_token
.opr
.mbcset
= mbcset
;
3188 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3189 if (BE (mbc_tree
== NULL
, 0))
3190 goto parse_bracket_exp_espace
;
3191 /* Then join them by ALT node. */
3192 alt_token
.type
= OP_ALT
;
3193 dfa
->has_plural_match
= 1;
3194 work_tree
= re_dfa_add_tree_node (dfa
, work_tree
, mbc_tree
, &alt_token
);
3195 if (BE (mbc_tree
!= NULL
, 1))
3200 free_charset (mbcset
);
3203 #else /* not RE_ENABLE_I18N */
3205 #endif /* not RE_ENABLE_I18N */
3207 parse_bracket_exp_espace
:
3209 parse_bracket_exp_free_return
:
3211 #ifdef RE_ENABLE_I18N
3212 free_charset (mbcset
);
3213 #endif /* RE_ENABLE_I18N */
3217 /* Parse an element in the bracket expression. */
3219 static reg_errcode_t
3220 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3222 bracket_elem_t
*elem
;
3223 re_string_t
*regexp
;
3227 reg_syntax_t syntax
;
3230 #ifdef RE_ENABLE_I18N
3232 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3233 if (cur_char_size
> 1)
3235 elem
->type
= MB_CHAR
;
3236 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3237 re_string_skip_bytes (regexp
, cur_char_size
);
3240 #endif /* RE_ENABLE_I18N */
3241 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3242 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3243 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3244 return parse_bracket_symbol (elem
, regexp
, token
);
3245 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3247 /* A '-' must only appear as anything but a range indicator before
3248 the closing bracket. Everything else is an error. */
3250 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3251 if (token2
.type
!= OP_CLOSE_BRACKET
)
3252 /* The actual error value is not standardized since this whole
3253 case is undefined. But ERANGE makes good sense. */
3256 elem
->type
= SB_CHAR
;
3257 elem
->opr
.ch
= token
->opr
.c
;
3261 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3262 such as [:<character_class>:], [.<collating_element>.], and
3263 [=<equivalent_class>=]. */
3265 static reg_errcode_t
3266 parse_bracket_symbol (elem
, regexp
, token
)
3267 bracket_elem_t
*elem
;
3268 re_string_t
*regexp
;
3271 unsigned char ch
, delim
= token
->opr
.c
;
3273 if (re_string_eoi(regexp
))
3277 if (i
>= BRACKET_NAME_BUF_SIZE
)
3279 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3280 ch
= re_string_fetch_byte_case (regexp
);
3282 ch
= re_string_fetch_byte (regexp
);
3283 if (re_string_eoi(regexp
))
3285 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3287 elem
->opr
.name
[i
] = ch
;
3289 re_string_skip_bytes (regexp
, 1);
3290 elem
->opr
.name
[i
] = '\0';
3291 switch (token
->type
)
3293 case OP_OPEN_COLL_ELEM
:
3294 elem
->type
= COLL_SYM
;
3296 case OP_OPEN_EQUIV_CLASS
:
3297 elem
->type
= EQUIV_CLASS
;
3299 case OP_OPEN_CHAR_CLASS
:
3300 elem
->type
= CHAR_CLASS
;
3308 /* Helper function for parse_bracket_exp.
3309 Build the equivalence class which is represented by NAME.
3310 The result are written to MBCSET and SBCSET.
3311 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3312 is a pointer argument sinse we may update it. */
3314 static reg_errcode_t
3315 #ifdef RE_ENABLE_I18N
3316 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3317 re_charset_t
*mbcset
;
3318 int *equiv_class_alloc
;
3319 #else /* not RE_ENABLE_I18N */
3320 build_equiv_class (sbcset
, name
)
3321 #endif /* not RE_ENABLE_I18N */
3322 re_bitset_ptr_t sbcset
;
3323 const unsigned char *name
;
3326 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3329 const int32_t *table
, *indirect
;
3330 const unsigned char *weights
, *extra
, *cp
;
3331 unsigned char char_buf
[2];
3335 /* This #include defines a local function! */
3336 # include <locale/weight.h>
3337 /* Calculate the index for equivalence class. */
3339 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3340 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3341 _NL_COLLATE_WEIGHTMB
);
3342 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3343 _NL_COLLATE_EXTRAMB
);
3344 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3345 _NL_COLLATE_INDIRECTMB
);
3346 idx1
= findidx (&cp
);
3347 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3348 /* This isn't a valid character. */
3349 return REG_ECOLLATE
;
3351 /* Build single byte matcing table for this equivalence class. */
3352 char_buf
[1] = (unsigned char) '\0';
3353 len
= weights
[idx1
];
3354 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3358 idx2
= findidx (&cp
);
3363 /* This isn't a valid character. */
3365 if (len
== weights
[idx2
])
3368 while (cnt
<= len
&&
3369 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3373 bitset_set (sbcset
, ch
);
3376 /* Check whether the array has enough space. */
3377 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3379 /* Not enough, realloc it. */
3380 /* +1 in case of mbcset->nequiv_classes is 0. */
3381 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3382 /* Use realloc since the array is NULL if *alloc == 0. */
3383 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3385 new_equiv_class_alloc
);
3386 if (BE (new_equiv_classes
== NULL
, 0))
3388 mbcset
->equiv_classes
= new_equiv_classes
;
3389 *equiv_class_alloc
= new_equiv_class_alloc
;
3391 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3396 if (BE (strlen ((const char *) name
) != 1, 0))
3397 return REG_ECOLLATE
;
3398 bitset_set (sbcset
, *name
);
3403 /* Helper function for parse_bracket_exp.
3404 Build the character class which is represented by NAME.
3405 The result are written to MBCSET and SBCSET.
3406 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3407 is a pointer argument sinse we may update it. */
3409 static reg_errcode_t
3410 #ifdef RE_ENABLE_I18N
3411 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3412 re_charset_t
*mbcset
;
3413 int *char_class_alloc
;
3414 #else /* not RE_ENABLE_I18N */
3415 build_charclass (trans
, sbcset
, class_name
, syntax
)
3416 #endif /* not RE_ENABLE_I18N */
3417 unsigned RE_TRANSLATE_TYPE trans
;
3418 re_bitset_ptr_t sbcset
;
3419 const unsigned char *class_name
;
3420 reg_syntax_t syntax
;
3423 const char *name
= (const char *) class_name
;
3425 /* In case of REG_ICASE "upper" and "lower" match the both of
3426 upper and lower cases. */
3427 if ((syntax
& RE_ICASE
)
3428 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3431 #ifdef RE_ENABLE_I18N
3432 /* Check the space of the arrays. */
3433 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3435 /* Not enough, realloc it. */
3436 /* +1 in case of mbcset->nchar_classes is 0. */
3437 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3438 /* Use realloc since array is NULL if *alloc == 0. */
3439 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3440 new_char_class_alloc
);
3441 if (BE (new_char_classes
== NULL
, 0))
3443 mbcset
->char_classes
= new_char_classes
;
3444 *char_class_alloc
= new_char_class_alloc
;
3446 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3447 #endif /* RE_ENABLE_I18N */
3449 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3450 for (i = 0; i < SBC_MAX; ++i) \
3452 if (ctype_func (i)) \
3454 int ch = trans ? trans[i] : i; \
3455 bitset_set (sbcset, ch); \
3459 if (strcmp (name
, "alnum") == 0)
3460 BUILD_CHARCLASS_LOOP (isalnum
)
3461 else if (strcmp (name
, "cntrl") == 0)
3462 BUILD_CHARCLASS_LOOP (iscntrl
)
3463 else if (strcmp (name
, "lower") == 0)
3464 BUILD_CHARCLASS_LOOP (islower
)
3465 else if (strcmp (name
, "space") == 0)
3466 BUILD_CHARCLASS_LOOP (isspace
)
3467 else if (strcmp (name
, "alpha") == 0)
3468 BUILD_CHARCLASS_LOOP (isalpha
)
3469 else if (strcmp (name
, "digit") == 0)
3470 BUILD_CHARCLASS_LOOP (isdigit
)
3471 else if (strcmp (name
, "print") == 0)
3472 BUILD_CHARCLASS_LOOP (isprint
)
3473 else if (strcmp (name
, "upper") == 0)
3474 BUILD_CHARCLASS_LOOP (isupper
)
3475 else if (strcmp (name
, "blank") == 0)
3476 BUILD_CHARCLASS_LOOP (isblank
)
3477 else if (strcmp (name
, "graph") == 0)
3478 BUILD_CHARCLASS_LOOP (isgraph
)
3479 else if (strcmp (name
, "punct") == 0)
3480 BUILD_CHARCLASS_LOOP (ispunct
)
3481 else if (strcmp (name
, "xdigit") == 0)
3482 BUILD_CHARCLASS_LOOP (isxdigit
)
3490 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3492 unsigned RE_TRANSLATE_TYPE trans
;
3493 const unsigned char *class_name
;
3494 const unsigned char *extra
;
3498 re_bitset_ptr_t sbcset
;
3499 #ifdef RE_ENABLE_I18N
3500 re_charset_t
*mbcset
;
3502 #endif /* not RE_ENABLE_I18N */
3504 re_token_t br_token
;
3507 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3508 #ifdef RE_ENABLE_I18N
3509 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3510 #endif /* RE_ENABLE_I18N */
3512 #ifdef RE_ENABLE_I18N
3513 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3514 #else /* not RE_ENABLE_I18N */
3515 if (BE (sbcset
== NULL
, 0))
3516 #endif /* not RE_ENABLE_I18N */
3524 #ifdef RE_ENABLE_I18N
3526 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3527 bitset_set(cset->sbcset, '\0');
3529 mbcset
->non_match
= 1;
3530 #endif /* not RE_ENABLE_I18N */
3533 /* We don't care the syntax in this case. */
3534 ret
= build_charclass (trans
, sbcset
,
3535 #ifdef RE_ENABLE_I18N
3537 #endif /* RE_ENABLE_I18N */
3540 if (BE (ret
!= REG_NOERROR
, 0))
3543 #ifdef RE_ENABLE_I18N
3544 free_charset (mbcset
);
3545 #endif /* RE_ENABLE_I18N */
3549 /* \w match '_' also. */
3550 for (; *extra
; extra
++)
3551 bitset_set (sbcset
, *extra
);
3553 /* If it is non-matching list. */
3555 bitset_not (sbcset
);
3557 #ifdef RE_ENABLE_I18N
3558 /* Ensure only single byte characters are set. */
3559 if (dfa
->mb_cur_max
> 1)
3560 bitset_mask (sbcset
, dfa
->sb_char
);
3563 /* Build a tree for simple bracket. */
3564 br_token
.type
= SIMPLE_BRACKET
;
3565 br_token
.opr
.sbcset
= sbcset
;
3566 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3567 if (BE (tree
== NULL
, 0))
3568 goto build_word_op_espace
;
3570 #ifdef RE_ENABLE_I18N
3571 if (dfa
->mb_cur_max
> 1)
3573 re_token_t alt_token
;
3574 bin_tree_t
*mbc_tree
;
3575 /* Build a tree for complex bracket. */
3576 br_token
.type
= COMPLEX_BRACKET
;
3577 br_token
.opr
.mbcset
= mbcset
;
3578 dfa
->has_mb_node
= 1;
3579 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3580 if (BE (mbc_tree
== NULL
, 0))
3581 goto build_word_op_espace
;
3582 /* Then join them by ALT node. */
3583 alt_token
.type
= OP_ALT
;
3584 dfa
->has_plural_match
= 1;
3585 tree
= re_dfa_add_tree_node (dfa
, tree
, mbc_tree
, &alt_token
);
3586 if (BE (mbc_tree
!= NULL
, 1))
3591 free_charset (mbcset
);
3594 #else /* not RE_ENABLE_I18N */
3596 #endif /* not RE_ENABLE_I18N */
3598 build_word_op_espace
:
3600 #ifdef RE_ENABLE_I18N
3601 free_charset (mbcset
);
3602 #endif /* RE_ENABLE_I18N */
3607 /* This is intended for the expressions like "a{1,3}".
3608 Fetch a number from `input', and return the number.
3609 Return -1, if the number field is empty like "{,1}".
3610 Return -2, If an error is occured. */
3613 fetch_number (input
, token
, syntax
)
3616 reg_syntax_t syntax
;
3622 fetch_token (token
, input
, syntax
);
3624 if (BE (token
->type
== END_OF_RE
, 0))
3626 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3628 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3629 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3630 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3635 #ifdef RE_ENABLE_I18N
3637 free_charset (re_charset_t
*cset
)
3639 re_free (cset
->mbchars
);
3641 re_free (cset
->coll_syms
);
3642 re_free (cset
->equiv_classes
);
3643 re_free (cset
->range_starts
);
3644 re_free (cset
->range_ends
);
3646 re_free (cset
->char_classes
);
3649 #endif /* RE_ENABLE_I18N */
3651 /* Functions for binary tree operation. */
3653 /* Create a tree node. */
3656 create_tree (dfa
, left
, right
, type
, index
)
3660 re_token_type_t type
;
3664 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3666 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3668 if (storage
== NULL
)
3670 storage
->next
= dfa
->str_tree_storage
;
3671 dfa
->str_tree_storage
= storage
;
3672 dfa
->str_tree_storage_idx
= 0;
3674 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3676 tree
->parent
= NULL
;
3678 tree
->right
= right
;
3680 tree
->node_idx
= index
;
3683 re_node_set_init_empty (&tree
->eclosure
);
3686 left
->parent
= tree
;
3688 right
->parent
= tree
;
3692 /* Create both a DFA node and a tree for it. */
3695 re_dfa_add_tree_node (dfa
, left
, right
, token
)
3699 const re_token_t
*token
;
3701 int new_idx
= re_dfa_add_node (dfa
, *token
, 0);
3706 return create_tree (dfa
, left
, right
, 0, new_idx
);
3709 /* Mark the tree SRC as an optional subexpression. */
3712 mark_opt_subexp (src
, dfa
)
3713 const bin_tree_t
*src
;
3716 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3718 if (src
->type
== CONCAT
3719 && src
->left
->type
== NON_TYPE
3720 && dfa
->nodes
[src
->left
->node_idx
].type
== OP_OPEN_SUBEXP
)
3721 mark_opt_subexp_iter (src
, dfa
, dfa
->nodes
[src
->left
->node_idx
].opr
.idx
);
3725 /* Recursive tree walker for mark_opt_subexp. */
3728 mark_opt_subexp_iter (src
, dfa
, idx
)
3729 const bin_tree_t
*src
;
3735 if (src
->type
== NON_TYPE
)
3737 node_idx
= src
->node_idx
;
3738 if ((dfa
->nodes
[node_idx
].type
== OP_OPEN_SUBEXP
3739 || dfa
->nodes
[node_idx
].type
== OP_CLOSE_SUBEXP
)
3740 && dfa
->nodes
[node_idx
].opr
.idx
== idx
)
3741 dfa
->nodes
[node_idx
].opt_subexp
= 1;
3744 if (src
->left
!= NULL
)
3745 mark_opt_subexp_iter (src
->left
, dfa
, idx
);
3747 if (src
->right
!= NULL
)
3748 mark_opt_subexp_iter (src
->right
, dfa
, idx
);
3752 /* Duplicate the node SRC, and return new node. */
3755 duplicate_tree (src
, dfa
)
3756 const bin_tree_t
*src
;
3759 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3761 /* Since node indies must be according to Post-order of the tree,
3762 we must duplicate the left at first. */
3763 if (src
->left
!= NULL
)
3765 left
= duplicate_tree (src
->left
, dfa
);
3770 /* Secondaly, duplicate the right. */
3771 if (src
->right
!= NULL
)
3773 right
= duplicate_tree (src
->right
, dfa
);
3778 /* At last, duplicate itself. */
3779 if (src
->type
== NON_TYPE
)
3781 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3782 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3783 if (BE (new_node_idx
== -1, 0))
3787 new_node_idx
= src
->type
;
3789 new_tree
= create_tree (dfa
, left
, right
, src
->type
, new_node_idx
);