1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
72 int length
, reg_syntax_t syntax
);
73 static void re_compile_fastmap_iter (regex_t
*bufp
,
74 const re_dfastate_t
*init_state
,
76 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
77 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
79 static void free_charset (re_charset_t
*cset
);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t
*preg
);
82 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
83 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
84 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
85 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
86 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
87 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
88 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
89 int top_clone_node
, int root_node
,
90 unsigned int constraint
);
91 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
92 unsigned int constraint
);
93 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
94 unsigned int constraint
);
95 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
96 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
98 static void calc_inveclosure (re_dfa_t
*dfa
);
99 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
100 reg_syntax_t syntax
);
101 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
102 static int peek_token (re_token_t
*token
, re_string_t
*input
,
103 reg_syntax_t syntax
);
104 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
105 reg_syntax_t syntax
);
106 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
107 reg_syntax_t syntax
, reg_errcode_t
*err
);
108 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
109 re_token_t
*token
, reg_syntax_t syntax
,
110 int nest
, reg_errcode_t
*err
);
111 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
112 re_token_t
*token
, reg_syntax_t syntax
,
113 int nest
, reg_errcode_t
*err
);
114 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
115 re_token_t
*token
, reg_syntax_t syntax
,
116 int nest
, reg_errcode_t
*err
);
117 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
118 re_token_t
*token
, reg_syntax_t syntax
,
119 int nest
, reg_errcode_t
*err
);
120 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
121 re_dfa_t
*dfa
, re_token_t
*token
,
122 reg_syntax_t syntax
, reg_errcode_t
*err
);
123 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
124 re_token_t
*token
, reg_syntax_t syntax
,
126 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
128 re_token_t
*token
, int token_len
,
130 reg_syntax_t syntax
);
131 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
135 # ifdef RE_ENABLE_I18N
136 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
137 re_charset_t
*mbcset
, int *range_alloc
,
138 bracket_elem_t
*start_elem
,
139 bracket_elem_t
*end_elem
);
140 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
141 re_charset_t
*mbcset
,
143 const unsigned char *name
);
144 # else /* not RE_ENABLE_I18N */
145 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
146 bracket_elem_t
*start_elem
,
147 bracket_elem_t
*end_elem
);
148 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
149 const unsigned char *name
);
150 # endif /* not RE_ENABLE_I18N */
151 #endif /* not _LIBC */
152 #ifdef RE_ENABLE_I18N
153 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
154 re_charset_t
*mbcset
,
155 int *equiv_class_alloc
,
156 const unsigned char *name
);
157 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
158 re_charset_t
*mbcset
,
159 int *char_class_alloc
,
160 const unsigned char *class_name
,
161 reg_syntax_t syntax
);
162 #else /* not RE_ENABLE_I18N */
163 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
164 const unsigned char *name
);
165 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
166 const unsigned char *class_name
,
167 reg_syntax_t syntax
);
168 #endif /* not RE_ENABLE_I18N */
169 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, int not, reg_errcode_t
*err
);
170 static void free_bin_tree (bin_tree_t
*tree
);
171 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
172 re_token_type_t type
, int index
);
173 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
175 /* This table gives an error message for each of the error codes listed
176 in regex.h. Obviously the order here has to be same as there.
177 POSIX doesn't require that we do anything for REG_NOERROR,
178 but why not be nice? */
180 const char __re_error_msgid
[] attribute_hidden
=
182 #define REG_NOERROR_IDX 0
183 gettext_noop ("Success") /* REG_NOERROR */
185 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
186 gettext_noop ("No match") /* REG_NOMATCH */
188 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
189 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
191 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
192 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
194 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
195 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
197 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
198 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
200 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
201 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
203 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
204 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
206 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
207 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
209 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
210 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
212 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
213 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
215 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
216 gettext_noop ("Invalid range end") /* REG_ERANGE */
218 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
219 gettext_noop ("Memory exhausted") /* REG_ESPACE */
221 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
222 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
224 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
225 gettext_noop ("Premature end of regular expression") /* REG_EEND */
227 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
228 gettext_noop ("Regular expression too big") /* REG_ESIZE */
230 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
231 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
234 const size_t __re_error_msgid_idx
[] attribute_hidden
=
255 /* Entry points for GNU code. */
257 /* re_compile_pattern is the GNU regular expression compiler: it
258 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
259 Returns 0 if the pattern was valid, otherwise an error string.
261 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
262 are set in BUFP on entry. */
265 re_compile_pattern (pattern
, length
, bufp
)
268 struct re_pattern_buffer
*bufp
;
272 /* And GNU code determines whether or not to get register information
273 by passing null for the REGS argument to re_match, etc., not by
277 /* Match anchors at newline. */
278 bufp
->newline_anchor
= 1;
280 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
284 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
287 weak_alias (__re_compile_pattern
, re_compile_pattern
)
290 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
291 also be assigned to arbitrarily: each pattern buffer stores its own
292 syntax, so it can be changed between regex compilations. */
293 /* This has no initializer because initialized variables in Emacs
294 become read-only after dumping. */
295 reg_syntax_t re_syntax_options
;
298 /* Specify the precise syntax of regexps for compilation. This provides
299 for compatibility for various utilities which historically have
300 different, incompatible syntaxes.
302 The argument SYNTAX is a bit mask comprised of the various bits
303 defined in regex.h. We return the old syntax. */
306 re_set_syntax (syntax
)
309 reg_syntax_t ret
= re_syntax_options
;
311 re_syntax_options
= syntax
;
315 weak_alias (__re_set_syntax
, re_set_syntax
)
319 re_compile_fastmap (bufp
)
320 struct re_pattern_buffer
*bufp
;
322 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
323 char *fastmap
= bufp
->fastmap
;
325 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
326 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
327 if (dfa
->init_state
!= dfa
->init_state_word
)
328 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
329 if (dfa
->init_state
!= dfa
->init_state_nl
)
330 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
331 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
332 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
333 bufp
->fastmap_accurate
= 1;
337 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
341 re_set_fastmap (char *fastmap
, int icase
, int ch
)
345 fastmap
[tolower (ch
)] = 1;
348 /* Helper function for re_compile_fastmap.
349 Compile fastmap for the initial_state INIT_STATE. */
352 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
354 const re_dfastate_t
*init_state
;
357 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
359 int icase
= (MB_CUR_MAX
== 1 && (bufp
->syntax
& RE_ICASE
));
360 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
362 int node
= init_state
->nodes
.elems
[node_cnt
];
363 re_token_type_t type
= dfa
->nodes
[node
].type
;
365 if (type
== CHARACTER
)
366 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
367 else if (type
== SIMPLE_BRACKET
)
370 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
371 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
372 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
373 re_set_fastmap (fastmap
, icase
, ch
);
375 #ifdef RE_ENABLE_I18N
376 else if (type
== COMPLEX_BRACKET
)
379 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
380 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
381 || cset
->nranges
|| cset
->nchar_classes
)
384 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
393 const int32_t *table
= (const int32_t *)
394 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
395 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
396 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
398 re_set_fastmap (fastmap
, icase
, ch
);
402 for (i
= 0; i
< SBC_MAX
; ++i
)
403 if (__btowc (i
) == WEOF
)
404 re_set_fastmap (fastmap
, icase
, i
);
405 # endif /* not _LIBC */
407 for (i
= 0; i
< cset
->nmbchars
; ++i
)
411 memset (&state
, '\0', sizeof (state
));
412 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
413 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
416 #endif /* RE_ENABLE_I18N */
417 else if (type
== END_OF_RE
|| type
== OP_PERIOD
)
419 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
420 if (type
== END_OF_RE
)
421 bufp
->can_be_null
= 1;
427 /* Entry point for POSIX code. */
428 /* regcomp takes a regular expression as a string and compiles it.
430 PREG is a regex_t *. We do not expect any fields to be initialized,
431 since POSIX says we shouldn't. Thus, we set
433 `buffer' to the compiled pattern;
434 `used' to the length of the compiled pattern;
435 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436 REG_EXTENDED bit in CFLAGS is set; otherwise, to
437 RE_SYNTAX_POSIX_BASIC;
438 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
439 `fastmap' to an allocated space for the fastmap;
440 `fastmap_accurate' to zero;
441 `re_nsub' to the number of subexpressions in PATTERN.
443 PATTERN is the address of the pattern string.
445 CFLAGS is a series of bits which affect compilation.
447 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448 use POSIX basic syntax.
450 If REG_NEWLINE is set, then . and [^...] don't match newline.
451 Also, regexec will try a match beginning after every newline.
453 If REG_ICASE is set, then we considers upper- and lowercase
454 versions of letters to be equivalent when matching.
456 If REG_NOSUB is set, then when PREG is passed to regexec, that
457 routine will report only success or failure, and nothing about the
460 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461 the return codes and their meanings.) */
464 regcomp (preg
, pattern
, cflags
)
465 regex_t
*__restrict preg
;
466 const char *__restrict pattern
;
470 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC
);
477 /* Try to allocate space for the fastmap. */
478 preg
->fastmap
= re_malloc (char, SBC_MAX
);
479 if (BE (preg
->fastmap
== NULL
, 0))
482 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags
& REG_NEWLINE
)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax
&= ~RE_DOT_NEWLINE
;
488 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
489 /* It also changes the matching behavior. */
490 preg
->newline_anchor
= 1;
493 preg
->newline_anchor
= 0;
494 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
495 preg
->translate
= NULL
;
497 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret
== REG_ERPAREN
)
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret
== REG_NOERROR
, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function nevers fails in this implementation. */
508 (void) re_compile_fastmap (preg
);
511 /* Some error occurred while compiling the expression. */
512 re_free (preg
->fastmap
);
513 preg
->fastmap
= NULL
;
519 weak_alias (__regcomp
, regcomp
)
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523 from either regcomp or regexec. We don't use PREG here. */
526 regerror (errcode
, preg
, errbuf
, errbuf_size
)
536 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
537 / sizeof (__re_error_msgid_idx
[0])), 0))
538 /* Only error codes returned by the rest of the code should be passed
539 to this routine. If we are given anything else, or if other regex
540 code generates an invalid error code, then the program has a bug.
541 Dump core so we can fix it. */
544 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
546 msg_size
= strlen (msg
) + 1; /* Includes the null. */
548 if (BE (errbuf_size
!= 0, 1))
550 if (BE (msg_size
> errbuf_size
, 0))
552 #if defined HAVE_MEMPCPY || defined _LIBC
553 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
555 memcpy (errbuf
, msg
, errbuf_size
- 1);
556 errbuf
[errbuf_size
- 1] = 0;
560 memcpy (errbuf
, msg
, msg_size
);
566 weak_alias (__regerror
, regerror
)
571 free_dfa_content (re_dfa_t
*dfa
)
575 re_free (dfa
->subexps
);
577 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
579 re_token_t
*node
= dfa
->nodes
+ i
;
580 #ifdef RE_ENABLE_I18N
581 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
582 free_charset (node
->opr
.mbcset
);
584 #endif /* RE_ENABLE_I18N */
585 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
586 re_free (node
->opr
.sbcset
);
588 re_free (dfa
->nexts
);
589 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
591 if (dfa
->eclosures
!= NULL
)
592 re_node_set_free (dfa
->eclosures
+ i
);
593 if (dfa
->inveclosures
!= NULL
)
594 re_node_set_free (dfa
->inveclosures
+ i
);
595 if (dfa
->edests
!= NULL
)
596 re_node_set_free (dfa
->edests
+ i
);
598 re_free (dfa
->edests
);
599 re_free (dfa
->eclosures
);
600 re_free (dfa
->inveclosures
);
601 re_free (dfa
->nodes
);
603 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
605 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
606 for (j
= 0; j
< entry
->num
; ++j
)
608 re_dfastate_t
*state
= entry
->array
[j
];
611 re_free (entry
->array
);
613 re_free (dfa
->state_table
);
615 if (dfa
->word_char
!= NULL
)
616 re_free (dfa
->word_char
);
618 re_free (dfa
->re_str
);
625 /* Free dynamically allocated space used by PREG. */
631 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
632 if (BE (dfa
!= NULL
, 1))
633 free_dfa_content (dfa
);
635 re_free (preg
->fastmap
);
638 weak_alias (__regfree
, regfree
)
641 /* Entry points compatible with 4.2 BSD regex library. We don't define
642 them unless specifically requested. */
644 #if defined _REGEX_RE_COMP || defined _LIBC
646 /* BSD has one and only one pattern buffer. */
647 static struct re_pattern_buffer re_comp_buf
;
651 /* Make these definitions weak in libc, so POSIX programs can redefine
652 these names if they don't use our functions, and still use
653 regcomp/regexec above without link errors. */
664 if (!re_comp_buf
.buffer
)
665 return gettext ("No previous regular expression");
669 if (re_comp_buf
.buffer
)
671 fastmap
= re_comp_buf
.fastmap
;
672 re_comp_buf
.fastmap
= NULL
;
673 __regfree (&re_comp_buf
);
674 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
675 re_comp_buf
.fastmap
= fastmap
;
678 if (re_comp_buf
.fastmap
== NULL
)
680 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
681 if (re_comp_buf
.fastmap
== NULL
)
682 return (char *) gettext (__re_error_msgid
683 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
686 /* Since `re_exec' always passes NULL for the `regs' argument, we
687 don't need to initialize the pattern buffer fields which affect it. */
689 /* Match anchors at newlines. */
690 re_comp_buf
.newline_anchor
= 1;
692 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
697 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
698 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
702 libc_freeres_fn (free_mem
)
704 __regfree (&re_comp_buf
);
708 #endif /* _REGEX_RE_COMP */
710 /* Internal entry point.
711 Compile the regular expression PATTERN, whose length is LENGTH.
712 SYNTAX indicate regular expression's syntax. */
715 re_compile_internal (preg
, pattern
, length
, syntax
)
717 const char * pattern
;
721 reg_errcode_t err
= REG_NOERROR
;
725 /* Initialize the pattern buffer. */
726 preg
->fastmap_accurate
= 0;
727 preg
->syntax
= syntax
;
728 preg
->not_bol
= preg
->not_eol
= 0;
731 preg
->can_be_null
= 0;
732 preg
->regs_allocated
= REGS_UNALLOCATED
;
734 /* Initialize the dfa. */
735 dfa
= (re_dfa_t
*) preg
->buffer
;
736 if (preg
->allocated
< sizeof (re_dfa_t
))
738 /* If zero allocated, but buffer is non-null, try to realloc
739 enough space. This loses if buffer's address is bogus, but
740 that is the user's responsibility. If ->buffer is NULL this
741 is a simple allocation. */
742 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
745 preg
->allocated
= sizeof (re_dfa_t
);
747 preg
->buffer
= (unsigned char *) dfa
;
748 preg
->used
= sizeof (re_dfa_t
);
750 err
= init_dfa (dfa
, length
);
751 if (BE (err
!= REG_NOERROR
, 0))
758 dfa
->re_str
= re_malloc (char, length
+ 1);
759 strncpy (dfa
->re_str
, pattern
, length
+ 1);
762 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
764 if (BE (err
!= REG_NOERROR
, 0))
771 /* Parse the regular expression, and build a structure tree. */
773 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
774 if (BE (dfa
->str_tree
== NULL
, 0))
775 goto re_compile_internal_free_return
;
777 /* Analyze the tree and collect information which is necessary to
780 if (BE (err
!= REG_NOERROR
, 0))
781 goto re_compile_internal_free_return
;
783 /* Then create the initial state of the dfa. */
784 err
= create_initial_state (dfa
);
786 /* Release work areas. */
787 free_workarea_compile (preg
);
788 re_string_destruct (®exp
);
790 if (BE (err
!= REG_NOERROR
, 0))
792 re_compile_internal_free_return
:
793 free_dfa_content (dfa
);
800 /* Initialize DFA. We use the length of the regular expression PAT_LEN
801 as the initial length of some arrays. */
804 init_dfa (dfa
, pat_len
)
810 memset (dfa
, '\0', sizeof (re_dfa_t
));
812 dfa
->nodes_alloc
= pat_len
+ 1;
813 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
815 dfa
->states_alloc
= pat_len
+ 1;
817 /* table_size = 2 ^ ceil(log pat_len) */
818 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
819 if (table_size
> pat_len
)
822 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
823 dfa
->state_hash_mask
= table_size
- 1;
825 dfa
->subexps_alloc
= 1;
826 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
827 dfa
->word_char
= NULL
;
829 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
830 || dfa
->subexps
== NULL
, 0))
832 /* We don't bother to free anything which was allocated. Very
833 soon the process will go down anyway. */
835 dfa
->state_table
= NULL
;
842 /* Initialize WORD_CHAR table, which indicate which character is
843 "word". In this case "word" means that it is the word construction
844 character used by some operators like "\<", "\>", etc. */
851 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
852 if (BE (dfa
->word_char
== NULL
, 0))
854 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
855 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
856 if (isalnum (ch
) || ch
== '_')
857 dfa
->word_char
[i
] |= 1 << j
;
861 /* Free the work area which are only used while compiling. */
864 free_workarea_compile (preg
)
867 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
868 free_bin_tree (dfa
->str_tree
);
869 dfa
->str_tree
= NULL
;
870 re_free (dfa
->org_indices
);
871 dfa
->org_indices
= NULL
;
874 /* Create initial states for all contexts. */
877 create_initial_state (dfa
)
882 re_node_set init_nodes
;
884 /* Initial states have the epsilon closure of the node which is
885 the first node of the regular expression. */
886 first
= dfa
->str_tree
->first
;
887 dfa
->init_node
= first
;
888 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
889 if (BE (err
!= REG_NOERROR
, 0))
892 /* The back-references which are in initial states can epsilon transit,
893 since in this case all of the subexpressions can be null.
894 Then we add epsilon closures of the nodes which are the next nodes of
895 the back-references. */
896 if (dfa
->nbackref
> 0)
897 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
899 int node_idx
= init_nodes
.elems
[i
];
900 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
903 if (type
!= OP_BACK_REF
)
905 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
907 re_token_t
*clexp_node
;
908 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
909 if (clexp_node
->type
== OP_CLOSE_SUBEXP
910 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
913 if (clexp_idx
== init_nodes
.nelem
)
916 if (type
== OP_BACK_REF
)
918 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
919 if (!re_node_set_contains (&init_nodes
, dest_idx
))
921 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
927 /* It must be the first time to invoke acquire_state. */
928 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
929 /* We don't check ERR here, since the initial state must not be NULL. */
930 if (BE (dfa
->init_state
== NULL
, 0))
932 if (dfa
->init_state
->has_constraint
)
934 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
936 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
938 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
942 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
943 || dfa
->init_state_begbuf
== NULL
, 0))
947 dfa
->init_state_word
= dfa
->init_state_nl
948 = dfa
->init_state_begbuf
= dfa
->init_state
;
950 re_node_set_free (&init_nodes
);
954 /* Analyze the structure tree, and calculate "first", "next", "edest",
955 "eclosure", and "inveclosure". */
964 /* Allocate arrays. */
965 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
966 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
967 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
968 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
969 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
970 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
971 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
973 /* Initialize them. */
974 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
977 re_node_set_init_empty (dfa
->edests
+ i
);
978 re_node_set_init_empty (dfa
->eclosures
+ i
);
979 re_node_set_init_empty (dfa
->inveclosures
+ i
);
982 ret
= analyze_tree (dfa
, dfa
->str_tree
);
983 if (BE (ret
== REG_NOERROR
, 1))
985 ret
= calc_eclosure (dfa
);
986 if (ret
== REG_NOERROR
)
987 calc_inveclosure (dfa
);
992 /* Helper functions for analyze.
993 This function calculate "first", "next", and "edest" for the subtree
994 whose root is NODE. */
997 analyze_tree (dfa
, node
)
1002 if (node
->first
== -1)
1003 calc_first (dfa
, node
);
1004 if (node
->next
== -1)
1005 calc_next (dfa
, node
);
1006 if (node
->eclosure
.nelem
== 0)
1007 calc_epsdest (dfa
, node
);
1008 /* Calculate "first" etc. for the left child. */
1009 if (node
->left
!= NULL
)
1011 ret
= analyze_tree (dfa
, node
->left
);
1012 if (BE (ret
!= REG_NOERROR
, 0))
1015 /* Calculate "first" etc. for the right child. */
1016 if (node
->right
!= NULL
)
1018 ret
= analyze_tree (dfa
, node
->right
);
1019 if (BE (ret
!= REG_NOERROR
, 0))
1025 /* Calculate "first" for the node NODE. */
1027 calc_first (dfa
, node
)
1032 idx
= node
->node_idx
;
1033 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1038 case OP_OPEN_BRACKET
:
1039 case OP_CLOSE_BRACKET
:
1040 case OP_OPEN_DUP_NUM
:
1041 case OP_CLOSE_DUP_NUM
:
1042 case OP_NON_MATCH_LIST
:
1043 case OP_OPEN_COLL_ELEM
:
1044 case OP_CLOSE_COLL_ELEM
:
1045 case OP_OPEN_EQUIV_CLASS
:
1046 case OP_CLOSE_EQUIV_CLASS
:
1047 case OP_OPEN_CHAR_CLASS
:
1048 case OP_CLOSE_CHAR_CLASS
:
1049 /* These must not be appeared here. */
1055 case OP_DUP_ASTERISK
:
1056 case OP_DUP_QUESTION
:
1057 #ifdef RE_ENABLE_I18N
1058 case COMPLEX_BRACKET
:
1059 #endif /* RE_ENABLE_I18N */
1060 case SIMPLE_BRACKET
:
1063 case OP_OPEN_SUBEXP
:
1064 case OP_CLOSE_SUBEXP
:
1069 assert (node
->left
!= NULL
);
1071 if (node
->left
->first
== -1)
1072 calc_first (dfa
, node
->left
);
1073 node
->first
= node
->left
->first
;
1078 /* else fall through */
1081 assert (node
->left
!= NULL
);
1083 if (node
->left
->first
== -1)
1084 calc_first (dfa
, node
->left
);
1085 node
->first
= node
->left
->first
;
1090 /* Calculate "next" for the node NODE. */
1093 calc_next (dfa
, node
)
1098 bin_tree_t
*parent
= node
->parent
;
1102 idx
= node
->node_idx
;
1103 if (node
->type
== 0)
1104 dfa
->nexts
[idx
] = node
->next
;
1108 idx
= parent
->node_idx
;
1109 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1113 case OP_DUP_ASTERISK
:
1118 if (parent
->left
== node
)
1120 if (parent
->right
->first
== -1)
1121 calc_first (dfa
, parent
->right
);
1122 node
->next
= parent
->right
->first
;
1125 /* else fall through */
1127 if (parent
->next
== -1)
1128 calc_next (dfa
, parent
);
1129 node
->next
= parent
->next
;
1132 idx
= node
->node_idx
;
1133 if (node
->type
== 0)
1134 dfa
->nexts
[idx
] = node
->next
;
1137 /* Calculate "edest" for the node NODE. */
1140 calc_epsdest (dfa
, node
)
1145 idx
= node
->node_idx
;
1146 if (node
->type
== 0)
1148 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1149 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1150 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1152 if (node
->left
->first
== -1)
1153 calc_first (dfa
, node
->left
);
1154 if (node
->next
== -1)
1155 calc_next (dfa
, node
);
1156 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1159 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1162 if (node
->left
!= NULL
)
1164 if (node
->left
->first
== -1)
1165 calc_first (dfa
, node
->left
);
1166 left
= node
->left
->first
;
1170 if (node
->next
== -1)
1171 calc_next (dfa
, node
);
1174 if (node
->right
!= NULL
)
1176 if (node
->right
->first
== -1)
1177 calc_first (dfa
, node
->right
);
1178 right
= node
->right
->first
;
1182 if (node
->next
== -1)
1183 calc_next (dfa
, node
);
1186 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1188 else if (dfa
->nodes
[idx
].type
== ANCHOR
1189 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1190 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1191 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1192 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1196 /* Duplicate the epsilon closure of the node ROOT_NODE.
1197 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1198 to their own constraint. */
1200 static reg_errcode_t
1201 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1204 int top_org_node
, top_clone_node
, root_node
;
1205 unsigned int init_constraint
;
1208 int org_node
, clone_node
, ret
;
1209 unsigned int constraint
= init_constraint
;
1210 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1212 int org_dest
, clone_dest
;
1213 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1215 /* If the back reference epsilon-transit, its destination must
1216 also have the constraint. Then duplicate the epsilon closure
1217 of the destination of the back reference, and store it in
1218 edests of the back reference. */
1219 org_dest
= dfa
->nexts
[org_node
];
1220 re_node_set_empty (dfa
->edests
+ clone_node
);
1221 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1222 if (BE (err
!= REG_NOERROR
, 0))
1224 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1225 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1226 if (BE (ret
< 0, 0))
1229 else if (dfa
->edests
[org_node
].nelem
== 0)
1231 /* In case of the node can't epsilon-transit, don't duplicate the
1232 destination and store the original destination as the
1233 destination of the node. */
1234 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1237 else if (dfa
->edests
[org_node
].nelem
== 1)
1239 /* In case of the node can epsilon-transit, and it has only one
1241 org_dest
= dfa
->edests
[org_node
].elems
[0];
1242 re_node_set_empty (dfa
->edests
+ clone_node
);
1243 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1245 /* In case of the node has another constraint, append it. */
1246 if (org_node
== root_node
&& clone_node
!= org_node
)
1248 /* ...but if the node is root_node itself, it means the
1249 epsilon closure have a loop, then tie it to the
1250 destination of the root_node. */
1251 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1253 if (BE (ret
< 0, 0))
1257 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1259 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1260 if (BE (err
!= REG_NOERROR
, 0))
1262 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1263 if (BE (ret
< 0, 0))
1266 else /* dfa->edests[org_node].nelem == 2 */
1268 /* In case of the node can epsilon-transit, and it has two
1269 destinations. E.g. '|', '*', '+', '?'. */
1270 org_dest
= dfa
->edests
[org_node
].elems
[0];
1271 re_node_set_empty (dfa
->edests
+ clone_node
);
1272 /* Search for a duplicated node which satisfies the constraint. */
1273 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1274 if (clone_dest
== -1)
1276 /* There are no such a duplicated node, create a new one. */
1277 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1278 if (BE (err
!= REG_NOERROR
, 0))
1280 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1281 if (BE (ret
< 0, 0))
1283 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1284 root_node
, constraint
);
1285 if (BE (err
!= REG_NOERROR
, 0))
1290 /* There are a duplicated node which satisfy the constraint,
1291 use it to avoid infinite loop. */
1292 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1293 if (BE (ret
< 0, 0))
1297 org_dest
= dfa
->edests
[org_node
].elems
[1];
1298 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1299 if (BE (err
!= REG_NOERROR
, 0))
1301 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1302 if (BE (ret
< 0, 0))
1305 org_node
= org_dest
;
1306 clone_node
= clone_dest
;
1311 /* Search for a node which is duplicated from the node ORG_NODE, and
1312 satisfies the constraint CONSTRAINT. */
1315 search_duplicated_node (dfa
, org_node
, constraint
)
1318 unsigned int constraint
;
1321 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1323 if (org_node
== dfa
->org_indices
[idx
]
1324 && constraint
== dfa
->nodes
[idx
].constraint
)
1325 return idx
; /* Found. */
1327 return -1; /* Not found. */
1330 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1331 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1332 otherwise return the error code. */
1334 static reg_errcode_t
1335 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1337 int *new_idx
, org_idx
;
1338 unsigned int constraint
;
1343 dup
= dfa
->nodes
[org_idx
];
1344 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1345 if (BE (dup_idx
== -1, 0))
1347 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1348 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1349 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1350 dfa
->nodes
[dup_idx
].duplicated
= 1;
1351 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1352 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1353 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1355 /* Store the index of the original node. */
1356 dfa
->org_indices
[dup_idx
] = org_idx
;
1362 calc_inveclosure (dfa
)
1366 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1368 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1370 dest
= dfa
->eclosures
[src
].elems
[idx
];
1371 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1376 /* Calculate "eclosure" for all the node in DFA. */
1378 static reg_errcode_t
1382 int node_idx
, incomplete
;
1384 assert (dfa
->nodes_len
> 0);
1387 /* For each nodes, calculate epsilon closure. */
1388 for (node_idx
= 0; ; ++node_idx
)
1391 re_node_set eclosure_elem
;
1392 if (node_idx
== dfa
->nodes_len
)
1401 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1403 /* If we have already calculated, skip it. */
1404 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1406 /* Calculate epsilon closure of `node_idx'. */
1407 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1408 if (BE (err
!= REG_NOERROR
, 0))
1411 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1414 re_node_set_free (&eclosure_elem
);
1420 /* Calculate epsilon closure of NODE. */
1422 static reg_errcode_t
1423 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1424 re_node_set
*new_set
;
1429 unsigned int constraint
;
1431 re_node_set eclosure
;
1433 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1434 if (BE (err
!= REG_NOERROR
, 0))
1437 /* This indicates that we are calculating this node now.
1438 We reference this value to avoid infinite loop. */
1439 dfa
->eclosures
[node
].nelem
= -1;
1441 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1442 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1443 /* If the current node has constraints, duplicate all nodes.
1444 Since they must inherit the constraints. */
1445 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1447 int org_node
, cur_node
;
1448 org_node
= cur_node
= node
;
1449 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1450 if (BE (err
!= REG_NOERROR
, 0))
1454 /* Expand each epsilon destination nodes. */
1455 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1456 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1458 re_node_set eclosure_elem
;
1459 int edest
= dfa
->edests
[node
].elems
[i
];
1460 /* If calculating the epsilon closure of `edest' is in progress,
1461 return intermediate result. */
1462 if (dfa
->eclosures
[edest
].nelem
== -1)
1467 /* If we haven't calculated the epsilon closure of `edest' yet,
1468 calculate now. Otherwise use calculated epsilon closure. */
1469 if (dfa
->eclosures
[edest
].nelem
== 0)
1471 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1472 if (BE (err
!= REG_NOERROR
, 0))
1476 eclosure_elem
= dfa
->eclosures
[edest
];
1477 /* Merge the epsilon closure of `edest'. */
1478 re_node_set_merge (&eclosure
, &eclosure_elem
);
1479 /* If the epsilon closure of `edest' is incomplete,
1480 the epsilon closure of this node is also incomplete. */
1481 if (dfa
->eclosures
[edest
].nelem
== 0)
1484 re_node_set_free (&eclosure_elem
);
1488 /* Epsilon closures include itself. */
1489 re_node_set_insert (&eclosure
, node
);
1490 if (incomplete
&& !root
)
1491 dfa
->eclosures
[node
].nelem
= 0;
1493 dfa
->eclosures
[node
] = eclosure
;
1494 *new_set
= eclosure
;
1498 /* Functions for token which are used in the parser. */
1500 /* Fetch a token from INPUT.
1501 We must not use this function inside bracket expressions. */
1504 fetch_token (input
, syntax
)
1506 reg_syntax_t syntax
;
1510 consumed_byte
= peek_token (&token
, input
, syntax
);
1511 re_string_skip_bytes (input
, consumed_byte
);
1515 /* Peek a token from INPUT, and return the length of the token.
1516 We must not use this function inside bracket expressions. */
1519 peek_token (token
, input
, syntax
)
1522 reg_syntax_t syntax
;
1526 if (re_string_eoi (input
))
1528 token
->type
= END_OF_RE
;
1532 c
= re_string_peek_byte (input
, 0);
1535 #ifdef RE_ENABLE_I18N
1536 token
->mb_partial
= 0;
1537 if (MB_CUR_MAX
> 1 &&
1538 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1540 token
->type
= CHARACTER
;
1541 token
->mb_partial
= 1;
1548 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1550 token
->type
= BACK_SLASH
;
1554 c2
= re_string_peek_byte_case (input
, 1);
1556 token
->type
= CHARACTER
;
1560 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1561 token
->type
= OP_ALT
;
1563 case '1': case '2': case '3': case '4': case '5':
1564 case '6': case '7': case '8': case '9':
1565 if (!(syntax
& RE_NO_BK_REFS
))
1567 token
->type
= OP_BACK_REF
;
1568 token
->opr
.idx
= c2
- '0';
1572 if (!(syntax
& RE_NO_GNU_OPS
))
1574 token
->type
= ANCHOR
;
1575 token
->opr
.idx
= WORD_FIRST
;
1579 if (!(syntax
& RE_NO_GNU_OPS
))
1581 token
->type
= ANCHOR
;
1582 token
->opr
.idx
= WORD_LAST
;
1586 if (!(syntax
& RE_NO_GNU_OPS
))
1588 token
->type
= ANCHOR
;
1589 token
->opr
.idx
= WORD_DELIM
;
1593 if (!(syntax
& RE_NO_GNU_OPS
))
1595 token
->type
= ANCHOR
;
1596 token
->opr
.idx
= INSIDE_WORD
;
1600 if (!(syntax
& RE_NO_GNU_OPS
))
1601 token
->type
= OP_WORD
;
1604 if (!(syntax
& RE_NO_GNU_OPS
))
1605 token
->type
= OP_NOTWORD
;
1608 if (!(syntax
& RE_NO_GNU_OPS
))
1610 token
->type
= ANCHOR
;
1611 token
->opr
.idx
= BUF_FIRST
;
1615 if (!(syntax
& RE_NO_GNU_OPS
))
1617 token
->type
= ANCHOR
;
1618 token
->opr
.idx
= BUF_LAST
;
1622 if (!(syntax
& RE_NO_BK_PARENS
))
1623 token
->type
= OP_OPEN_SUBEXP
;
1626 if (!(syntax
& RE_NO_BK_PARENS
))
1627 token
->type
= OP_CLOSE_SUBEXP
;
1630 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1631 token
->type
= OP_DUP_PLUS
;
1634 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1635 token
->type
= OP_DUP_QUESTION
;
1638 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1639 token
->type
= OP_OPEN_DUP_NUM
;
1642 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1643 token
->type
= OP_CLOSE_DUP_NUM
;
1651 token
->type
= CHARACTER
;
1655 if (syntax
& RE_NEWLINE_ALT
)
1656 token
->type
= OP_ALT
;
1659 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1660 token
->type
= OP_ALT
;
1663 token
->type
= OP_DUP_ASTERISK
;
1666 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1667 token
->type
= OP_DUP_PLUS
;
1670 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1671 token
->type
= OP_DUP_QUESTION
;
1674 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1675 token
->type
= OP_OPEN_DUP_NUM
;
1678 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1679 token
->type
= OP_CLOSE_DUP_NUM
;
1682 if (syntax
& RE_NO_BK_PARENS
)
1683 token
->type
= OP_OPEN_SUBEXP
;
1686 if (syntax
& RE_NO_BK_PARENS
)
1687 token
->type
= OP_CLOSE_SUBEXP
;
1690 token
->type
= OP_OPEN_BRACKET
;
1693 token
->type
= OP_PERIOD
;
1696 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1697 re_string_cur_idx (input
) != 0)
1699 char prev
= re_string_peek_byte (input
, -1);
1700 if (prev
!= '|' && prev
!= '(' &&
1701 (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n'))
1704 token
->type
= ANCHOR
;
1705 token
->opr
.idx
= LINE_FIRST
;
1708 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1709 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1712 re_string_skip_bytes (input
, 1);
1713 peek_token (&next
, input
, syntax
);
1714 re_string_skip_bytes (input
, -1);
1715 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1718 token
->type
= ANCHOR
;
1719 token
->opr
.idx
= LINE_LAST
;
1727 /* Peek a token from INPUT, and return the length of the token.
1728 We must not use this function out of bracket expressions. */
1731 peek_token_bracket (token
, input
, syntax
)
1734 reg_syntax_t syntax
;
1737 if (re_string_eoi (input
))
1739 token
->type
= END_OF_RE
;
1742 c
= re_string_peek_byte (input
, 0);
1745 #ifdef RE_ENABLE_I18N
1746 if (MB_CUR_MAX
> 1 &&
1747 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1749 token
->type
= CHARACTER
;
1752 #endif /* RE_ENABLE_I18N */
1754 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1756 /* In this case, '\' escape a character. */
1758 re_string_skip_bytes (input
, 1);
1759 c2
= re_string_peek_byte (input
, 0);
1761 token
->type
= CHARACTER
;
1764 if (c
== '[') /* '[' is a special char in a bracket exps. */
1768 c2
= re_string_peek_byte (input
, 1);
1774 token
->type
= OP_OPEN_COLL_ELEM
;
1777 token
->type
= OP_OPEN_EQUIV_CLASS
;
1780 if (syntax
& RE_CHAR_CLASSES
)
1782 token
->type
= OP_OPEN_CHAR_CLASS
;
1785 /* else fall through. */
1787 token
->type
= CHARACTER
;
1797 token
->type
= OP_CHARSET_RANGE
;
1800 token
->type
= OP_CLOSE_BRACKET
;
1803 token
->type
= OP_NON_MATCH_LIST
;
1806 token
->type
= CHARACTER
;
1811 /* Functions for parser. */
1813 /* Entry point of the parser.
1814 Parse the regular expression REGEXP and return the structure tree.
1815 If an error is occured, ERR is set by error code, and return NULL.
1816 This function build the following tree, from regular expression <reg_exp>:
1822 CAT means concatenation.
1823 EOR means end of regular expression. */
1826 parse (regexp
, preg
, syntax
, err
)
1827 re_string_t
*regexp
;
1829 reg_syntax_t syntax
;
1832 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1833 bin_tree_t
*tree
, *eor
, *root
;
1834 re_token_t current_token
;
1836 current_token
= fetch_token (regexp
, syntax
);
1837 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1838 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1840 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1841 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1843 root
= create_tree (tree
, eor
, CONCAT
, 0);
1846 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1854 /* This function build the following tree, from regular expression
1855 <branch1>|<branch2>:
1861 ALT means alternative, which represents the operator `|'. */
1864 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1865 re_string_t
*regexp
;
1868 reg_syntax_t syntax
;
1872 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1873 bin_tree_t
*tree
, *branch
= NULL
;
1875 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1876 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1879 while (token
->type
== OP_ALT
)
1881 re_token_t alt_token
= *token
;
1882 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1883 *token
= fetch_token (regexp
, syntax
);
1884 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1885 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1887 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1888 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1890 free_bin_tree (tree
);
1896 tree
= create_tree (tree
, branch
, 0, new_idx
);
1897 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1902 dfa
->has_plural_match
= 1;
1907 /* This function build the following tree, from regular expression
1914 CAT means concatenation. */
1917 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1918 re_string_t
*regexp
;
1921 reg_syntax_t syntax
;
1925 bin_tree_t
*tree
, *exp
;
1926 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1927 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1930 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1931 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1933 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1934 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1936 free_bin_tree (tree
);
1939 if (tree
!= NULL
&& exp
!= NULL
)
1941 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1948 else if (tree
== NULL
)
1950 /* Otherwise exp == NULL, we don't need to create new tree. */
1955 /* This function build the following tree, from regular expression a*:
1962 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1963 re_string_t
*regexp
;
1966 reg_syntax_t syntax
;
1970 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1973 switch (token
->type
)
1976 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1977 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1978 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1983 #ifdef RE_ENABLE_I18N
1986 while (!re_string_eoi (regexp
)
1987 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1989 bin_tree_t
*mbc_remain
;
1990 *token
= fetch_token (regexp
, syntax
);
1991 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1992 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1993 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1994 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1995 return *err
= REG_ESPACE
, NULL
;
2000 case OP_OPEN_SUBEXP
:
2001 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2002 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2005 case OP_OPEN_BRACKET
:
2006 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2007 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2011 if (BE (preg
->re_nsub
< token
->opr
.idx
2012 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
2017 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
2018 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2019 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2020 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2026 dfa
->has_mb_node
= 1;
2028 case OP_DUP_ASTERISK
:
2030 case OP_DUP_QUESTION
:
2031 case OP_OPEN_DUP_NUM
:
2032 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2037 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2039 *token
= fetch_token (regexp
, syntax
);
2040 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2042 /* else fall through */
2043 case OP_CLOSE_SUBEXP
:
2044 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2045 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2050 /* else fall through */
2051 case OP_CLOSE_DUP_NUM
:
2052 /* We treat it as a normal character. */
2054 /* Then we can these characters as normal characters. */
2055 token
->type
= CHARACTER
;
2056 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2057 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2058 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2065 if (dfa
->word_char
== NULL
)
2067 *err
= init_word_char (dfa
);
2068 if (BE (*err
!= REG_NOERROR
, 0))
2071 if (token
->opr
.ctx_type
== WORD_DELIM
)
2073 bin_tree_t
*tree_first
, *tree_last
;
2074 int idx_first
, idx_last
;
2075 token
->opr
.ctx_type
= WORD_FIRST
;
2076 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2077 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2078 token
->opr
.ctx_type
= WORD_LAST
;
2079 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2080 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2081 token
->type
= OP_ALT
;
2082 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2083 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2084 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2085 || tree_first
== NULL
|| tree_last
== NULL
2086 || tree
== NULL
, 0))
2094 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2095 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2096 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2097 return *err
= REG_ESPACE
, NULL
;
2099 /* We must return here, since ANCHORs can't be followed
2100 by repetition operators.
2101 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2102 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2103 *token
= fetch_token (regexp
, syntax
);
2106 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2107 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2108 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2114 dfa
->has_mb_node
= 1;
2117 tree
= build_word_op (dfa
, 0, err
);
2118 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2122 tree
= build_word_op (dfa
, 1, err
);
2123 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2133 /* Must not happen? */
2139 *token
= fetch_token (regexp
, syntax
);
2141 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2142 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2144 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2145 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2147 dfa
->has_plural_match
= 1;
2153 /* This function build the following tree, from regular expression
2161 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2162 re_string_t
*regexp
;
2165 reg_syntax_t syntax
;
2169 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2170 bin_tree_t
*tree
, *left_par
, *right_par
;
2173 cur_nsub
= preg
->re_nsub
++;
2174 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2176 re_subexp_t
*new_array
;
2177 dfa
->subexps_alloc
*= 2;
2178 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2179 if (BE (new_array
== NULL
, 0))
2181 dfa
->subexps_alloc
/= 2;
2185 dfa
->subexps
= new_array
;
2187 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2188 dfa
->subexps
[cur_nsub
].end
= -1;
2190 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2191 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2192 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2197 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2198 *token
= fetch_token (regexp
, syntax
);
2200 /* The subexpression may be a null string. */
2201 if (token
->type
== OP_CLOSE_SUBEXP
)
2205 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2206 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2209 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2211 free_bin_tree (tree
);
2215 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2216 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2217 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2218 tree
= ((tree
== NULL
) ? right_par
2219 : create_tree (tree
, right_par
, CONCAT
, 0));
2220 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2221 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2226 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2231 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2234 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2235 bin_tree_t
*dup_elem
;
2236 re_string_t
*regexp
;
2239 reg_syntax_t syntax
;
2242 re_token_t dup_token
;
2243 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2244 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2245 re_token_t start_token
= *token
;
2246 if (token
->type
== OP_OPEN_DUP_NUM
)
2250 int start
= fetch_number (regexp
, token
, syntax
);
2254 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2255 start
= 0; /* We treat "{,m}" as "{0,m}". */
2258 *err
= REG_BADBR
; /* <re>{} is invalid. */
2262 if (BE (start
!= -2, 1))
2264 /* We treat "{n}" as "{n,n}". */
2265 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2266 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2267 ? fetch_number (regexp
, token
, syntax
) : -2));
2269 if (BE (start
== -2 || end
== -2, 0))
2271 /* Invalid sequence. */
2272 if (token
->type
== OP_CLOSE_DUP_NUM
)
2273 goto parse_dup_op_invalid_interval
;
2275 goto parse_dup_op_ebrace
;
2277 if (BE (start
== 0 && end
== 0, 0))
2279 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2280 *token
= fetch_token (regexp
, syntax
);
2281 free_bin_tree (dup_elem
);
2285 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2287 for (i
= 0; i
< start
; ++i
)
2290 work_tree
= duplicate_tree (elem
, dfa
);
2291 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2292 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2293 goto parse_dup_op_espace
;
2298 /* We treat "<re>{0,}" as "<re>*". */
2299 dup_token
.type
= OP_DUP_ASTERISK
;
2302 elem
= duplicate_tree (elem
, dfa
);
2303 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2304 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2305 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2306 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2307 || tree
== NULL
, 0))
2308 goto parse_dup_op_espace
;
2312 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2313 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2314 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2315 goto parse_dup_op_espace
;
2318 else if (end
- start
> 0)
2320 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2321 dup_token
.type
= OP_DUP_QUESTION
;
2324 elem
= duplicate_tree (elem
, dfa
);
2325 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2326 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2327 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2328 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2329 goto parse_dup_op_espace
;
2333 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2334 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2335 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2336 goto parse_dup_op_espace
;
2338 for (i
= 1; i
< end
- start
; ++i
)
2340 work_tree
= duplicate_tree (elem
, dfa
);
2341 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2342 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2352 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2353 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2354 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2360 *token
= fetch_token (regexp
, syntax
);
2363 parse_dup_op_espace
:
2364 free_bin_tree (tree
);
2368 parse_dup_op_ebrace
:
2369 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2374 goto parse_dup_op_rollback
;
2375 parse_dup_op_invalid_interval
:
2376 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2381 parse_dup_op_rollback
:
2382 re_string_set_index (regexp
, start_idx
);
2383 *token
= start_token
;
2384 token
->type
= CHARACTER
;
2388 /* Size of the names for collating symbol/equivalence_class/character_class.
2389 I'm not sure, but maybe enough. */
2390 #define BRACKET_NAME_BUF_SIZE 32
2393 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2394 Build the range expression which starts from START_ELEM, and ends
2395 at END_ELEM. The result are written to MBCSET and SBCSET.
2396 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2397 mbcset->range_ends, is a pointer argument sinse we may
2400 static reg_errcode_t
2401 # ifdef RE_ENABLE_I18N
2402 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2403 re_charset_t
*mbcset
;
2405 # else /* not RE_ENABLE_I18N */
2406 build_range_exp (sbcset
, start_elem
, end_elem
)
2407 # endif /* not RE_ENABLE_I18N */
2408 re_bitset_ptr_t sbcset
;
2409 bracket_elem_t
*start_elem
, *end_elem
;
2411 unsigned int start_ch
, end_ch
;
2412 /* Equivalence Classes and Character Classes can't be a range start/end. */
2413 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2414 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2418 /* We can handle no multi character collating elements without libc
2420 if (BE ((start_elem
->type
== COLL_SYM
2421 && strlen ((char *) start_elem
->opr
.name
) > 1)
2422 || (end_elem
->type
== COLL_SYM
2423 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2424 return REG_ECOLLATE
;
2426 # ifdef RE_ENABLE_I18N
2428 wchar_t wc
, start_wc
, end_wc
;
2429 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2431 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2432 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2434 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2435 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2437 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2438 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2439 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2440 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2441 cmp_buf
[0] = start_wc
;
2442 cmp_buf
[4] = end_wc
;
2443 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2446 /* Check the space of the arrays. */
2447 if (*range_alloc
== mbcset
->nranges
)
2449 /* There are not enough space, need realloc. */
2450 wchar_t *new_array_start
, *new_array_end
;
2453 /* +1 in case of mbcset->nranges is 0. */
2454 new_nranges
= 2 * mbcset
->nranges
+ 1;
2455 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2456 are NULL if *range_alloc == 0. */
2457 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2459 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2462 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2465 mbcset
->range_starts
= new_array_start
;
2466 mbcset
->range_ends
= new_array_end
;
2467 *range_alloc
= new_nranges
;
2470 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2471 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2473 /* Build the table for single byte characters. */
2474 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2477 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2478 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2479 bitset_set (sbcset
, wc
);
2482 # else /* not RE_ENABLE_I18N */
2485 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2486 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2488 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2489 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2491 if (start_ch
> end_ch
)
2493 /* Build the table for single byte characters. */
2494 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2495 if (start_ch
<= ch
&& ch
<= end_ch
)
2496 bitset_set (sbcset
, ch
);
2498 # endif /* not RE_ENABLE_I18N */
2501 #endif /* not _LIBC */
2504 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2505 Build the collating element which is represented by NAME.
2506 The result are written to MBCSET and SBCSET.
2507 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2508 pointer argument since we may update it. */
2510 static reg_errcode_t
2511 # ifdef RE_ENABLE_I18N
2512 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2513 re_charset_t
*mbcset
;
2514 int *coll_sym_alloc
;
2515 # else /* not RE_ENABLE_I18N */
2516 build_collating_symbol (sbcset
, name
)
2517 # endif /* not RE_ENABLE_I18N */
2518 re_bitset_ptr_t sbcset
;
2519 const unsigned char *name
;
2521 size_t name_len
= strlen ((const char *) name
);
2522 if (BE (name_len
!= 1, 0))
2523 return REG_ECOLLATE
;
2526 bitset_set (sbcset
, name
[0]);
2530 #endif /* not _LIBC */
2532 /* This function parse bracket expression like "[abc]", "[a-c]",
2536 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2537 re_string_t
*regexp
;
2540 reg_syntax_t syntax
;
2544 const unsigned char *collseqmb
;
2545 const char *collseqwc
;
2548 const int32_t *symb_table
;
2549 const unsigned char *extra
;
2551 /* Local function for parse_bracket_exp used in _LIBC environement.
2552 Seek the collating symbol entry correspondings to NAME.
2553 Return the index of the symbol in the SYMB_TABLE. */
2555 static inline int32_t
2556 seek_collating_symbol_entry (name
, name_len
)
2557 const unsigned char *name
;
2560 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2561 int32_t elem
= hash
% table_size
;
2562 int32_t second
= hash
% (table_size
- 2);
2563 while (symb_table
[2 * elem
] != 0)
2565 /* First compare the hashing value. */
2566 if (symb_table
[2 * elem
] == hash
2567 /* Compare the length of the name. */
2568 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2569 /* Compare the name. */
2570 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2573 /* Yep, this is the entry. */
2583 /* Local function for parse_bracket_exp used in _LIBC environement.
2584 Look up the collation sequence value of BR_ELEM.
2585 Return the value if succeeded, UINT_MAX otherwise. */
2587 static inline unsigned int
2588 lookup_collation_sequence_value (br_elem
)
2589 bracket_elem_t
*br_elem
;
2591 if (br_elem
->type
== SB_CHAR
)
2594 if (MB_CUR_MAX == 1)
2597 return collseqmb
[br_elem
->opr
.ch
];
2600 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2601 return collseq_table_lookup (collseqwc
, wc
);
2604 else if (br_elem
->type
== MB_CHAR
)
2606 return collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2608 else if (br_elem
->type
== COLL_SYM
)
2610 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2614 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2616 if (symb_table
[2 * elem
] != 0)
2618 /* We found the entry. */
2619 idx
= symb_table
[2 * elem
+ 1];
2620 /* Skip the name of collating element name. */
2621 idx
+= 1 + extra
[idx
];
2622 /* Skip the byte sequence of the collating element. */
2623 idx
+= 1 + extra
[idx
];
2624 /* Adjust for the alignment. */
2625 idx
= (idx
+ 3) & ~3;
2626 /* Skip the multibyte collation sequence value. */
2627 idx
+= sizeof (unsigned int);
2628 /* Skip the wide char sequence of the collating element. */
2629 idx
+= sizeof (unsigned int) *
2630 (1 + *(unsigned int *) (extra
+ idx
));
2631 /* Return the collation sequence value. */
2632 return *(unsigned int *) (extra
+ idx
);
2634 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2636 /* No valid character. Match it as a single byte
2638 return collseqmb
[br_elem
->opr
.name
[0]];
2641 else if (sym_name_len
== 1)
2642 return collseqmb
[br_elem
->opr
.name
[0]];
2647 /* Local function for parse_bracket_exp used in _LIBC environement.
2648 Build the range expression which starts from START_ELEM, and ends
2649 at END_ELEM. The result are written to MBCSET and SBCSET.
2650 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651 mbcset->range_ends, is a pointer argument sinse we may
2654 static inline reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2657 re_charset_t
*mbcset
;
2659 # else /* not RE_ENABLE_I18N */
2660 build_range_exp (sbcset
, start_elem
, end_elem
)
2661 # endif /* not RE_ENABLE_I18N */
2662 re_bitset_ptr_t sbcset
;
2663 bracket_elem_t
*start_elem
, *end_elem
;
2666 uint32_t start_collseq
;
2667 uint32_t end_collseq
;
2669 # ifdef RE_ENABLE_I18N
2670 /* Check the space of the arrays. */
2671 if (*range_alloc
== mbcset
->nranges
)
2673 /* There are not enough space, need realloc. */
2674 uint32_t *new_array_start
;
2675 uint32_t *new_array_end
;
2678 /* +1 in case of mbcset->nranges is 0. */
2679 new_nranges
= 2 * mbcset
->nranges
+ 1;
2680 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2681 are NULL if *range_alloc == 0. */
2682 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2684 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2687 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2690 mbcset
->range_starts
= new_array_start
;
2691 mbcset
->range_ends
= new_array_end
;
2692 *range_alloc
= new_nranges
;
2694 # endif /* RE_ENABLE_I18N */
2696 /* Equivalence Classes and Character Classes can't be a range
2698 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2699 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2703 start_collseq
= lookup_collation_sequence_value (start_elem
);
2704 end_collseq
= lookup_collation_sequence_value (end_elem
);
2705 /* Check start/end collation sequence values. */
2706 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2707 return REG_ECOLLATE
;
2708 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2711 # ifdef RE_ENABLE_I18N
2712 /* Got valid collation sequence values, add them as a new entry. */
2713 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2714 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2715 # endif /* RE_ENABLE_I18N */
2717 /* Build the table for single byte characters. */
2718 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2720 uint32_t ch_collseq
;
2722 if (MB_CUR_MAX == 1)
2725 ch_collseq
= collseqmb
[ch
];
2727 ch_collseq
= collseq_table_lookup (collseqwc
, __btowc (ch
));
2728 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2729 bitset_set (sbcset
, ch
);
2734 /* Local function for parse_bracket_exp used in _LIBC environement.
2735 Build the collating element which is represented by NAME.
2736 The result are written to MBCSET and SBCSET.
2737 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2738 pointer argument sinse we may update it. */
2740 static inline reg_errcode_t
2741 # ifdef RE_ENABLE_I18N
2742 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2743 re_charset_t
*mbcset
;
2744 int *coll_sym_alloc
;
2745 # else /* not RE_ENABLE_I18N */
2746 build_collating_symbol (sbcset
, name
)
2747 # endif /* not RE_ENABLE_I18N */
2748 re_bitset_ptr_t sbcset
;
2749 const unsigned char *name
;
2752 size_t name_len
= strlen ((const char *) name
);
2755 elem
= seek_collating_symbol_entry (name
, name_len
);
2756 if (symb_table
[2 * elem
] != 0)
2758 /* We found the entry. */
2759 idx
= symb_table
[2 * elem
+ 1];
2760 /* Skip the name of collating element name. */
2761 idx
+= 1 + extra
[idx
];
2763 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2765 /* No valid character, treat it as a normal
2767 bitset_set (sbcset
, name
[0]);
2771 return REG_ECOLLATE
;
2773 # ifdef RE_ENABLE_I18N
2774 /* Got valid collation sequence, add it as a new entry. */
2775 /* Check the space of the arrays. */
2776 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2778 /* Not enough, realloc it. */
2779 /* +1 in case of mbcset->ncoll_syms is 0. */
2780 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2781 /* Use realloc since mbcset->coll_syms is NULL
2783 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2785 if (BE (mbcset
->coll_syms
== NULL
, 0))
2788 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2789 # endif /* RE_ENABLE_I18N */
2794 if (BE (name_len
!= 1, 0))
2795 return REG_ECOLLATE
;
2798 bitset_set (sbcset
, name
[0]);
2805 re_token_t br_token
;
2806 re_bitset_ptr_t sbcset
;
2807 #ifdef RE_ENABLE_I18N
2808 re_charset_t
*mbcset
;
2809 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2810 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2811 #else /* not RE_ENABLE_I18N */
2813 #endif /* not RE_ENABLE_I18N */
2814 bin_tree_t
*work_tree
;
2815 int token_len
, new_idx
;
2817 collseqmb
= (const unsigned char *)
2818 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2819 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2825 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2826 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2827 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2828 _NL_COLLATE_SYMB_TABLEMB
);
2829 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2830 _NL_COLLATE_SYMB_EXTRAMB
);
2833 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2834 #ifdef RE_ENABLE_I18N
2835 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2836 #endif /* RE_ENABLE_I18N */
2837 #ifdef RE_ENABLE_I18N
2838 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2840 if (BE (sbcset
== NULL
, 0))
2841 #endif /* RE_ENABLE_I18N */
2847 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2848 if (BE (token
->type
== END_OF_RE
, 0))
2851 goto parse_bracket_exp_free_return
;
2853 if (token
->type
== OP_NON_MATCH_LIST
)
2855 #ifdef RE_ENABLE_I18N
2857 mbcset
->non_match
= 1;
2858 #else /* not RE_ENABLE_I18N */
2860 #endif /* not RE_ENABLE_I18N */
2861 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2862 bitset_set (sbcset
, '\0');
2863 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2864 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2865 if (BE (token
->type
== END_OF_RE
, 0))
2868 goto parse_bracket_exp_free_return
;
2870 #ifdef RE_ENABLE_I18N
2872 for (i
= 0; i
< SBC_MAX
; ++i
)
2873 if (__btowc (i
) == WEOF
)
2874 bitset_set (sbcset
, i
);
2875 #endif /* RE_ENABLE_I18N */
2878 /* We treat the first ']' as a normal character. */
2879 if (token
->type
== OP_CLOSE_BRACKET
)
2880 token
->type
= CHARACTER
;
2884 bracket_elem_t start_elem
, end_elem
;
2885 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2886 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2888 int token_len2
= 0, is_range_exp
= 0;
2891 start_elem
.opr
.name
= start_name_buf
;
2892 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2894 if (BE (ret
!= REG_NOERROR
, 0))
2897 goto parse_bracket_exp_free_return
;
2900 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2901 if (BE (token
->type
== END_OF_RE
, 0))
2904 goto parse_bracket_exp_free_return
;
2906 if (token
->type
== OP_CHARSET_RANGE
)
2908 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2909 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2910 if (BE (token
->type
== END_OF_RE
, 0))
2913 goto parse_bracket_exp_free_return
;
2915 if (token2
.type
== OP_CLOSE_BRACKET
)
2917 /* We treat the last '-' as a normal character. */
2918 re_string_skip_bytes (regexp
, -token_len
);
2919 token
->type
= CHARACTER
;
2925 if (is_range_exp
== 1)
2927 end_elem
.opr
.name
= end_name_buf
;
2928 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2930 if (BE (ret
!= REG_NOERROR
, 0))
2933 goto parse_bracket_exp_free_return
;
2936 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2937 if (BE (token
->type
== END_OF_RE
, 0))
2940 goto parse_bracket_exp_free_return
;
2942 *err
= build_range_exp (sbcset
,
2943 #ifdef RE_ENABLE_I18N
2944 mbcset
, &range_alloc
,
2945 #endif /* RE_ENABLE_I18N */
2946 &start_elem
, &end_elem
);
2947 if (BE (*err
!= REG_NOERROR
, 0))
2948 goto parse_bracket_exp_free_return
;
2952 switch (start_elem
.type
)
2955 bitset_set (sbcset
, start_elem
.opr
.ch
);
2957 #ifdef RE_ENABLE_I18N
2959 /* Check whether the array has enough space. */
2960 if (mbchar_alloc
== mbcset
->nmbchars
)
2962 /* Not enough, realloc it. */
2963 /* +1 in case of mbcset->nmbchars is 0. */
2964 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2965 /* Use realloc since array is NULL if *alloc == 0. */
2966 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2968 if (BE (mbcset
->mbchars
== NULL
, 0))
2969 goto parse_bracket_exp_espace
;
2971 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2973 #endif /* RE_ENABLE_I18N */
2975 *err
= build_equiv_class (sbcset
,
2976 #ifdef RE_ENABLE_I18N
2977 mbcset
, &equiv_class_alloc
,
2978 #endif /* RE_ENABLE_I18N */
2979 start_elem
.opr
.name
);
2980 if (BE (*err
!= REG_NOERROR
, 0))
2981 goto parse_bracket_exp_free_return
;
2984 *err
= build_collating_symbol (sbcset
,
2985 #ifdef RE_ENABLE_I18N
2986 mbcset
, &coll_sym_alloc
,
2987 #endif /* RE_ENABLE_I18N */
2988 start_elem
.opr
.name
);
2989 if (BE (*err
!= REG_NOERROR
, 0))
2990 goto parse_bracket_exp_free_return
;
2993 ret
= build_charclass (sbcset
,
2994 #ifdef RE_ENABLE_I18N
2995 mbcset
, &char_class_alloc
,
2996 #endif /* RE_ENABLE_I18N */
2997 start_elem
.opr
.name
, syntax
);
2998 if (BE (ret
!= REG_NOERROR
, 0))
2999 goto parse_bracket_exp_espace
;
3006 if (token
->type
== OP_CLOSE_BRACKET
)
3010 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3012 /* If it is non-matching list. */
3013 #ifdef RE_ENABLE_I18N
3014 if (mbcset
->non_match
)
3015 #else /* not RE_ENABLE_I18N */
3017 #endif /* not RE_ENABLE_I18N */
3018 bitset_not (sbcset
);
3020 /* Build a tree for simple bracket. */
3021 br_token
.type
= SIMPLE_BRACKET
;
3022 br_token
.opr
.sbcset
= sbcset
;
3023 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3024 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3025 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
3026 goto parse_bracket_exp_espace
;
3028 #ifdef RE_ENABLE_I18N
3029 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3030 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
3031 || mbcset
->non_match
)))
3033 re_token_t alt_token
;
3034 bin_tree_t
*mbc_tree
;
3035 /* Build a tree for complex bracket. */
3036 br_token
.type
= COMPLEX_BRACKET
;
3037 br_token
.opr
.mbcset
= mbcset
;
3038 dfa
->has_mb_node
= 1;
3039 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3040 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3041 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3042 goto parse_bracket_exp_espace
;
3043 /* Then join them by ALT node. */
3044 dfa
->has_plural_match
= 1;
3045 alt_token
.type
= OP_ALT
;
3046 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3047 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
3048 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3053 free_charset (mbcset
);
3056 #else /* not RE_ENABLE_I18N */
3058 #endif /* not RE_ENABLE_I18N */
3060 parse_bracket_exp_espace
:
3062 parse_bracket_exp_free_return
:
3064 #ifdef RE_ENABLE_I18N
3065 free_charset (mbcset
);
3066 #endif /* RE_ENABLE_I18N */
3070 /* Parse an element in the bracket expression. */
3072 static reg_errcode_t
3073 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
3074 bracket_elem_t
*elem
;
3075 re_string_t
*regexp
;
3079 reg_syntax_t syntax
;
3081 #ifdef RE_ENABLE_I18N
3083 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3084 if (cur_char_size
> 1)
3086 elem
->type
= MB_CHAR
;
3087 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3088 re_string_skip_bytes (regexp
, cur_char_size
);
3091 #endif /* RE_ENABLE_I18N */
3092 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3093 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3094 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3095 return parse_bracket_symbol (elem
, regexp
, token
);
3096 elem
->type
= SB_CHAR
;
3097 elem
->opr
.ch
= token
->opr
.c
;
3101 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3102 such as [:<character_class>:], [.<collating_element>.], and
3103 [=<equivalent_class>=]. */
3105 static reg_errcode_t
3106 parse_bracket_symbol (elem
, regexp
, token
)
3107 bracket_elem_t
*elem
;
3108 re_string_t
*regexp
;
3111 unsigned char ch
, delim
= token
->opr
.c
;
3115 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3117 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3118 ch
= re_string_fetch_byte_case (regexp
);
3120 ch
= re_string_fetch_byte (regexp
);
3121 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3123 elem
->opr
.name
[i
] = ch
;
3125 re_string_skip_bytes (regexp
, 1);
3126 elem
->opr
.name
[i
] = '\0';
3127 switch (token
->type
)
3129 case OP_OPEN_COLL_ELEM
:
3130 elem
->type
= COLL_SYM
;
3132 case OP_OPEN_EQUIV_CLASS
:
3133 elem
->type
= EQUIV_CLASS
;
3135 case OP_OPEN_CHAR_CLASS
:
3136 elem
->type
= CHAR_CLASS
;
3144 /* Helper function for parse_bracket_exp.
3145 Build the equivalence class which is represented by NAME.
3146 The result are written to MBCSET and SBCSET.
3147 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3148 is a pointer argument sinse we may update it. */
3150 static reg_errcode_t
3151 #ifdef RE_ENABLE_I18N
3152 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3153 re_charset_t
*mbcset
;
3154 int *equiv_class_alloc
;
3155 #else /* not RE_ENABLE_I18N */
3156 build_equiv_class (sbcset
, name
)
3157 #endif /* not RE_ENABLE_I18N */
3158 re_bitset_ptr_t sbcset
;
3159 const unsigned char *name
;
3161 #if defined _LIBC && defined RE_ENABLE_I18N
3162 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3165 const int32_t *table
, *indirect
;
3166 const unsigned char *weights
, *extra
, *cp
;
3167 unsigned char char_buf
[2];
3171 /* This #include defines a local function! */
3172 # include <locale/weight.h>
3173 /* Calculate the index for equivalence class. */
3175 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3176 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3177 _NL_COLLATE_WEIGHTMB
);
3178 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3179 _NL_COLLATE_EXTRAMB
);
3180 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3181 _NL_COLLATE_INDIRECTMB
);
3182 idx1
= findidx (&cp
);
3183 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3184 /* This isn't a valid character. */
3185 return REG_ECOLLATE
;
3187 /* Build single byte matcing table for this equivalence class. */
3188 char_buf
[1] = (unsigned char) '\0';
3189 len
= weights
[idx1
];
3190 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3194 idx2
= findidx (&cp
);
3199 /* This isn't a valid character. */
3201 if (len
== weights
[idx2
])
3204 while (cnt
<= len
&&
3205 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3209 bitset_set (sbcset
, ch
);
3212 /* Check whether the array has enough space. */
3213 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3215 /* Not enough, realloc it. */
3216 /* +1 in case of mbcset->nequiv_classes is 0. */
3217 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3218 /* Use realloc since the array is NULL if *alloc == 0. */
3219 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3220 *equiv_class_alloc
);
3221 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3224 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3227 #endif /* _LIBC && RE_ENABLE_I18N */
3229 if (BE (strlen ((const char *) name
) != 1, 0))
3230 return REG_ECOLLATE
;
3231 bitset_set (sbcset
, *name
);
3236 /* Helper function for parse_bracket_exp.
3237 Build the character class which is represented by NAME.
3238 The result are written to MBCSET and SBCSET.
3239 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3240 is a pointer argument sinse we may update it. */
3242 static reg_errcode_t
3243 #ifdef RE_ENABLE_I18N
3244 build_charclass (sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3245 re_charset_t
*mbcset
;
3246 int *char_class_alloc
;
3247 #else /* not RE_ENABLE_I18N */
3248 build_charclass (sbcset
, class_name
, syntax
)
3249 #endif /* not RE_ENABLE_I18N */
3250 re_bitset_ptr_t sbcset
;
3251 const unsigned char *class_name
;
3252 reg_syntax_t syntax
;
3255 const char *name
= (const char *) class_name
;
3257 /* In case of REG_ICASE "upper" and "lower" match the both of
3258 upper and lower cases. */
3259 if ((syntax
& RE_ICASE
)
3260 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3263 #ifdef RE_ENABLE_I18N
3264 /* Check the space of the arrays. */
3265 if (*char_class_alloc
== mbcset
->nchar_classes
)
3267 /* Not enough, realloc it. */
3268 /* +1 in case of mbcset->nchar_classes is 0. */
3269 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3270 /* Use realloc since array is NULL if *alloc == 0. */
3271 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3273 if (BE (mbcset
->char_classes
== NULL
, 0))
3276 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3277 #endif /* RE_ENABLE_I18N */
3279 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3280 for (i = 0; i < SBC_MAX; ++i) \
3282 if (ctype_func (i)) \
3283 bitset_set (sbcset, i); \
3286 if (strcmp (name
, "alnum") == 0)
3287 BUILD_CHARCLASS_LOOP (isalnum
)
3288 else if (strcmp (name
, "cntrl") == 0)
3289 BUILD_CHARCLASS_LOOP (iscntrl
)
3290 else if (strcmp (name
, "lower") == 0)
3291 BUILD_CHARCLASS_LOOP (islower
)
3292 else if (strcmp (name
, "space") == 0)
3293 BUILD_CHARCLASS_LOOP (isspace
)
3294 else if (strcmp (name
, "alpha") == 0)
3295 BUILD_CHARCLASS_LOOP (isalpha
)
3296 else if (strcmp (name
, "digit") == 0)
3297 BUILD_CHARCLASS_LOOP (isdigit
)
3298 else if (strcmp (name
, "print") == 0)
3299 BUILD_CHARCLASS_LOOP (isprint
)
3300 else if (strcmp (name
, "upper") == 0)
3301 BUILD_CHARCLASS_LOOP (isupper
)
3302 else if (strcmp (name
, "blank") == 0)
3303 BUILD_CHARCLASS_LOOP (isblank
)
3304 else if (strcmp (name
, "graph") == 0)
3305 BUILD_CHARCLASS_LOOP (isgraph
)
3306 else if (strcmp (name
, "punct") == 0)
3307 BUILD_CHARCLASS_LOOP (ispunct
)
3308 else if (strcmp (name
, "xdigit") == 0)
3309 BUILD_CHARCLASS_LOOP (isxdigit
)
3317 build_word_op (dfa
, not, err
)
3322 re_bitset_ptr_t sbcset
;
3323 #ifdef RE_ENABLE_I18N
3324 re_charset_t
*mbcset
;
3326 #else /* not RE_ENABLE_I18N */
3328 #endif /* not RE_ENABLE_I18N */
3330 re_token_t br_token
;
3334 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3335 #ifdef RE_ENABLE_I18N
3336 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3337 #endif /* RE_ENABLE_I18N */
3339 #ifdef RE_ENABLE_I18N
3340 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3341 #else /* not RE_ENABLE_I18N */
3342 if (BE (sbcset
== NULL
, 0))
3343 #endif /* not RE_ENABLE_I18N */
3351 #ifdef RE_ENABLE_I18N
3354 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3355 bitset_set(cset->sbcset, '\0');
3357 mbcset
->non_match
= 1;
3359 for (i
= 0; i
< SBC_MAX
; ++i
)
3360 if (__btowc (i
) == WEOF
)
3361 bitset_set (sbcset
, i
);
3362 #else /* not RE_ENABLE_I18N */
3364 #endif /* not RE_ENABLE_I18N */
3367 /* We don't care the syntax in this case. */
3368 ret
= build_charclass (sbcset
,
3369 #ifdef RE_ENABLE_I18N
3371 #endif /* RE_ENABLE_I18N */
3372 (const unsigned char *) "alpha", 0);
3374 if (BE (ret
!= REG_NOERROR
, 0))
3377 #ifdef RE_ENABLE_I18N
3378 free_charset (mbcset
);
3379 #endif /* RE_ENABLE_I18N */
3383 /* \w match '_' also. */
3384 bitset_set (sbcset
, '_');
3386 /* If it is non-matching list. */
3387 #ifdef RE_ENABLE_I18N
3388 if (mbcset
->non_match
)
3389 #else /* not RE_ENABLE_I18N */
3391 #endif /* not RE_ENABLE_I18N */
3392 bitset_not (sbcset
);
3394 /* Build a tree for simple bracket. */
3395 br_token
.type
= SIMPLE_BRACKET
;
3396 br_token
.opr
.sbcset
= sbcset
;
3397 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3398 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3399 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3400 goto build_word_op_espace
;
3402 #ifdef RE_ENABLE_I18N
3405 re_token_t alt_token
;
3406 bin_tree_t
*mbc_tree
;
3407 /* Build a tree for complex bracket. */
3408 br_token
.type
= COMPLEX_BRACKET
;
3409 br_token
.opr
.mbcset
= mbcset
;
3410 dfa
->has_mb_node
= 1;
3411 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3412 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3413 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3414 goto build_word_op_espace
;
3415 /* Then join them by ALT node. */
3416 alt_token
.type
= OP_ALT
;
3417 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3418 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3419 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3424 free_charset (mbcset
);
3427 #else /* not RE_ENABLE_I18N */
3429 #endif /* not RE_ENABLE_I18N */
3431 build_word_op_espace
:
3433 #ifdef RE_ENABLE_I18N
3434 free_charset (mbcset
);
3435 #endif /* RE_ENABLE_I18N */
3440 /* This is intended for the expressions like "a{1,3}".
3441 Fetch a number from `input', and return the number.
3442 Return -1, if the number field is empty like "{,1}".
3443 Return -2, If an error is occured. */
3446 fetch_number (input
, token
, syntax
)
3449 reg_syntax_t syntax
;
3455 *token
= fetch_token (input
, syntax
);
3457 if (BE (token
->type
== END_OF_RE
, 0))
3459 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3461 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3462 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3463 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3468 #ifdef RE_ENABLE_I18N
3470 free_charset (re_charset_t
*cset
)
3472 re_free (cset
->mbchars
);
3474 re_free (cset
->coll_syms
);
3475 re_free (cset
->equiv_classes
);
3476 re_free (cset
->range_starts
);
3477 re_free (cset
->range_ends
);
3479 re_free (cset
->char_classes
);
3482 #endif /* RE_ENABLE_I18N */
3484 /* Functions for binary tree operation. */
3486 /* Create a node of tree.
3487 Note: This function automatically free left and right if malloc fails. */
3490 create_tree (left
, right
, type
, index
)
3493 re_token_type_t type
;
3497 tree
= re_malloc (bin_tree_t
, 1);
3498 if (BE (tree
== NULL
, 0))
3500 free_bin_tree (left
);
3501 free_bin_tree (right
);
3504 tree
->parent
= NULL
;
3506 tree
->right
= right
;
3508 tree
->node_idx
= index
;
3511 re_node_set_init_empty (&tree
->eclosure
);
3514 left
->parent
= tree
;
3516 right
->parent
= tree
;
3520 /* Free the sub tree pointed by TREE. */
3523 free_bin_tree (tree
)
3528 /*re_node_set_free (&tree->eclosure);*/
3529 free_bin_tree (tree
->left
);
3530 free_bin_tree (tree
->right
);
3534 /* Duplicate the node SRC, and return new node. */
3537 duplicate_tree (src
, dfa
)
3538 const bin_tree_t
*src
;
3541 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3543 /* Since node indies must be according to Post-order of the tree,
3544 we must duplicate the left at first. */
3545 if (src
->left
!= NULL
)
3547 left
= duplicate_tree (src
->left
, dfa
);
3552 /* Secondaly, duplicate the right. */
3553 if (src
->right
!= NULL
)
3555 right
= duplicate_tree (src
->right
, dfa
);
3558 free_bin_tree (left
);
3563 /* At last, duplicate itself. */
3564 if (src
->type
== NON_TYPE
)
3566 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3567 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3568 if (BE (new_node_idx
== -1, 0))
3570 free_bin_tree (left
);
3571 free_bin_tree (right
);
3576 new_node_idx
= src
->type
;
3578 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3579 if (BE (new_tree
== NULL
, 0))
3581 free_bin_tree (left
);
3582 free_bin_tree (right
);