1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
72 int length
, reg_syntax_t syntax
);
73 static void re_compile_fastmap_iter (regex_t
*bufp
,
74 const re_dfastate_t
*init_state
,
76 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
77 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
79 static void free_charset (re_charset_t
*cset
);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t
*preg
);
82 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
83 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
84 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
85 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
86 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
87 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
88 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
89 int top_clone_node
, int root_node
,
90 unsigned int constraint
);
91 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
92 unsigned int constraint
);
93 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
94 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
96 static void calc_inveclosure (re_dfa_t
*dfa
);
97 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
99 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
100 static int peek_token (re_token_t
*token
, re_string_t
*input
,
101 reg_syntax_t syntax
);
102 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
103 reg_syntax_t syntax
);
104 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
105 reg_syntax_t syntax
, reg_errcode_t
*err
);
106 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
107 re_token_t
*token
, reg_syntax_t syntax
,
108 int nest
, reg_errcode_t
*err
);
109 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
110 re_token_t
*token
, reg_syntax_t syntax
,
111 int nest
, reg_errcode_t
*err
);
112 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
113 re_token_t
*token
, reg_syntax_t syntax
,
114 int nest
, reg_errcode_t
*err
);
115 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
116 re_token_t
*token
, reg_syntax_t syntax
,
117 int nest
, reg_errcode_t
*err
);
118 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
119 re_dfa_t
*dfa
, re_token_t
*token
,
120 reg_syntax_t syntax
, reg_errcode_t
*err
);
121 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
122 re_token_t
*token
, reg_syntax_t syntax
,
124 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
126 re_token_t
*token
, int token_len
,
128 reg_syntax_t syntax
);
129 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
135 re_charset_t
*mbcset
, int *range_alloc
,
136 bracket_elem_t
*start_elem
,
137 bracket_elem_t
*end_elem
);
138 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
139 re_charset_t
*mbcset
,
141 const unsigned char *name
);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
144 bracket_elem_t
*start_elem
,
145 bracket_elem_t
*end_elem
);
146 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
147 const unsigned char *name
);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
152 re_charset_t
*mbcset
,
153 int *equiv_class_alloc
,
154 const unsigned char *name
);
155 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
156 re_charset_t
*mbcset
,
157 int *char_class_alloc
,
158 const unsigned char *class_name
,
159 reg_syntax_t syntax
);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
162 const unsigned char *name
);
163 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
164 const unsigned char *class_name
,
165 reg_syntax_t syntax
);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, int not, reg_errcode_t
*err
);
168 static void free_bin_tree (bin_tree_t
*tree
);
169 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
170 re_token_type_t type
, int index
);
171 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid
[] attribute_hidden
=
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx
[] attribute_hidden
=
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
263 re_compile_pattern (pattern
, length
, bufp
)
266 struct re_pattern_buffer
*bufp
;
270 /* GNU code is written to assume at least RE_NREGS registers will be set
271 (and at least one extra will be -1). */
272 bufp
->regs_allocated
= REGS_UNALLOCATED
;
274 /* And GNU code determines whether or not to get register information
275 by passing null for the REGS argument to re_match, etc., not by
279 /* Match anchors at newline. */
280 bufp
->newline_anchor
= 1;
282 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
286 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
289 weak_alias (__re_compile_pattern
, re_compile_pattern
)
292 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
293 also be assigned to arbitrarily: each pattern buffer stores its own
294 syntax, so it can be changed between regex compilations. */
295 /* This has no initializer because initialized variables in Emacs
296 become read-only after dumping. */
297 reg_syntax_t re_syntax_options
;
300 /* Specify the precise syntax of regexps for compilation. This provides
301 for compatibility for various utilities which historically have
302 different, incompatible syntaxes.
304 The argument SYNTAX is a bit mask comprised of the various bits
305 defined in regex.h. We return the old syntax. */
308 re_set_syntax (syntax
)
311 reg_syntax_t ret
= re_syntax_options
;
313 re_syntax_options
= syntax
;
317 weak_alias (__re_set_syntax
, re_set_syntax
)
321 re_compile_fastmap (bufp
)
322 struct re_pattern_buffer
*bufp
;
324 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
325 char *fastmap
= bufp
->fastmap
;
327 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
328 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
329 if (dfa
->init_state
!= dfa
->init_state_word
)
330 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
331 if (dfa
->init_state
!= dfa
->init_state_nl
)
332 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
333 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
334 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
335 bufp
->fastmap_accurate
= 1;
339 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
342 /* Helper function for re_compile_fastmap.
343 Compile fastmap for the initial_state INIT_STATE. */
346 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
348 const re_dfastate_t
*init_state
;
351 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
353 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
355 int node
= init_state
->nodes
.elems
[node_cnt
];
356 re_token_type_t type
= dfa
->nodes
[node
].type
;
358 if (type
== CHARACTER
)
359 fastmap
[dfa
->nodes
[node
].opr
.c
] = 1;
360 else if (type
== SIMPLE_BRACKET
)
363 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
364 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
365 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
368 #ifdef RE_ENABLE_I18N
369 else if (type
== COMPLEX_BRACKET
)
372 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
373 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
374 || cset
->nranges
|| cset
->nchar_classes
)
377 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
379 /* In this case we want to catch the bytes which are
380 the first byte of any collation elements.
381 e.g. In da_DK, we want to catch 'a' since "aa"
382 is a valid collation element, and don't catch
383 'b' since 'b' is the only collation element
384 which starts from 'b'. */
386 const int32_t *table
= (const int32_t *)
387 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
388 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
389 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
395 for (i
= 0; i
< SBC_MAX
; ++i
)
396 if (__btowc (i
) == WEOF
)
398 # endif /* not _LIBC */
400 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 wctomb (buf
, cset
->mbchars
[i
]);
404 fastmap
[*(unsigned char *) buf
] = 1;
407 #endif /* RE_ENABLE_I18N */
408 else if (type
== END_OF_RE
|| type
== OP_PERIOD
409 #ifdef RE_ENABLE_I18N
410 || type
== COMPLEX_BRACKET
411 #endif /* RE_ENABLE_I18N */
414 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
415 if (type
== END_OF_RE
)
416 bufp
->can_be_null
= 1;
422 /* Entry point for POSIX code. */
423 /* regcomp takes a regular expression as a string and compiles it.
425 PREG is a regex_t *. We do not expect any fields to be initialized,
426 since POSIX says we shouldn't. Thus, we set
428 `buffer' to the compiled pattern;
429 `used' to the length of the compiled pattern;
430 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
431 REG_EXTENDED bit in CFLAGS is set; otherwise, to
432 RE_SYNTAX_POSIX_BASIC;
433 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
434 `fastmap' to an allocated space for the fastmap;
435 `fastmap_accurate' to zero;
436 `re_nsub' to the number of subexpressions in PATTERN.
438 PATTERN is the address of the pattern string.
440 CFLAGS is a series of bits which affect compilation.
442 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
443 use POSIX basic syntax.
445 If REG_NEWLINE is set, then . and [^...] don't match newline.
446 Also, regexec will try a match beginning after every newline.
448 If REG_ICASE is set, then we considers upper- and lowercase
449 versions of letters to be equivalent when matching.
451 If REG_NOSUB is set, then when PREG is passed to regexec, that
452 routine will report only success or failure, and nothing about the
455 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
456 the return codes and their meanings.) */
459 regcomp (preg
, pattern
, cflags
)
460 regex_t
*__restrict preg
;
461 const char *__restrict pattern
;
465 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
466 : RE_SYNTAX_POSIX_BASIC
);
472 /* Try to allocate space for the fastmap. */
473 preg
->fastmap
= re_malloc (char, SBC_MAX
);
474 if (BE (preg
->fastmap
== NULL
, 0))
477 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
479 /* If REG_NEWLINE is set, newlines are treated differently. */
480 if (cflags
& REG_NEWLINE
)
481 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
482 syntax
&= ~RE_DOT_NEWLINE
;
483 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
484 /* It also changes the matching behavior. */
485 preg
->newline_anchor
= 1;
488 preg
->newline_anchor
= 0;
489 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
490 preg
->translate
= NULL
;
492 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
494 /* POSIX doesn't distinguish between an unmatched open-group and an
495 unmatched close-group: both are REG_EPAREN. */
496 if (ret
== REG_ERPAREN
)
499 /* We have already checked preg->fastmap != NULL. */
500 if (BE (ret
== REG_NOERROR
, 1))
502 /* Compute the fastmap now, since regexec cannot modify the pattern
504 if (BE (re_compile_fastmap (preg
) == -2, 0))
506 /* Some error occurred while computing the fastmap, just forget
508 re_free (preg
->fastmap
);
509 preg
->fastmap
= NULL
;
516 weak_alias (__regcomp
, regcomp
)
519 /* Returns a message corresponding to an error code, ERRCODE, returned
520 from either regcomp or regexec. We don't use PREG here. */
523 regerror (errcode
, preg
, errbuf
, errbuf_size
)
533 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
534 / sizeof (__re_error_msgid_idx
[0])), 0))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
541 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
543 msg_size
= strlen (msg
) + 1; /* Includes the null. */
545 if (BE (errbuf_size
!= 0, 1))
547 if (BE (msg_size
> errbuf_size
, 0))
549 #if defined HAVE_MEMPCPY || defined _LIBC
550 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
552 memcpy (errbuf
, msg
, errbuf_size
- 1);
553 errbuf
[errbuf_size
- 1] = 0;
557 memcpy (errbuf
, msg
, msg_size
);
563 weak_alias (__regerror
, regerror
)
566 /* Free dynamically allocated space used by PREG. */
573 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
574 if (BE (dfa
!= NULL
, 1))
576 re_free (dfa
->subexps
);
578 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
580 re_token_t
*node
= dfa
->nodes
+ i
;
581 #ifdef RE_ENABLE_I18N
582 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
583 free_charset (node
->opr
.mbcset
);
585 #endif /* RE_ENABLE_I18N */
586 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
587 re_free (node
->opr
.sbcset
);
589 re_free (dfa
->nexts
);
590 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
592 if (dfa
->eclosures
!= NULL
)
593 re_node_set_free (dfa
->eclosures
+ i
);
594 if (dfa
->inveclosures
!= NULL
)
595 re_node_set_free (dfa
->inveclosures
+ i
);
596 if (dfa
->edests
!= NULL
)
597 re_node_set_free (dfa
->edests
+ i
);
599 re_free (dfa
->edests
);
600 re_free (dfa
->eclosures
);
601 re_free (dfa
->inveclosures
);
602 re_free (dfa
->nodes
);
604 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
606 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
607 for (j
= 0; j
< entry
->num
; ++j
)
609 re_dfastate_t
*state
= entry
->array
[j
];
610 if (state
->entrance_nodes
!= &state
->nodes
)
612 re_node_set_free (state
->entrance_nodes
);
613 re_free (state
->entrance_nodes
);
615 re_node_set_free (&state
->nodes
);
616 re_free (state
->trtable
);
617 re_free (state
->trtable_search
);
620 re_free (entry
->array
);
622 re_free (dfa
->state_table
);
624 if (dfa
->word_char
!= NULL
)
625 re_free (dfa
->word_char
);
627 re_free (dfa
->re_str
);
631 re_free (preg
->fastmap
);
634 weak_alias (__regfree
, regfree
)
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
640 #if defined _REGEX_RE_COMP || defined _LIBC
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf
;
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
660 if (!re_comp_buf
.buffer
)
661 return gettext ("No previous regular expression");
665 if (re_comp_buf
.buffer
)
667 fastmap
= re_comp_buf
.fastmap
;
668 re_comp_buf
.fastmap
= NULL
;
669 __regfree (&re_comp_buf
);
670 re_comp_buf
.buffer
= NULL
;
671 re_comp_buf
.allocated
= 0;
672 re_comp_buf
.fastmap
= fastmap
;
675 if (re_comp_buf
.fastmap
== NULL
)
677 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
678 if (re_comp_buf
.fastmap
== NULL
)
679 return (char *) gettext (__re_error_msgid
680 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
683 /* Since `re_exec' always passes NULL for the `regs' argument, we
684 don't need to initialize the pattern buffer fields which affect it. */
686 /* Match anchors at newlines. */
687 re_comp_buf
.newline_anchor
= 1;
689 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
694 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
695 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
699 static void __attribute__ ((unused
))
702 __regfree (&re_comp_buf
);
704 text_set_element (__libc_subfreeres
, free_mem
);
707 #endif /* _REGEX_RE_COMP */
709 /* Internal entry point.
710 Compile the regular expression PATTERN, whose length is LENGTH.
711 SYNTAX indicate regular expression's syntax. */
714 re_compile_internal (preg
, pattern
, length
, syntax
)
716 const char * pattern
;
720 reg_errcode_t err
= REG_NOERROR
;
724 /* Initialize the pattern buffer. */
725 preg
->fastmap_accurate
= 0;
726 preg
->syntax
= syntax
;
727 preg
->not_bol
= preg
->not_eol
= 0;
731 /* Initialize the dfa. */
732 dfa
= (re_dfa_t
*) preg
->buffer
;
733 if (preg
->allocated
< sizeof (re_dfa_t
))
735 /* If zero allocated, but buffer is non-null, try to realloc
736 enough space. This loses if buffer's address is bogus, but
737 that is the user's responsibility. If ->buffer is NULL this
738 is a simple allocation. */
739 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
742 preg
->allocated
= sizeof (re_dfa_t
);
744 preg
->buffer
= (unsigned char *) dfa
;
745 preg
->used
= sizeof (re_dfa_t
);
747 err
= init_dfa (dfa
, length
);
748 if (BE (err
!= REG_NOERROR
, 0))
755 dfa
->re_str
= re_malloc (char, length
+ 1);
756 strncpy (dfa
->re_str
, pattern
, length
+ 1);
759 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
761 if (BE (err
!= REG_NOERROR
, 0))
768 /* Parse the regular expression, and build a structure tree. */
770 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
771 if (BE (dfa
->str_tree
== NULL
, 0))
772 goto re_compile_internal_free_return
;
774 /* Analyze the tree and collect information which is necessary to
777 if (BE (err
!= REG_NOERROR
, 0))
778 goto re_compile_internal_free_return
;
780 /* Then create the initial state of the dfa. */
781 err
= create_initial_state (dfa
);
782 if (BE (err
!= REG_NOERROR
, 0))
783 goto re_compile_internal_free_return
;
785 re_compile_internal_free_return
:
786 /* Release work areas. */
787 free_workarea_compile (preg
);
788 re_string_destruct (®exp
);
793 /* Initialize DFA. We use the length of the regular expression PAT_LEN
794 as the initial length of some arrays. */
797 init_dfa (dfa
, pat_len
)
803 memset (dfa
, '\0', sizeof (re_dfa_t
));
805 dfa
->nodes_alloc
= pat_len
+ 1;
806 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
808 dfa
->states_alloc
= pat_len
+ 1;
810 /* table_size = 2 ^ ceil(log pat_len) */
811 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
812 if (table_size
> pat_len
)
815 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
816 dfa
->state_hash_mask
= table_size
- 1;
818 dfa
->subexps_alloc
= 1;
819 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
820 dfa
->word_char
= NULL
;
822 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
823 || dfa
->subexps
== NULL
, 0))
825 /* We don't bother to free anything which was allocated. Very
826 soon the process will go down anyway. */
828 dfa
->state_table
= NULL
;
835 /* Initialize WORD_CHAR table, which indicate which character is
836 "word". In this case "word" means that it is the word construction
837 character used by some operators like "\<", "\>", etc. */
844 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
845 if (BE (dfa
->word_char
== NULL
, 0))
847 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
848 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
849 if (isalnum (ch
) || ch
== '_')
850 dfa
->word_char
[i
] |= 1 << j
;
854 /* Free the work area which are only used while compiling. */
857 free_workarea_compile (preg
)
860 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
861 free_bin_tree (dfa
->str_tree
);
862 dfa
->str_tree
= NULL
;
865 /* Create initial states for all contexts. */
868 create_initial_state (dfa
)
873 re_node_set init_nodes
;
875 /* Initial states have the epsilon closure of the node which is
876 the first node of the regular expression. */
877 first
= dfa
->str_tree
->first
;
878 dfa
->init_node
= first
;
879 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
880 if (BE (err
!= REG_NOERROR
, 0))
883 /* The back-references which are in initial states can epsilon transit,
884 since in this case all of the subexpressions can be null.
885 Then we add epsilon closures of the nodes which are the next nodes of
886 the back-references. */
887 if (dfa
->nbackref
> 0)
888 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
890 int node_idx
= init_nodes
.elems
[i
];
891 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
894 if (type
!= OP_BACK_REF
)
896 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
898 re_token_t
*clexp_node
;
899 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
900 if (clexp_node
->type
== OP_CLOSE_SUBEXP
901 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
904 if (clexp_idx
== init_nodes
.nelem
)
907 if (type
== OP_BACK_REF
)
909 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
910 if (!re_node_set_contains (&init_nodes
, dest_idx
))
912 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
918 /* It must be the first time to invoke acquire_state. */
919 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
920 /* We don't check ERR here, since the initial state must not be NULL. */
921 if (BE (dfa
->init_state
== NULL
, 0))
923 if (dfa
->init_state
->has_constraint
)
925 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
927 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
929 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
933 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
934 || dfa
->init_state_begbuf
== NULL
, 0))
938 dfa
->init_state_word
= dfa
->init_state_nl
939 = dfa
->init_state_begbuf
= dfa
->init_state
;
941 re_node_set_free (&init_nodes
);
945 /* Analyze the structure tree, and calculate "first", "next", "edest",
946 "eclosure", and "inveclosure". */
955 /* Allocate arrays. */
956 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
957 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
958 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
959 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
960 if (BE (dfa
->nexts
== NULL
|| dfa
->edests
== NULL
961 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
963 /* Initialize them. */
964 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
967 re_node_set_init_empty (dfa
->edests
+ i
);
968 re_node_set_init_empty (dfa
->eclosures
+ i
);
969 re_node_set_init_empty (dfa
->inveclosures
+ i
);
972 ret
= analyze_tree (dfa
, dfa
->str_tree
);
973 if (BE (ret
== REG_NOERROR
, 1))
975 ret
= calc_eclosure (dfa
);
976 if (ret
== REG_NOERROR
)
977 calc_inveclosure (dfa
);
982 /* Helper functions for analyze.
983 This function calculate "first", "next", and "edest" for the subtree
984 whose root is NODE. */
987 analyze_tree (dfa
, node
)
992 if (node
->first
== -1)
993 calc_first (dfa
, node
);
994 if (node
->next
== -1)
995 calc_next (dfa
, node
);
996 if (node
->eclosure
.nelem
== 0)
997 calc_epsdest (dfa
, node
);
998 /* Calculate "first" etc. for the left child. */
999 if (node
->left
!= NULL
)
1001 ret
= analyze_tree (dfa
, node
->left
);
1002 if (BE (ret
!= REG_NOERROR
, 0))
1005 /* Calculate "first" etc. for the right child. */
1006 if (node
->right
!= NULL
)
1008 ret
= analyze_tree (dfa
, node
->right
);
1009 if (BE (ret
!= REG_NOERROR
, 0))
1015 /* Calculate "first" for the node NODE. */
1017 calc_first (dfa
, node
)
1022 idx
= node
->node_idx
;
1023 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1028 case OP_OPEN_BRACKET
:
1029 case OP_CLOSE_BRACKET
:
1030 case OP_OPEN_DUP_NUM
:
1031 case OP_CLOSE_DUP_NUM
:
1032 case OP_NON_MATCH_LIST
:
1033 case OP_OPEN_COLL_ELEM
:
1034 case OP_CLOSE_COLL_ELEM
:
1035 case OP_OPEN_EQUIV_CLASS
:
1036 case OP_CLOSE_EQUIV_CLASS
:
1037 case OP_OPEN_CHAR_CLASS
:
1038 case OP_CLOSE_CHAR_CLASS
:
1039 /* These must not be appeared here. */
1045 case OP_DUP_ASTERISK
:
1046 case OP_DUP_QUESTION
:
1047 #ifdef RE_ENABLE_I18N
1048 case COMPLEX_BRACKET
:
1049 #endif /* RE_ENABLE_I18N */
1050 case SIMPLE_BRACKET
:
1053 case OP_OPEN_SUBEXP
:
1054 case OP_CLOSE_SUBEXP
:
1059 assert (node
->left
!= NULL
);
1061 if (node
->left
->first
== -1)
1062 calc_first (dfa
, node
->left
);
1063 node
->first
= node
->left
->first
;
1068 /* else fall through */
1071 assert (node
->left
!= NULL
);
1073 if (node
->left
->first
== -1)
1074 calc_first (dfa
, node
->left
);
1075 node
->first
= node
->left
->first
;
1080 /* Calculate "next" for the node NODE. */
1083 calc_next (dfa
, node
)
1088 bin_tree_t
*parent
= node
->parent
;
1092 idx
= node
->node_idx
;
1093 if (node
->type
== 0)
1094 dfa
->nexts
[idx
] = node
->next
;
1098 idx
= parent
->node_idx
;
1099 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1103 case OP_DUP_ASTERISK
:
1108 if (parent
->left
== node
)
1110 if (parent
->right
->first
== -1)
1111 calc_first (dfa
, parent
->right
);
1112 node
->next
= parent
->right
->first
;
1115 /* else fall through */
1117 if (parent
->next
== -1)
1118 calc_next (dfa
, parent
);
1119 node
->next
= parent
->next
;
1122 idx
= node
->node_idx
;
1123 if (node
->type
== 0)
1124 dfa
->nexts
[idx
] = node
->next
;
1127 /* Calculate "edest" for the node NODE. */
1130 calc_epsdest (dfa
, node
)
1135 idx
= node
->node_idx
;
1136 if (node
->type
== 0)
1138 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1139 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1140 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1142 if (node
->left
->first
== -1)
1143 calc_first (dfa
, node
->left
);
1144 if (node
->next
== -1)
1145 calc_next (dfa
, node
);
1146 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1149 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1152 if (node
->left
!= NULL
)
1154 if (node
->left
->first
== -1)
1155 calc_first (dfa
, node
->left
);
1156 left
= node
->left
->first
;
1160 if (node
->next
== -1)
1161 calc_next (dfa
, node
);
1164 if (node
->right
!= NULL
)
1166 if (node
->right
->first
== -1)
1167 calc_first (dfa
, node
->right
);
1168 right
= node
->right
->first
;
1172 if (node
->next
== -1)
1173 calc_next (dfa
, node
);
1176 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1178 else if (dfa
->nodes
[idx
].type
== ANCHOR
1179 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1180 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1181 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1182 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1186 /* Duplicate the epsilon closure of the node ROOT_NODE.
1187 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1188 to their own constraint. */
1190 static reg_errcode_t
1191 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1194 int top_org_node
, top_clone_node
, root_node
;
1195 unsigned int init_constraint
;
1198 int org_node
, clone_node
, ret
;
1199 unsigned int constraint
= init_constraint
;
1200 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1202 int org_dest
, clone_dest
;
1203 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1205 /* If the back reference epsilon-transit, its destination must
1206 also have the constraint. Then duplicate the epsilon closure
1207 of the destination of the back reference, and store it in
1208 edests of the back reference. */
1209 org_dest
= dfa
->nexts
[org_node
];
1210 re_node_set_empty (dfa
->edests
+ clone_node
);
1211 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1212 if (BE (err
!= REG_NOERROR
, 0))
1214 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1215 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1216 if (BE (ret
< 0, 0))
1219 else if (dfa
->edests
[org_node
].nelem
== 0)
1221 /* In case of the node can't epsilon-transit, don't duplicate the
1222 destination and store the original destination as the
1223 destination of the node. */
1224 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1227 else if (dfa
->edests
[org_node
].nelem
== 1)
1229 /* In case of the node can epsilon-transit, and it has only one
1231 org_dest
= dfa
->edests
[org_node
].elems
[0];
1232 re_node_set_empty (dfa
->edests
+ clone_node
);
1233 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1235 /* In case of the node has another constraint, append it. */
1236 if (org_node
== root_node
&& clone_node
!= org_node
)
1238 /* ...but if the node is root_node itself, it means the
1239 epsilon closure have a loop, then tie it to the
1240 destination of the root_node. */
1241 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1243 if (BE (ret
< 0, 0))
1247 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1249 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1250 if (BE (err
!= REG_NOERROR
, 0))
1252 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1253 if (BE (ret
< 0, 0))
1256 else /* dfa->edests[org_node].nelem == 2 */
1258 /* In case of the node can epsilon-transit, and it has two
1260 org_dest
= dfa
->edests
[org_node
].elems
[0];
1261 re_node_set_empty (dfa
->edests
+ clone_node
);
1262 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1263 if (BE (err
!= REG_NOERROR
, 0))
1265 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1266 if (BE (ret
< 0, 0))
1269 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
, root_node
,
1271 if (BE (err
!= REG_NOERROR
, 0))
1274 org_dest
= dfa
->edests
[org_node
].elems
[1];
1275 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1276 if (BE (err
!= REG_NOERROR
, 0))
1278 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1279 if (BE (ret
< 0, 0))
1282 org_node
= org_dest
;
1283 clone_node
= clone_dest
;
1288 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1289 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1290 otherwise return the error code. */
1292 static reg_errcode_t
1293 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1295 int *new_idx
, org_idx
;
1296 unsigned int constraint
;
1301 dup
= dfa
->nodes
[org_idx
];
1302 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1303 if (BE (dup_idx
== -1, 0))
1305 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1306 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1307 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1308 dfa
->nodes
[dup_idx
].duplicated
= 1;
1309 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1310 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1311 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1318 calc_inveclosure (dfa
)
1322 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1324 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1326 dest
= dfa
->eclosures
[src
].elems
[idx
];
1327 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1332 /* Calculate "eclosure" for all the node in DFA. */
1334 static reg_errcode_t
1338 int node_idx
, incomplete
;
1340 assert (dfa
->nodes_len
> 0);
1343 /* For each nodes, calculate epsilon closure. */
1344 for (node_idx
= 0; ; ++node_idx
)
1347 re_node_set eclosure_elem
;
1348 if (node_idx
== dfa
->nodes_len
)
1357 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1359 /* If we have already calculated, skip it. */
1360 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1362 /* Calculate epsilon closure of `node_idx'. */
1363 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1364 if (BE (err
!= REG_NOERROR
, 0))
1367 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1370 re_node_set_free (&eclosure_elem
);
1376 /* Calculate epsilon closure of NODE. */
1378 static reg_errcode_t
1379 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1380 re_node_set
*new_set
;
1385 unsigned int constraint
;
1387 re_node_set eclosure
;
1389 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1390 if (BE (err
!= REG_NOERROR
, 0))
1393 /* This indicates that we are calculating this node now.
1394 We reference this value to avoid infinite loop. */
1395 dfa
->eclosures
[node
].nelem
= -1;
1397 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1398 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1399 /* If the current node has constraints, duplicate all nodes.
1400 Since they must inherit the constraints. */
1401 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1403 int org_node
, cur_node
;
1404 org_node
= cur_node
= node
;
1405 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1406 if (BE (err
!= REG_NOERROR
, 0))
1410 /* Expand each epsilon destination nodes. */
1411 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1412 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1414 re_node_set eclosure_elem
;
1415 int edest
= dfa
->edests
[node
].elems
[i
];
1416 /* If calculating the epsilon closure of `edest' is in progress,
1417 return intermediate result. */
1418 if (dfa
->eclosures
[edest
].nelem
== -1)
1423 /* If we haven't calculated the epsilon closure of `edest' yet,
1424 calculate now. Otherwise use calculated epsilon closure. */
1425 if (dfa
->eclosures
[edest
].nelem
== 0)
1427 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1428 if (BE (err
!= REG_NOERROR
, 0))
1432 eclosure_elem
= dfa
->eclosures
[edest
];
1433 /* Merge the epsilon closure of `edest'. */
1434 re_node_set_merge (&eclosure
, &eclosure_elem
);
1435 /* If the epsilon closure of `edest' is incomplete,
1436 the epsilon closure of this node is also incomplete. */
1437 if (dfa
->eclosures
[edest
].nelem
== 0)
1440 re_node_set_free (&eclosure_elem
);
1444 /* Epsilon closures include itself. */
1445 re_node_set_insert (&eclosure
, node
);
1446 if (incomplete
&& !root
)
1447 dfa
->eclosures
[node
].nelem
= 0;
1449 dfa
->eclosures
[node
] = eclosure
;
1450 *new_set
= eclosure
;
1454 /* Functions for token which are used in the parser. */
1456 /* Fetch a token from INPUT.
1457 We must not use this function inside bracket expressions. */
1460 fetch_token (input
, syntax
)
1462 reg_syntax_t syntax
;
1466 consumed_byte
= peek_token (&token
, input
, syntax
);
1467 re_string_skip_bytes (input
, consumed_byte
);
1471 /* Peek a token from INPUT, and return the length of the token.
1472 We must not use this function inside bracket expressions. */
1475 peek_token (token
, input
, syntax
)
1478 reg_syntax_t syntax
;
1482 if (re_string_eoi (input
))
1484 token
->type
= END_OF_RE
;
1488 c
= re_string_peek_byte (input
, 0);
1491 #ifdef RE_ENABLE_I18N
1492 token
->mb_partial
= 0;
1493 if (MB_CUR_MAX
> 1 &&
1494 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1496 token
->type
= CHARACTER
;
1497 token
->mb_partial
= 1;
1504 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1506 token
->type
= BACK_SLASH
;
1510 c2
= re_string_peek_byte_case (input
, 1);
1512 token
->type
= CHARACTER
;
1516 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1517 token
->type
= OP_ALT
;
1519 case '1': case '2': case '3': case '4': case '5':
1520 case '6': case '7': case '8': case '9':
1521 if (!(syntax
& RE_NO_BK_REFS
))
1523 token
->type
= OP_BACK_REF
;
1524 token
->opr
.idx
= c2
- '0';
1528 if (!(syntax
& RE_NO_GNU_OPS
))
1530 token
->type
= ANCHOR
;
1531 token
->opr
.idx
= WORD_FIRST
;
1535 if (!(syntax
& RE_NO_GNU_OPS
))
1537 token
->type
= ANCHOR
;
1538 token
->opr
.idx
= WORD_LAST
;
1542 if (!(syntax
& RE_NO_GNU_OPS
))
1544 token
->type
= ANCHOR
;
1545 token
->opr
.idx
= WORD_DELIM
;
1549 if (!(syntax
& RE_NO_GNU_OPS
))
1551 token
->type
= ANCHOR
;
1552 token
->opr
.idx
= INSIDE_WORD
;
1556 if (!(syntax
& RE_NO_GNU_OPS
))
1557 token
->type
= OP_WORD
;
1560 if (!(syntax
& RE_NO_GNU_OPS
))
1561 token
->type
= OP_NOTWORD
;
1564 if (!(syntax
& RE_NO_GNU_OPS
))
1566 token
->type
= ANCHOR
;
1567 token
->opr
.idx
= BUF_FIRST
;
1571 if (!(syntax
& RE_NO_GNU_OPS
))
1573 token
->type
= ANCHOR
;
1574 token
->opr
.idx
= BUF_LAST
;
1578 if (!(syntax
& RE_NO_BK_PARENS
))
1579 token
->type
= OP_OPEN_SUBEXP
;
1582 if (!(syntax
& RE_NO_BK_PARENS
))
1583 token
->type
= OP_CLOSE_SUBEXP
;
1586 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1587 token
->type
= OP_DUP_PLUS
;
1590 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1591 token
->type
= OP_DUP_QUESTION
;
1594 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1595 token
->type
= OP_OPEN_DUP_NUM
;
1598 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1599 token
->type
= OP_CLOSE_DUP_NUM
;
1607 token
->type
= CHARACTER
;
1611 if (syntax
& RE_NEWLINE_ALT
)
1612 token
->type
= OP_ALT
;
1615 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1616 token
->type
= OP_ALT
;
1619 token
->type
= OP_DUP_ASTERISK
;
1622 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1623 token
->type
= OP_DUP_PLUS
;
1626 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1627 token
->type
= OP_DUP_QUESTION
;
1630 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1631 token
->type
= OP_OPEN_DUP_NUM
;
1634 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1635 token
->type
= OP_CLOSE_DUP_NUM
;
1638 if (syntax
& RE_NO_BK_PARENS
)
1639 token
->type
= OP_OPEN_SUBEXP
;
1642 if (syntax
& RE_NO_BK_PARENS
)
1643 token
->type
= OP_CLOSE_SUBEXP
;
1646 token
->type
= OP_OPEN_BRACKET
;
1649 token
->type
= OP_PERIOD
;
1652 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1653 re_string_cur_idx (input
) != 0)
1655 char prev
= re_string_peek_byte (input
, -1);
1656 if (prev
!= '|' && prev
!= '(' &&
1657 (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n'))
1660 token
->type
= ANCHOR
;
1661 token
->opr
.idx
= LINE_FIRST
;
1664 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1665 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1668 re_string_skip_bytes (input
, 1);
1669 peek_token (&next
, input
, syntax
);
1670 re_string_skip_bytes (input
, -1);
1671 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1674 token
->type
= ANCHOR
;
1675 token
->opr
.idx
= LINE_LAST
;
1683 /* Peek a token from INPUT, and return the length of the token.
1684 We must not use this function out of bracket expressions. */
1687 peek_token_bracket (token
, input
, syntax
)
1690 reg_syntax_t syntax
;
1693 if (re_string_eoi (input
))
1695 token
->type
= END_OF_RE
;
1698 c
= re_string_peek_byte (input
, 0);
1701 #ifdef RE_ENABLE_I18N
1702 if (MB_CUR_MAX
> 1 &&
1703 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1705 token
->type
= CHARACTER
;
1708 #endif /* RE_ENABLE_I18N */
1710 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1712 /* In this case, '\' escape a character. */
1714 re_string_skip_bytes (input
, 1);
1715 c2
= re_string_peek_byte (input
, 0);
1717 token
->type
= CHARACTER
;
1720 if (c
== '[') /* '[' is a special char in a bracket exps. */
1724 c2
= re_string_peek_byte (input
, 1);
1730 token
->type
= OP_OPEN_COLL_ELEM
;
1733 token
->type
= OP_OPEN_EQUIV_CLASS
;
1736 if (syntax
& RE_CHAR_CLASSES
)
1738 token
->type
= OP_OPEN_CHAR_CLASS
;
1741 /* else fall through. */
1743 token
->type
= CHARACTER
;
1753 token
->type
= OP_CHARSET_RANGE
;
1756 token
->type
= OP_CLOSE_BRACKET
;
1759 token
->type
= OP_NON_MATCH_LIST
;
1762 token
->type
= CHARACTER
;
1767 /* Functions for parser. */
1769 /* Entry point of the parser.
1770 Parse the regular expression REGEXP and return the structure tree.
1771 If an error is occured, ERR is set by error code, and return NULL.
1772 This function build the following tree, from regular expression <reg_exp>:
1778 CAT means concatenation.
1779 EOR means end of regular expression. */
1782 parse (regexp
, preg
, syntax
, err
)
1783 re_string_t
*regexp
;
1785 reg_syntax_t syntax
;
1788 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1789 bin_tree_t
*tree
, *eor
, *root
;
1790 re_token_t current_token
;
1792 current_token
= fetch_token (regexp
, syntax
);
1793 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1794 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1796 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1797 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1799 root
= create_tree (tree
, eor
, CONCAT
, 0);
1802 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1810 /* This function build the following tree, from regular expression
1811 <branch1>|<branch2>:
1817 ALT means alternative, which represents the operator `|'. */
1820 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1821 re_string_t
*regexp
;
1824 reg_syntax_t syntax
;
1828 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1829 bin_tree_t
*tree
, *branch
= NULL
;
1831 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1832 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1835 while (token
->type
== OP_ALT
)
1837 re_token_t alt_token
= *token
;
1838 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1839 *token
= fetch_token (regexp
, syntax
);
1840 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1841 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1843 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1844 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1846 free_bin_tree (tree
);
1852 tree
= create_tree (tree
, branch
, 0, new_idx
);
1853 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1858 dfa
->has_plural_match
= 1;
1863 /* This function build the following tree, from regular expression
1870 CAT means concatenation. */
1873 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1874 re_string_t
*regexp
;
1877 reg_syntax_t syntax
;
1881 bin_tree_t
*tree
, *exp
;
1882 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1883 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1886 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1887 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1889 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1890 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1892 free_bin_tree (tree
);
1895 if (tree
!= NULL
&& exp
!= NULL
)
1897 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1904 else if (tree
== NULL
)
1906 /* Otherwise exp == NULL, we don't need to create new tree. */
1911 /* This function build the following tree, from regular expression a*:
1918 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1919 re_string_t
*regexp
;
1922 reg_syntax_t syntax
;
1926 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1929 switch (token
->type
)
1932 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1933 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1934 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1939 #ifdef RE_ENABLE_I18N
1942 while (!re_string_eoi (regexp
)
1943 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1945 bin_tree_t
*mbc_remain
;
1946 *token
= fetch_token (regexp
, syntax
);
1947 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1948 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1949 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1950 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1951 return *err
= REG_ESPACE
, NULL
;
1956 case OP_OPEN_SUBEXP
:
1957 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
1958 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1961 case OP_OPEN_BRACKET
:
1962 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
1963 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1967 if (BE (preg
->re_nsub
< token
->opr
.idx
1968 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
1973 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1974 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1975 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1981 dfa
->has_mb_node
= 1;
1983 case OP_DUP_ASTERISK
:
1985 case OP_DUP_QUESTION
:
1986 case OP_OPEN_DUP_NUM
:
1987 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1992 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
1994 *token
= fetch_token (regexp
, syntax
);
1995 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1997 /* else fall through */
1998 case OP_CLOSE_SUBEXP
:
1999 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2000 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2005 /* else fall through */
2006 case OP_CLOSE_DUP_NUM
:
2007 /* We treat it as a normal character. */
2009 /* Then we can these characters as normal characters. */
2010 token
->type
= CHARACTER
;
2011 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2012 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2013 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2020 if (dfa
->word_char
== NULL
)
2022 *err
= init_word_char (dfa
);
2023 if (BE (*err
!= REG_NOERROR
, 0))
2026 if (token
->opr
.ctx_type
== WORD_DELIM
)
2028 bin_tree_t
*tree_first
, *tree_last
;
2029 int idx_first
, idx_last
;
2030 token
->opr
.ctx_type
= WORD_FIRST
;
2031 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2032 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2033 token
->opr
.ctx_type
= WORD_LAST
;
2034 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2035 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2036 token
->type
= OP_ALT
;
2037 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2038 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2039 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2040 || tree_first
== NULL
|| tree_last
== NULL
2041 || tree
== NULL
, 0))
2049 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2050 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2051 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2052 return *err
= REG_ESPACE
, NULL
;
2054 /* We must return here, since ANCHORs can't be followed
2055 by repetition operators.
2056 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2057 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2058 *token
= fetch_token (regexp
, syntax
);
2061 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2062 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2063 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2069 dfa
->has_mb_node
= 1;
2072 tree
= build_word_op (dfa
, 0, err
);
2073 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2077 tree
= build_word_op (dfa
, 1, err
);
2078 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2088 /* Must not happen? */
2094 *token
= fetch_token (regexp
, syntax
);
2096 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2097 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2099 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2100 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2102 dfa
->has_plural_match
= 1;
2108 /* This function build the following tree, from regular expression
2116 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2117 re_string_t
*regexp
;
2120 reg_syntax_t syntax
;
2124 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2125 bin_tree_t
*tree
, *left_par
, *right_par
;
2128 cur_nsub
= preg
->re_nsub
++;
2129 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2131 re_subexp_t
*new_array
;
2132 dfa
->subexps_alloc
*= 2;
2133 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2134 if (BE (new_array
== NULL
, 0))
2136 dfa
->subexps_alloc
/= 2;
2140 dfa
->subexps
= new_array
;
2142 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2143 dfa
->subexps
[cur_nsub
].end
= -1;
2145 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2146 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2147 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2152 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2153 *token
= fetch_token (regexp
, syntax
);
2155 /* The subexpression may be a null string. */
2156 if (token
->type
== OP_CLOSE_SUBEXP
)
2160 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2161 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2164 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2166 free_bin_tree (tree
);
2170 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2171 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2172 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2173 tree
= ((tree
== NULL
) ? right_par
2174 : create_tree (tree
, right_par
, CONCAT
, 0));
2175 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2176 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2181 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2186 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2189 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2190 bin_tree_t
*dup_elem
;
2191 re_string_t
*regexp
;
2194 reg_syntax_t syntax
;
2197 re_token_t dup_token
;
2198 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2199 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2200 re_token_t start_token
= *token
;
2201 if (token
->type
== OP_OPEN_DUP_NUM
)
2205 int start
= fetch_number (regexp
, token
, syntax
);
2209 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2210 start
= 0; /* We treat "{,m}" as "{0,m}". */
2213 *err
= REG_BADBR
; /* <re>{} is invalid. */
2217 if (BE (start
!= -2, 1))
2219 /* We treat "{n}" as "{n,n}". */
2220 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2221 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2222 ? fetch_number (regexp
, token
, syntax
) : -2));
2224 if (BE (start
== -2 || end
== -2, 0))
2226 /* Invalid sequence. */
2227 if (token
->type
== OP_CLOSE_DUP_NUM
)
2228 goto parse_dup_op_invalid_interval
;
2230 goto parse_dup_op_ebrace
;
2232 if (BE (start
== 0 && end
== 0, 0))
2234 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2235 *token
= fetch_token (regexp
, syntax
);
2236 free_bin_tree (dup_elem
);
2240 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2242 for (i
= 0; i
< start
; ++i
)
2245 work_tree
= duplicate_tree (elem
, dfa
);
2246 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2247 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2248 goto parse_dup_op_espace
;
2253 /* We treat "<re>{0,}" as "<re>*". */
2254 dup_token
.type
= OP_DUP_ASTERISK
;
2257 elem
= duplicate_tree (elem
, dfa
);
2258 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2259 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2260 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2261 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2262 || tree
== NULL
, 0))
2263 goto parse_dup_op_espace
;
2267 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2268 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2269 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2270 goto parse_dup_op_espace
;
2273 else if (end
- start
> 0)
2275 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2276 dup_token
.type
= OP_DUP_QUESTION
;
2279 elem
= duplicate_tree (elem
, dfa
);
2280 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2281 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2282 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2283 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2284 goto parse_dup_op_espace
;
2288 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2289 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2290 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2291 goto parse_dup_op_espace
;
2293 for (i
= 1; i
< end
- start
; ++i
)
2295 work_tree
= duplicate_tree (elem
, dfa
);
2296 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2297 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2307 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2308 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2309 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2315 *token
= fetch_token (regexp
, syntax
);
2318 parse_dup_op_espace
:
2319 free_bin_tree (tree
);
2323 parse_dup_op_ebrace
:
2324 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2329 goto parse_dup_op_rollback
;
2330 parse_dup_op_invalid_interval
:
2331 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2336 parse_dup_op_rollback
:
2337 re_string_set_index (regexp
, start_idx
);
2338 *token
= start_token
;
2339 token
->type
= CHARACTER
;
2343 /* Size of the names for collating symbol/equivalence_class/character_class.
2344 I'm not sure, but maybe enough. */
2345 #define BRACKET_NAME_BUF_SIZE 32
2348 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2349 Build the range expression which starts from START_ELEM, and ends
2350 at END_ELEM. The result are written to MBCSET and SBCSET.
2351 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2352 mbcset->range_ends, is a pointer argument sinse we may
2355 static reg_errcode_t
2356 # ifdef RE_ENABLE_I18N
2357 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2358 re_charset_t
*mbcset
;
2360 # else /* not RE_ENABLE_I18N */
2361 build_range_exp (sbcset
, start_elem
, end_elem
)
2362 # endif /* not RE_ENABLE_I18N */
2363 re_bitset_ptr_t sbcset
;
2364 bracket_elem_t
*start_elem
, *end_elem
;
2366 unsigned int start_ch
, end_ch
;
2367 /* Equivalence Classes and Character Classes can't be a range start/end. */
2368 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2369 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2373 /* We can handle no multi character collating elements without libc
2375 if (BE ((start_elem
->type
== COLL_SYM
2376 && strlen ((char *) start_elem
->opr
.name
) > 1)
2377 || (end_elem
->type
== COLL_SYM
2378 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2379 return REG_ECOLLATE
;
2381 # ifdef RE_ENABLE_I18N
2383 wchar_t wc
, start_wc
, end_wc
;
2384 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2386 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2387 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2389 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2390 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2392 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2393 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2394 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2395 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2396 cmp_buf
[0] = start_wc
;
2397 cmp_buf
[4] = end_wc
;
2398 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2401 /* Check the space of the arrays. */
2402 if (*range_alloc
== mbcset
->nranges
)
2404 /* There are not enough space, need realloc. */
2405 wchar_t *new_array_start
, *new_array_end
;
2408 /* +1 in case of mbcset->nranges is 0. */
2409 new_nranges
= 2 * mbcset
->nranges
+ 1;
2410 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2411 are NULL if *range_alloc == 0. */
2412 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2414 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2417 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2420 mbcset
->range_starts
= new_array_start
;
2421 mbcset
->range_ends
= new_array_end
;
2422 *range_alloc
= new_nranges
;
2425 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2426 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2428 /* Build the table for single byte characters. */
2429 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2432 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2433 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2434 bitset_set (sbcset
, wc
);
2437 # else /* not RE_ENABLE_I18N */
2440 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2441 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2443 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2444 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2446 if (start_ch
> end_ch
)
2448 /* Build the table for single byte characters. */
2449 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2450 if (start_ch
<= ch
&& ch
<= end_ch
)
2451 bitset_set (sbcset
, ch
);
2453 # endif /* not RE_ENABLE_I18N */
2456 #endif /* not _LIBC */
2459 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2460 Build the collating element which is represented by NAME.
2461 The result are written to MBCSET and SBCSET.
2462 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2463 pointer argument since we may update it. */
2465 static reg_errcode_t
2466 # ifdef RE_ENABLE_I18N
2467 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2468 re_charset_t
*mbcset
;
2469 int *coll_sym_alloc
;
2470 # else /* not RE_ENABLE_I18N */
2471 build_collating_symbol (sbcset
, name
)
2472 # endif /* not RE_ENABLE_I18N */
2473 re_bitset_ptr_t sbcset
;
2474 const unsigned char *name
;
2476 size_t name_len
= strlen ((const char *) name
);
2477 if (BE (name_len
!= 1, 0))
2478 return REG_ECOLLATE
;
2481 bitset_set (sbcset
, name
[0]);
2485 #endif /* not _LIBC */
2487 /* This function parse bracket expression like "[abc]", "[a-c]",
2491 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2492 re_string_t
*regexp
;
2495 reg_syntax_t syntax
;
2499 const unsigned char *collseqmb
;
2500 const char *collseqwc
;
2503 const int32_t *symb_table
;
2504 const unsigned char *extra
;
2506 /* Local function for parse_bracket_exp used in _LIBC environement.
2507 Seek the collating symbol entry correspondings to NAME.
2508 Return the index of the symbol in the SYMB_TABLE. */
2510 static inline int32_t
2511 seek_collating_symbol_entry (name
, name_len
)
2512 const unsigned char *name
;
2515 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2516 int32_t elem
= hash
% table_size
;
2517 int32_t second
= hash
% (table_size
- 2);
2518 while (symb_table
[2 * elem
] != 0)
2520 /* First compare the hashing value. */
2521 if (symb_table
[2 * elem
] == hash
2522 /* Compare the length of the name. */
2523 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2524 /* Compare the name. */
2525 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2528 /* Yep, this is the entry. */
2538 /* Local function for parse_bracket_exp used in _LIBC environement.
2539 Look up the collation sequence value of BR_ELEM.
2540 Return the value if succeeded, UINT_MAX otherwise. */
2542 static inline unsigned int
2543 lookup_collation_sequence_value (br_elem
)
2544 bracket_elem_t
*br_elem
;
2546 if (br_elem
->type
== SB_CHAR
)
2549 if (MB_CUR_MAX == 1)
2552 return collseqmb
[br_elem
->opr
.ch
];
2555 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2556 return collseq_table_lookup (collseqwc
, wc
);
2559 else if (br_elem
->type
== MB_CHAR
)
2561 return collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2563 else if (br_elem
->type
== COLL_SYM
)
2565 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2569 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2571 if (symb_table
[2 * elem
] != 0)
2573 /* We found the entry. */
2574 idx
= symb_table
[2 * elem
+ 1];
2575 /* Skip the name of collating element name. */
2576 idx
+= 1 + extra
[idx
];
2577 /* Skip the byte sequence of the collating element. */
2578 idx
+= 1 + extra
[idx
];
2579 /* Adjust for the alignment. */
2580 idx
= (idx
+ 3) & ~3;
2581 /* Skip the multibyte collation sequence value. */
2582 idx
+= sizeof (unsigned int);
2583 /* Skip the wide char sequence of the collating element. */
2584 idx
+= sizeof (unsigned int) *
2585 (1 + *(unsigned int *) (extra
+ idx
));
2586 /* Return the collation sequence value. */
2587 return *(unsigned int *) (extra
+ idx
);
2589 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2591 /* No valid character. Match it as a single byte
2593 return collseqmb
[br_elem
->opr
.name
[0]];
2596 else if (sym_name_len
== 1)
2597 return collseqmb
[br_elem
->opr
.name
[0]];
2602 /* Local function for parse_bracket_exp used in _LIBC environement.
2603 Build the range expression which starts from START_ELEM, and ends
2604 at END_ELEM. The result are written to MBCSET and SBCSET.
2605 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606 mbcset->range_ends, is a pointer argument sinse we may
2609 static inline reg_errcode_t
2610 # ifdef RE_ENABLE_I18N
2611 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2612 re_charset_t
*mbcset
;
2614 # else /* not RE_ENABLE_I18N */
2615 build_range_exp (sbcset
, start_elem
, end_elem
)
2616 # endif /* not RE_ENABLE_I18N */
2617 re_bitset_ptr_t sbcset
;
2618 bracket_elem_t
*start_elem
, *end_elem
;
2621 uint32_t start_collseq
;
2622 uint32_t end_collseq
;
2624 # ifdef RE_ENABLE_I18N
2625 /* Check the space of the arrays. */
2626 if (*range_alloc
== mbcset
->nranges
)
2628 /* There are not enough space, need realloc. */
2629 uint32_t *new_array_start
;
2630 uint32_t *new_array_end
;
2633 /* +1 in case of mbcset->nranges is 0. */
2634 new_nranges
= 2 * mbcset
->nranges
+ 1;
2635 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2636 are NULL if *range_alloc == 0. */
2637 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2639 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2642 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2645 mbcset
->range_starts
= new_array_start
;
2646 mbcset
->range_ends
= new_array_end
;
2647 *range_alloc
= new_nranges
;
2649 # endif /* RE_ENABLE_I18N */
2651 /* Equivalence Classes and Character Classes can't be a range
2653 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2654 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2658 start_collseq
= lookup_collation_sequence_value (start_elem
);
2659 end_collseq
= lookup_collation_sequence_value (end_elem
);
2660 /* Check start/end collation sequence values. */
2661 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2662 return REG_ECOLLATE
;
2663 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2666 # ifdef RE_ENABLE_I18N
2667 /* Got valid collation sequence values, add them as a new entry. */
2668 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2669 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2670 # endif /* RE_ENABLE_I18N */
2672 /* Build the table for single byte characters. */
2673 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2675 uint32_t ch_collseq
;
2677 if (MB_CUR_MAX == 1)
2680 ch_collseq
= collseqmb
[ch
];
2682 ch_collseq
= collseq_table_lookup (collseqwc
, __btowc (ch
));
2683 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2684 bitset_set (sbcset
, ch
);
2689 /* Local function for parse_bracket_exp used in _LIBC environement.
2690 Build the collating element which is represented by NAME.
2691 The result are written to MBCSET and SBCSET.
2692 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2693 pointer argument sinse we may update it. */
2695 static inline reg_errcode_t
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2698 re_charset_t
*mbcset
;
2699 int *coll_sym_alloc
;
2700 # else /* not RE_ENABLE_I18N */
2701 build_collating_symbol (sbcset
, name
)
2702 # endif /* not RE_ENABLE_I18N */
2703 re_bitset_ptr_t sbcset
;
2704 const unsigned char *name
;
2707 size_t name_len
= strlen ((const char *) name
);
2710 elem
= seek_collating_symbol_entry (name
, name_len
);
2711 if (symb_table
[2 * elem
] != 0)
2713 /* We found the entry. */
2714 idx
= symb_table
[2 * elem
+ 1];
2715 /* Skip the name of collating element name. */
2716 idx
+= 1 + extra
[idx
];
2718 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2720 /* No valid character, treat it as a normal
2722 bitset_set (sbcset
, name
[0]);
2726 return REG_ECOLLATE
;
2728 # ifdef RE_ENABLE_I18N
2729 /* Got valid collation sequence, add it as a new entry. */
2730 /* Check the space of the arrays. */
2731 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2733 /* Not enough, realloc it. */
2734 /* +1 in case of mbcset->ncoll_syms is 0. */
2735 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2736 /* Use realloc since mbcset->coll_syms is NULL
2738 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2740 if (BE (mbcset
->coll_syms
== NULL
, 0))
2743 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2744 # endif /* RE_ENABLE_I18N */
2749 if (BE (name_len
!= 1, 0))
2750 return REG_ECOLLATE
;
2753 bitset_set (sbcset
, name
[0]);
2760 re_token_t br_token
;
2761 re_bitset_ptr_t sbcset
;
2762 #ifdef RE_ENABLE_I18N
2763 re_charset_t
*mbcset
;
2764 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2765 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2766 #else /* not RE_ENABLE_I18N */
2768 #endif /* not RE_ENABLE_I18N */
2769 bin_tree_t
*work_tree
;
2770 int token_len
, new_idx
;
2772 collseqmb
= (const unsigned char *)
2773 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2774 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2780 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2781 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2782 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2783 _NL_COLLATE_SYMB_TABLEMB
);
2784 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2785 _NL_COLLATE_SYMB_EXTRAMB
);
2788 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2789 #ifdef RE_ENABLE_I18N
2790 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2791 #endif /* RE_ENABLE_I18N */
2792 #ifdef RE_ENABLE_I18N
2793 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2795 if (BE (sbcset
== NULL
, 0))
2796 #endif /* RE_ENABLE_I18N */
2802 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2803 if (BE (token
->type
== END_OF_RE
, 0))
2806 goto parse_bracket_exp_free_return
;
2808 if (token
->type
== OP_NON_MATCH_LIST
)
2810 #ifdef RE_ENABLE_I18N
2812 mbcset
->non_match
= 1;
2813 #else /* not RE_ENABLE_I18N */
2815 #endif /* not RE_ENABLE_I18N */
2816 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2817 bitset_set (sbcset
, '\0');
2818 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2819 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2820 if (BE (token
->type
== END_OF_RE
, 0))
2823 goto parse_bracket_exp_free_return
;
2825 #ifdef RE_ENABLE_I18N
2827 for (i
= 0; i
< SBC_MAX
; ++i
)
2828 if (__btowc (i
) == WEOF
)
2829 bitset_set (sbcset
, i
);
2830 #endif /* RE_ENABLE_I18N */
2833 /* We treat the first ']' as a normal character. */
2834 if (token
->type
== OP_CLOSE_BRACKET
)
2835 token
->type
= CHARACTER
;
2839 bracket_elem_t start_elem
, end_elem
;
2840 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2841 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2843 int token_len2
= 0, is_range_exp
= 0;
2846 start_elem
.opr
.name
= start_name_buf
;
2847 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2849 if (BE (ret
!= REG_NOERROR
, 0))
2852 goto parse_bracket_exp_free_return
;
2855 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2856 if (BE (token
->type
== END_OF_RE
, 0))
2859 goto parse_bracket_exp_free_return
;
2861 if (token
->type
== OP_CHARSET_RANGE
)
2863 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2864 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2865 if (BE (token
->type
== END_OF_RE
, 0))
2868 goto parse_bracket_exp_free_return
;
2870 if (token2
.type
== OP_CLOSE_BRACKET
)
2872 /* We treat the last '-' as a normal character. */
2873 re_string_skip_bytes (regexp
, -token_len
);
2874 token
->type
= CHARACTER
;
2880 if (is_range_exp
== 1)
2882 end_elem
.opr
.name
= end_name_buf
;
2883 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2885 if (BE (ret
!= REG_NOERROR
, 0))
2888 goto parse_bracket_exp_free_return
;
2891 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2892 if (BE (token
->type
== END_OF_RE
, 0))
2895 goto parse_bracket_exp_free_return
;
2897 *err
= build_range_exp (sbcset
,
2898 #ifdef RE_ENABLE_I18N
2899 mbcset
, &range_alloc
,
2900 #endif /* RE_ENABLE_I18N */
2901 &start_elem
, &end_elem
);
2902 if (BE (*err
!= REG_NOERROR
, 0))
2903 goto parse_bracket_exp_free_return
;
2907 switch (start_elem
.type
)
2910 bitset_set (sbcset
, start_elem
.opr
.ch
);
2912 #ifdef RE_ENABLE_I18N
2914 /* Check whether the array has enough space. */
2915 if (mbchar_alloc
== mbcset
->nmbchars
)
2917 /* Not enough, realloc it. */
2918 /* +1 in case of mbcset->nmbchars is 0. */
2919 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2920 /* Use realloc since array is NULL if *alloc == 0. */
2921 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2923 if (BE (mbcset
->mbchars
== NULL
, 0))
2924 goto parse_bracket_exp_espace
;
2926 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2928 #endif /* RE_ENABLE_I18N */
2930 *err
= build_equiv_class (sbcset
,
2931 #ifdef RE_ENABLE_I18N
2932 mbcset
, &equiv_class_alloc
,
2933 #endif /* RE_ENABLE_I18N */
2934 start_elem
.opr
.name
);
2935 if (BE (*err
!= REG_NOERROR
, 0))
2936 goto parse_bracket_exp_free_return
;
2939 *err
= build_collating_symbol (sbcset
,
2940 #ifdef RE_ENABLE_I18N
2941 mbcset
, &coll_sym_alloc
,
2942 #endif /* RE_ENABLE_I18N */
2943 start_elem
.opr
.name
);
2944 if (BE (*err
!= REG_NOERROR
, 0))
2945 goto parse_bracket_exp_free_return
;
2948 ret
= build_charclass (sbcset
,
2949 #ifdef RE_ENABLE_I18N
2950 mbcset
, &char_class_alloc
,
2951 #endif /* RE_ENABLE_I18N */
2952 start_elem
.opr
.name
, syntax
);
2953 if (BE (ret
!= REG_NOERROR
, 0))
2954 goto parse_bracket_exp_espace
;
2961 if (token
->type
== OP_CLOSE_BRACKET
)
2965 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2967 /* If it is non-matching list. */
2968 #ifdef RE_ENABLE_I18N
2969 if (mbcset
->non_match
)
2970 #else /* not RE_ENABLE_I18N */
2972 #endif /* not RE_ENABLE_I18N */
2973 bitset_not (sbcset
);
2975 /* Build a tree for simple bracket. */
2976 br_token
.type
= SIMPLE_BRACKET
;
2977 br_token
.opr
.sbcset
= sbcset
;
2978 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2979 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2980 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
2981 goto parse_bracket_exp_espace
;
2983 #ifdef RE_ENABLE_I18N
2984 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
2985 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
2986 || mbcset
->non_match
)))
2988 re_token_t alt_token
;
2989 bin_tree_t
*mbc_tree
;
2990 /* Build a tree for complex bracket. */
2991 br_token
.type
= COMPLEX_BRACKET
;
2992 br_token
.opr
.mbcset
= mbcset
;
2993 dfa
->has_mb_node
= 1;
2994 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2995 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2996 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
2997 goto parse_bracket_exp_espace
;
2998 /* Then join them by ALT node. */
2999 dfa
->has_plural_match
= 1;
3000 alt_token
.type
= OP_ALT
;
3001 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3002 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
3003 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3008 free_charset (mbcset
);
3011 #else /* not RE_ENABLE_I18N */
3013 #endif /* not RE_ENABLE_I18N */
3015 parse_bracket_exp_espace
:
3017 parse_bracket_exp_free_return
:
3019 #ifdef RE_ENABLE_I18N
3020 free_charset (mbcset
);
3021 #endif /* RE_ENABLE_I18N */
3025 /* Parse an element in the bracket expression. */
3027 static reg_errcode_t
3028 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
3029 bracket_elem_t
*elem
;
3030 re_string_t
*regexp
;
3034 reg_syntax_t syntax
;
3036 #ifdef RE_ENABLE_I18N
3038 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3039 if (cur_char_size
> 1)
3041 elem
->type
= MB_CHAR
;
3042 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3043 re_string_skip_bytes (regexp
, cur_char_size
);
3046 #endif /* RE_ENABLE_I18N */
3047 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3048 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3049 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3050 return parse_bracket_symbol (elem
, regexp
, token
);
3051 elem
->type
= SB_CHAR
;
3052 elem
->opr
.ch
= token
->opr
.c
;
3056 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3057 such as [:<character_class>:], [.<collating_element>.], and
3058 [=<equivalent_class>=]. */
3060 static reg_errcode_t
3061 parse_bracket_symbol (elem
, regexp
, token
)
3062 bracket_elem_t
*elem
;
3063 re_string_t
*regexp
;
3066 unsigned char ch
, delim
= token
->opr
.c
;
3070 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3072 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3073 ch
= re_string_fetch_byte_case (regexp
);
3075 ch
= re_string_fetch_byte (regexp
);
3076 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3078 elem
->opr
.name
[i
] = ch
;
3080 re_string_skip_bytes (regexp
, 1);
3081 elem
->opr
.name
[i
] = '\0';
3082 switch (token
->type
)
3084 case OP_OPEN_COLL_ELEM
:
3085 elem
->type
= COLL_SYM
;
3087 case OP_OPEN_EQUIV_CLASS
:
3088 elem
->type
= EQUIV_CLASS
;
3090 case OP_OPEN_CHAR_CLASS
:
3091 elem
->type
= CHAR_CLASS
;
3099 /* Helper function for parse_bracket_exp.
3100 Build the equivalence class which is represented by NAME.
3101 The result are written to MBCSET and SBCSET.
3102 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3103 is a pointer argument sinse we may update it. */
3105 static reg_errcode_t
3106 #ifdef RE_ENABLE_I18N
3107 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3108 re_charset_t
*mbcset
;
3109 int *equiv_class_alloc
;
3110 #else /* not RE_ENABLE_I18N */
3111 build_equiv_class (sbcset
, name
)
3112 #endif /* not RE_ENABLE_I18N */
3113 re_bitset_ptr_t sbcset
;
3114 const unsigned char *name
;
3116 #if defined _LIBC && defined RE_ENABLE_I18N
3117 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3120 const int32_t *table
, *indirect
;
3121 const unsigned char *weights
, *extra
, *cp
;
3122 unsigned char char_buf
[2];
3126 /* This #include defines a local function! */
3127 # include <locale/weight.h>
3128 /* Calculate the index for equivalence class. */
3130 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3131 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3132 _NL_COLLATE_WEIGHTMB
);
3133 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3134 _NL_COLLATE_EXTRAMB
);
3135 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3136 _NL_COLLATE_INDIRECTMB
);
3137 idx1
= findidx (&cp
);
3138 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3139 /* This isn't a valid character. */
3140 return REG_ECOLLATE
;
3142 /* Build single byte matcing table for this equivalence class. */
3143 char_buf
[1] = (unsigned char) '\0';
3144 len
= weights
[idx1
];
3145 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3149 idx2
= findidx (&cp
);
3154 /* This isn't a valid character. */
3156 if (len
== weights
[idx2
])
3159 while (cnt
<= len
&&
3160 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3164 bitset_set (sbcset
, ch
);
3167 /* Check whether the array has enough space. */
3168 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3170 /* Not enough, realloc it. */
3171 /* +1 in case of mbcset->nequiv_classes is 0. */
3172 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3173 /* Use realloc since the array is NULL if *alloc == 0. */
3174 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3175 *equiv_class_alloc
);
3176 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3179 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3182 #endif /* _LIBC && RE_ENABLE_I18N */
3184 if (BE (strlen ((const char *) name
) != 1, 0))
3185 return REG_ECOLLATE
;
3186 bitset_set (sbcset
, *name
);
3191 /* Helper function for parse_bracket_exp.
3192 Build the character class which is represented by NAME.
3193 The result are written to MBCSET and SBCSET.
3194 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3195 is a pointer argument sinse we may update it. */
3197 static reg_errcode_t
3198 #ifdef RE_ENABLE_I18N
3199 build_charclass (sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3200 re_charset_t
*mbcset
;
3201 int *char_class_alloc
;
3202 #else /* not RE_ENABLE_I18N */
3203 build_charclass (sbcset
, class_name
, syntax
)
3204 #endif /* not RE_ENABLE_I18N */
3205 re_bitset_ptr_t sbcset
;
3206 const unsigned char *class_name
;
3207 reg_syntax_t syntax
;
3210 const char *name
= (const char *) class_name
;
3212 /* In case of REG_ICASE "upper" and "lower" match the both of
3213 upper and lower cases. */
3214 if ((syntax
& RE_ICASE
)
3215 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3218 #ifdef RE_ENABLE_I18N
3219 /* Check the space of the arrays. */
3220 if (*char_class_alloc
== mbcset
->nchar_classes
)
3222 /* Not enough, realloc it. */
3223 /* +1 in case of mbcset->nchar_classes is 0. */
3224 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3225 /* Use realloc since array is NULL if *alloc == 0. */
3226 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3228 if (BE (mbcset
->char_classes
== NULL
, 0))
3231 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3232 #endif /* RE_ENABLE_I18N */
3234 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3235 for (i = 0; i < SBC_MAX; ++i) \
3237 if (ctype_func (i)) \
3238 bitset_set (sbcset, i); \
3241 if (strcmp (name
, "alnum") == 0)
3242 BUILD_CHARCLASS_LOOP (isalnum
)
3243 else if (strcmp (name
, "cntrl") == 0)
3244 BUILD_CHARCLASS_LOOP (iscntrl
)
3245 else if (strcmp (name
, "lower") == 0)
3246 BUILD_CHARCLASS_LOOP (islower
)
3247 else if (strcmp (name
, "space") == 0)
3248 BUILD_CHARCLASS_LOOP (isspace
)
3249 else if (strcmp (name
, "alpha") == 0)
3250 BUILD_CHARCLASS_LOOP (isalpha
)
3251 else if (strcmp (name
, "digit") == 0)
3252 BUILD_CHARCLASS_LOOP (isdigit
)
3253 else if (strcmp (name
, "print") == 0)
3254 BUILD_CHARCLASS_LOOP (isprint
)
3255 else if (strcmp (name
, "upper") == 0)
3256 BUILD_CHARCLASS_LOOP (isupper
)
3257 else if (strcmp (name
, "blank") == 0)
3258 BUILD_CHARCLASS_LOOP (isblank
)
3259 else if (strcmp (name
, "graph") == 0)
3260 BUILD_CHARCLASS_LOOP (isgraph
)
3261 else if (strcmp (name
, "punct") == 0)
3262 BUILD_CHARCLASS_LOOP (ispunct
)
3263 else if (strcmp (name
, "xdigit") == 0)
3264 BUILD_CHARCLASS_LOOP (isxdigit
)
3272 build_word_op (dfa
, not, err
)
3277 re_bitset_ptr_t sbcset
;
3278 #ifdef RE_ENABLE_I18N
3279 re_charset_t
*mbcset
;
3281 #else /* not RE_ENABLE_I18N */
3283 #endif /* not RE_ENABLE_I18N */
3285 re_token_t br_token
;
3289 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3290 #ifdef RE_ENABLE_I18N
3291 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3292 #endif /* RE_ENABLE_I18N */
3294 #ifdef RE_ENABLE_I18N
3295 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3296 #else /* not RE_ENABLE_I18N */
3297 if (BE (sbcset
== NULL
, 0))
3298 #endif /* not RE_ENABLE_I18N */
3306 #ifdef RE_ENABLE_I18N
3309 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3310 bitset_set(cset->sbcset, '\0');
3312 mbcset
->non_match
= 1;
3314 for (i
= 0; i
< SBC_MAX
; ++i
)
3315 if (__btowc (i
) == WEOF
)
3316 bitset_set (sbcset
, i
);
3317 #else /* not RE_ENABLE_I18N */
3319 #endif /* not RE_ENABLE_I18N */
3322 /* We don't care the syntax in this case. */
3323 ret
= build_charclass (sbcset
,
3324 #ifdef RE_ENABLE_I18N
3326 #endif /* RE_ENABLE_I18N */
3327 (const unsigned char *) "alpha", 0);
3329 if (BE (ret
!= REG_NOERROR
, 0))
3332 #ifdef RE_ENABLE_I18N
3333 free_charset (mbcset
);
3334 #endif /* RE_ENABLE_I18N */
3338 /* \w match '_' also. */
3339 bitset_set (sbcset
, '_');
3341 /* If it is non-matching list. */
3342 #ifdef RE_ENABLE_I18N
3343 if (mbcset
->non_match
)
3344 #else /* not RE_ENABLE_I18N */
3346 #endif /* not RE_ENABLE_I18N */
3347 bitset_not (sbcset
);
3349 /* Build a tree for simple bracket. */
3350 br_token
.type
= SIMPLE_BRACKET
;
3351 br_token
.opr
.sbcset
= sbcset
;
3352 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3353 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3354 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3355 goto build_word_op_espace
;
3357 #ifdef RE_ENABLE_I18N
3360 re_token_t alt_token
;
3361 bin_tree_t
*mbc_tree
;
3362 /* Build a tree for complex bracket. */
3363 br_token
.type
= COMPLEX_BRACKET
;
3364 br_token
.opr
.mbcset
= mbcset
;
3365 dfa
->has_mb_node
= 1;
3366 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3367 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3368 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3369 goto build_word_op_espace
;
3370 /* Then join them by ALT node. */
3371 alt_token
.type
= OP_ALT
;
3372 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3373 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3374 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3379 free_charset (mbcset
);
3382 #else /* not RE_ENABLE_I18N */
3384 #endif /* not RE_ENABLE_I18N */
3386 build_word_op_espace
:
3388 #ifdef RE_ENABLE_I18N
3389 free_charset (mbcset
);
3390 #endif /* RE_ENABLE_I18N */
3395 /* This is intended for the expressions like "a{1,3}".
3396 Fetch a number from `input', and return the number.
3397 Return -1, if the number field is empty like "{,1}".
3398 Return -2, If an error is occured. */
3401 fetch_number (input
, token
, syntax
)
3404 reg_syntax_t syntax
;
3410 *token
= fetch_token (input
, syntax
);
3412 if (BE (token
->type
== END_OF_RE
, 0))
3414 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3416 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3417 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3418 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3423 #ifdef RE_ENABLE_I18N
3425 free_charset (re_charset_t
*cset
)
3427 re_free (cset
->mbchars
);
3429 re_free (cset
->coll_syms
);
3430 re_free (cset
->equiv_classes
);
3431 re_free (cset
->range_starts
);
3432 re_free (cset
->range_ends
);
3434 re_free (cset
->char_classes
);
3437 #endif /* RE_ENABLE_I18N */
3439 /* Functions for binary tree operation. */
3441 /* Create a node of tree.
3442 Note: This function automatically free left and right if malloc fails. */
3445 create_tree (left
, right
, type
, index
)
3448 re_token_type_t type
;
3452 tree
= re_malloc (bin_tree_t
, 1);
3453 if (BE (tree
== NULL
, 0))
3455 free_bin_tree (left
);
3456 free_bin_tree (right
);
3459 tree
->parent
= NULL
;
3461 tree
->right
= right
;
3463 tree
->node_idx
= index
;
3466 re_node_set_init_empty (&tree
->eclosure
);
3469 left
->parent
= tree
;
3471 right
->parent
= tree
;
3475 /* Free the sub tree pointed by TREE. */
3478 free_bin_tree (tree
)
3483 /*re_node_set_free (&tree->eclosure);*/
3484 free_bin_tree (tree
->left
);
3485 free_bin_tree (tree
->right
);
3489 /* Duplicate the node SRC, and return new node. */
3492 duplicate_tree (src
, dfa
)
3493 const bin_tree_t
*src
;
3496 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3498 /* Since node indies must be according to Post-order of the tree,
3499 we must duplicate the left at first. */
3500 if (src
->left
!= NULL
)
3502 left
= duplicate_tree (src
->left
, dfa
);
3507 /* Secondaly, duplicate the right. */
3508 if (src
->right
!= NULL
)
3510 right
= duplicate_tree (src
->right
, dfa
);
3513 free_bin_tree (left
);
3518 /* At last, duplicate itself. */
3519 if (src
->type
== NON_TYPE
)
3521 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3522 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3523 if (BE (new_node_idx
== -1, 0))
3525 free_bin_tree (left
);
3526 free_bin_tree (right
);
3531 new_node_idx
= src
->type
;
3533 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3534 if (BE (new_tree
== NULL
, 0))
3536 free_bin_tree (left
);
3537 free_bin_tree (right
);