1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
72 int length
, reg_syntax_t syntax
);
73 static void re_compile_fastmap_iter (regex_t
*bufp
,
74 const re_dfastate_t
*init_state
,
76 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
77 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
79 static void free_charset (re_charset_t
*cset
);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t
*preg
);
82 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
83 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
84 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
85 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
86 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
87 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
88 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
89 unsigned int constraint
);
90 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
91 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
93 static void calc_inveclosure (re_dfa_t
*dfa
);
94 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
96 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
97 static int peek_token (re_token_t
*token
, re_string_t
*input
,
99 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
100 reg_syntax_t syntax
);
101 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
102 reg_syntax_t syntax
, reg_errcode_t
*err
);
103 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
104 re_token_t
*token
, reg_syntax_t syntax
,
105 int nest
, reg_errcode_t
*err
);
106 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
107 re_token_t
*token
, reg_syntax_t syntax
,
108 int nest
, reg_errcode_t
*err
);
109 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
110 re_token_t
*token
, reg_syntax_t syntax
,
111 int nest
, reg_errcode_t
*err
);
112 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
113 re_token_t
*token
, reg_syntax_t syntax
,
114 int nest
, reg_errcode_t
*err
);
115 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
116 re_dfa_t
*dfa
, re_token_t
*token
,
117 reg_syntax_t syntax
, reg_errcode_t
*err
);
118 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
119 re_token_t
*token
, reg_syntax_t syntax
,
121 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
123 re_token_t
*token
, int token_len
,
125 reg_syntax_t syntax
);
126 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
130 # ifdef RE_ENABLE_I18N
131 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
132 re_charset_t
*mbcset
, int *range_alloc
,
133 bracket_elem_t
*start_elem
,
134 bracket_elem_t
*end_elem
);
135 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
136 re_charset_t
*mbcset
,
138 const unsigned char *name
);
139 # else /* not RE_ENABLE_I18N */
140 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
141 bracket_elem_t
*start_elem
,
142 bracket_elem_t
*end_elem
);
143 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
144 const unsigned char *name
);
145 # endif /* not RE_ENABLE_I18N */
146 #endif /* not _LIBC */
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
149 re_charset_t
*mbcset
,
150 int *equiv_class_alloc
,
151 const unsigned char *name
);
152 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
153 re_charset_t
*mbcset
,
154 int *char_class_alloc
,
155 const unsigned char *class_name
,
156 reg_syntax_t syntax
);
157 #else /* not RE_ENABLE_I18N */
158 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
159 const unsigned char *name
);
160 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
161 const unsigned char *class_name
,
162 reg_syntax_t syntax
);
163 #endif /* not RE_ENABLE_I18N */
164 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, int not, reg_errcode_t
*err
);
165 static void free_bin_tree (bin_tree_t
*tree
);
166 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
167 re_token_type_t type
, int index
);
168 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
170 /* This table gives an error message for each of the error codes listed
171 in regex.h. Obviously the order here has to be same as there.
172 POSIX doesn't require that we do anything for REG_NOERROR,
173 but why not be nice? */
175 const char __re_error_msgid
[] attribute_hidden
=
177 #define REG_NOERROR_IDX 0
178 gettext_noop ("Success") /* REG_NOERROR */
180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
181 gettext_noop ("No match") /* REG_NOMATCH */
183 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
184 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
187 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
189 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
190 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
193 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
196 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
198 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
199 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
201 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
202 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
204 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
205 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
207 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
208 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
210 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
211 gettext_noop ("Invalid range end") /* REG_ERANGE */
213 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
214 gettext_noop ("Memory exhausted") /* REG_ESPACE */
216 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
217 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
219 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
220 gettext_noop ("Premature end of regular expression") /* REG_EEND */
222 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
223 gettext_noop ("Regular expression too big") /* REG_ESIZE */
225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
226 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
229 const size_t __re_error_msgid_idx
[] attribute_hidden
=
250 /* Entry points for GNU code. */
252 /* re_compile_pattern is the GNU regular expression compiler: it
253 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
254 Returns 0 if the pattern was valid, otherwise an error string.
256 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
257 are set in BUFP on entry. */
260 re_compile_pattern (pattern
, length
, bufp
)
263 struct re_pattern_buffer
*bufp
;
267 /* GNU code is written to assume at least RE_NREGS registers will be set
268 (and at least one extra will be -1). */
269 bufp
->regs_allocated
= REGS_UNALLOCATED
;
271 /* And GNU code determines whether or not to get register information
272 by passing null for the REGS argument to re_match, etc., not by
276 /* Match anchors at newline. */
277 bufp
->newline_anchor
= 1;
279 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
283 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
286 weak_alias (__re_compile_pattern
, re_compile_pattern
)
289 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
290 also be assigned to arbitrarily: each pattern buffer stores its own
291 syntax, so it can be changed between regex compilations. */
292 /* This has no initializer because initialized variables in Emacs
293 become read-only after dumping. */
294 reg_syntax_t re_syntax_options
;
297 /* Specify the precise syntax of regexps for compilation. This provides
298 for compatibility for various utilities which historically have
299 different, incompatible syntaxes.
301 The argument SYNTAX is a bit mask comprised of the various bits
302 defined in regex.h. We return the old syntax. */
305 re_set_syntax (syntax
)
308 reg_syntax_t ret
= re_syntax_options
;
310 re_syntax_options
= syntax
;
314 weak_alias (__re_set_syntax
, re_set_syntax
)
318 re_compile_fastmap (bufp
)
319 struct re_pattern_buffer
*bufp
;
321 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
322 char *fastmap
= bufp
->fastmap
;
324 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
325 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
326 if (dfa
->init_state
!= dfa
->init_state_word
)
327 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
328 if (dfa
->init_state
!= dfa
->init_state_nl
)
329 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
330 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
331 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
332 bufp
->fastmap_accurate
= 1;
336 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
339 /* Helper function for re_compile_fastmap.
340 Compile fastmap for the initial_state INIT_STATE. */
343 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
345 const re_dfastate_t
*init_state
;
348 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
350 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
352 int node
= init_state
->nodes
.elems
[node_cnt
];
353 re_token_type_t type
= dfa
->nodes
[node
].type
;
354 if (type
== OP_CONTEXT_NODE
)
356 node
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
357 type
= dfa
->nodes
[node
].type
;
360 if (type
== CHARACTER
)
361 fastmap
[dfa
->nodes
[node
].opr
.c
] = 1;
362 else if (type
== SIMPLE_BRACKET
)
365 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
366 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
367 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
370 #ifdef RE_ENABLE_I18N
371 else if (type
== COMPLEX_BRACKET
)
374 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
375 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
376 || cset
->nranges
|| cset
->nchar_classes
)
379 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
381 /* In this case we want to catch the bytes which are
382 the first byte of any collation elements.
383 e.g. In da_DK, we want to catch 'a' since "aa"
384 is a valid collation element, and don't catch
385 'b' since 'b' is the only collation element
386 which starts from 'b'. */
388 const int32_t *table
= (const int32_t *)
389 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
390 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
391 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
397 for (i
= 0; i
< SBC_MAX
; ++i
)
398 if (__btowc (i
) == WEOF
)
400 # endif /* not _LIBC */
402 for (i
= 0; i
< cset
->nmbchars
; ++i
)
405 wctomb (buf
, cset
->mbchars
[i
]);
406 fastmap
[*(unsigned char *) buf
] = 1;
409 #endif /* RE_ENABLE_I18N */
410 else if (type
== END_OF_RE
|| type
== OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type
== COMPLEX_BRACKET
413 #endif /* RE_ENABLE_I18N */
416 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
417 if (type
== END_OF_RE
)
418 bufp
->can_be_null
= 1;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 `buffer' to the compiled pattern;
431 `used' to the length of the compiled pattern;
432 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 `fastmap' to an allocated space for the fastmap;
437 `fastmap_accurate' to zero;
438 `re_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (preg
, pattern
, cflags
)
462 regex_t
*__restrict preg
;
463 const char *__restrict pattern
;
467 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC
);
474 /* Try to allocate space for the fastmap. */
475 preg
->fastmap
= re_malloc (char, SBC_MAX
);
476 if (BE (preg
->fastmap
== NULL
, 0))
479 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags
& REG_NEWLINE
)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax
&= ~RE_DOT_NEWLINE
;
485 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
486 /* It also changes the matching behavior. */
487 preg
->newline_anchor
= 1;
490 preg
->newline_anchor
= 0;
491 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
492 preg
->translate
= NULL
;
494 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret
== REG_ERPAREN
)
501 /* We have already checked preg->fastmap != NULL. */
502 if (BE (ret
== REG_NOERROR
, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
506 if (BE (re_compile_fastmap (preg
) == -2, 0))
508 /* Some error occurred while computing the fastmap, just forget
510 re_free (preg
->fastmap
);
511 preg
->fastmap
= NULL
;
518 weak_alias (__regcomp
, regcomp
)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (errcode
, preg
, errbuf
, errbuf_size
)
535 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
536 / sizeof (__re_error_msgid_idx
[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
545 msg_size
= strlen (msg
) + 1; /* Includes the null. */
547 if (BE (errbuf_size
!= 0, 1))
549 if (BE (msg_size
> errbuf_size
, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
554 memcpy (errbuf
, msg
, errbuf_size
- 1);
555 errbuf
[errbuf_size
- 1] = 0;
559 memcpy (errbuf
, msg
, msg_size
);
565 weak_alias (__regerror
, regerror
)
568 /* Free dynamically allocated space used by PREG. */
575 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
576 if (BE (dfa
!= NULL
, 1))
578 re_free (dfa
->subexps
);
580 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
582 re_token_t
*node
= dfa
->nodes
+ i
;
583 #ifdef RE_ENABLE_I18N
584 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
585 free_charset (node
->opr
.mbcset
);
587 #endif /* RE_ENABLE_I18N */
588 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
589 re_free (node
->opr
.sbcset
);
590 else if (node
->type
== OP_CONTEXT_NODE
)
592 if (dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
== OP_BACK_REF
)
594 if (node
->opr
.ctx_info
->bkref_eclosure
!= NULL
)
595 re_node_set_free (node
->opr
.ctx_info
->bkref_eclosure
);
596 re_free (node
->opr
.ctx_info
->bkref_eclosure
);
598 re_free (node
->opr
.ctx_info
);
601 re_free (dfa
->firsts
);
602 re_free (dfa
->nexts
);
603 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
605 if (dfa
->eclosures
!= NULL
)
606 re_node_set_free (dfa
->eclosures
+ i
);
607 if (dfa
->inveclosures
!= NULL
)
608 re_node_set_free (dfa
->inveclosures
+ i
);
609 if (dfa
->edests
!= NULL
)
610 re_node_set_free (dfa
->edests
+ i
);
612 re_free (dfa
->edests
);
613 re_free (dfa
->eclosures
);
614 re_free (dfa
->inveclosures
);
615 re_free (dfa
->nodes
);
617 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
619 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
620 for (j
= 0; j
< entry
->num
; ++j
)
622 re_dfastate_t
*state
= entry
->array
[j
];
623 if (state
->entrance_nodes
!= &state
->nodes
)
625 re_node_set_free (state
->entrance_nodes
);
626 re_free (state
->entrance_nodes
);
628 re_node_set_free (&state
->nodes
);
629 re_free (state
->trtable
);
630 re_free (state
->trtable_search
);
633 re_free (entry
->array
);
635 re_free (dfa
->state_table
);
637 if (dfa
->word_char
!= NULL
)
638 re_free (dfa
->word_char
);
640 re_free (dfa
->re_str
);
644 re_free (preg
->fastmap
);
647 weak_alias (__regfree
, regfree
)
650 /* Entry points compatible with 4.2 BSD regex library. We don't define
651 them unless specifically requested. */
653 #if defined _REGEX_RE_COMP || defined _LIBC
655 /* BSD has one and only one pattern buffer. */
656 static struct re_pattern_buffer re_comp_buf
;
660 /* Make these definitions weak in libc, so POSIX programs can redefine
661 these names if they don't use our functions, and still use
662 regcomp/regexec above without link errors. */
672 if (!re_comp_buf
.buffer
)
673 return gettext ("No previous regular expression");
677 if (!re_comp_buf
.buffer
)
679 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
680 if (re_comp_buf
.fastmap
== NULL
)
681 return (char *) gettext (__re_error_msgid
682 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
685 /* Since `re_exec' always passes NULL for the `regs' argument, we
686 don't need to initialize the pattern buffer fields which affect it. */
688 /* Match anchors at newlines. */
689 re_comp_buf
.newline_anchor
= 1;
691 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
696 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
697 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
699 #endif /* _REGEX_RE_COMP */
701 /* Internal entry point.
702 Compile the regular expression PATTERN, whose length is LENGTH.
703 SYNTAX indicate regular expression's syntax. */
706 re_compile_internal (preg
, pattern
, length
, syntax
)
708 const char * pattern
;
712 reg_errcode_t err
= REG_NOERROR
;
716 /* Initialize the pattern buffer. */
717 preg
->fastmap_accurate
= 0;
718 preg
->syntax
= syntax
;
719 preg
->not_bol
= preg
->not_eol
= 0;
723 /* Initialize the dfa. */
724 dfa
= (re_dfa_t
*) preg
->buffer
;
725 if (preg
->allocated
< sizeof (re_dfa_t
))
727 /* If zero allocated, but buffer is non-null, try to realloc
728 enough space. This loses if buffer's address is bogus, but
729 that is the user's responsibility. If ->buffer is NULL this
730 is a simple allocation. */
731 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
734 preg
->allocated
= sizeof (re_dfa_t
);
736 preg
->buffer
= (unsigned char *) dfa
;
737 preg
->used
= sizeof (re_dfa_t
);
739 err
= init_dfa (dfa
, length
);
740 if (BE (err
!= REG_NOERROR
, 0))
747 dfa
->re_str
= re_malloc (char, length
+ 1);
748 strncpy (dfa
->re_str
, pattern
, length
+ 1);
751 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
753 if (BE (err
!= REG_NOERROR
, 0))
760 /* Parse the regular expression, and build a structure tree. */
762 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
763 if (BE (dfa
->str_tree
== NULL
, 0))
764 goto re_compile_internal_free_return
;
766 /* Analyze the tree and collect information which is necessary to
769 if (BE (err
!= REG_NOERROR
, 0))
770 goto re_compile_internal_free_return
;
772 /* Then create the initial state of the dfa. */
773 err
= create_initial_state (dfa
);
774 if (BE (err
!= REG_NOERROR
, 0))
775 goto re_compile_internal_free_return
;
777 re_compile_internal_free_return
:
778 /* Release work areas. */
779 free_workarea_compile (preg
);
780 re_string_destruct (®exp
);
785 /* Initialize DFA. We use the length of the regular expression PAT_LEN
786 as the initial length of some arrays. */
789 init_dfa (dfa
, pat_len
)
795 memset (dfa
, '\0', sizeof (re_dfa_t
));
797 dfa
->nodes_alloc
= pat_len
+ 1;
798 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
800 dfa
->states_alloc
= pat_len
+ 1;
802 /* table_size = 2 ^ ceil(log pat_len) */
803 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
804 if (table_size
> pat_len
)
807 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
808 dfa
->state_hash_mask
= table_size
- 1;
810 dfa
->subexps_alloc
= 1;
811 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
812 dfa
->word_char
= NULL
;
814 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
815 || dfa
->subexps
== NULL
, 0))
817 /* We don't bother to free anything which was allocated. Very
818 soon the process will go down anyway. */
820 dfa
->state_table
= NULL
;
827 /* Initialize WORD_CHAR table, which indicate which character is
828 "word". In this case "word" means that it is the word construction
829 character used by some operators like "\<", "\>", etc. */
836 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
837 if (BE (dfa
->word_char
== NULL
, 0))
839 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
840 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
841 if (isalnum (ch
) || ch
== '_')
842 dfa
->word_char
[i
] |= 1 << j
;
846 /* Free the work area which are only used while compiling. */
849 free_workarea_compile (preg
)
852 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
853 free_bin_tree (dfa
->str_tree
);
854 dfa
->str_tree
= NULL
;
857 /* Create initial states for all contexts. */
860 create_initial_state (dfa
)
865 re_node_set init_nodes
;
867 /* Initial states have the epsilon closure of the node which is
868 the first node of the regular expression. */
869 first
= dfa
->str_tree
->first
;
870 dfa
->init_node
= first
;
871 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
872 if (BE (err
!= REG_NOERROR
, 0))
875 /* The back-references which are in initial states can epsilon transit,
876 since in this case all of the subexpressions can be null.
877 Then we add epsilon closures of the nodes which are the next nodes of
878 the back-references. */
879 if (dfa
->nbackref
> 0)
880 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
882 int node_idx
= init_nodes
.elems
[i
];
883 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
886 int entity
= (type
!= OP_CONTEXT_NODE
? node_idx
887 : dfa
->nodes
[node_idx
].opr
.ctx_info
->entity
);
888 if ((type
!= OP_CONTEXT_NODE
889 || (dfa
->nodes
[entity
].type
!= OP_BACK_REF
))
890 && (type
!= OP_BACK_REF
))
892 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
894 re_token_t
*clexp_node
;
895 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
896 if (clexp_node
->type
== OP_CLOSE_SUBEXP
897 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[entity
].opr
.idx
)
900 if (clexp_idx
== init_nodes
.nelem
)
903 if (type
== OP_CONTEXT_NODE
904 && (dfa
->nodes
[dfa
->nodes
[node_idx
].opr
.ctx_info
->entity
].type
907 int prev_nelem
= init_nodes
.nelem
;
908 re_node_set_merge (&init_nodes
,
909 dfa
->nodes
[node_idx
].opr
.ctx_info
->bkref_eclosure
);
910 if (prev_nelem
< init_nodes
.nelem
)
913 else if (type
== OP_BACK_REF
)
915 int next_idx
= dfa
->nexts
[node_idx
];
916 if (!re_node_set_contains (&init_nodes
, next_idx
))
918 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ next_idx
);
924 /* It must be the first time to invoke acquire_state. */
925 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
926 /* We don't check ERR here, since the initial state must not be NULL. */
927 if (BE (dfa
->init_state
== NULL
, 0))
929 if (dfa
->init_state
->has_constraint
)
931 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
933 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
935 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
939 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
940 || dfa
->init_state_begbuf
== NULL
, 0))
944 dfa
->init_state_word
= dfa
->init_state_nl
945 = dfa
->init_state_begbuf
= dfa
->init_state
;
947 re_node_set_free (&init_nodes
);
951 /* Analyze the structure tree, and calculate "first", "next", "edest",
952 "eclosure", and "inveclosure". */
961 /* Allocate arrays. */
962 dfa
->firsts
= re_malloc (int, dfa
->nodes_alloc
);
963 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
964 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
965 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
966 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
967 if (BE (dfa
->firsts
== NULL
|| dfa
->nexts
== NULL
|| dfa
->edests
== NULL
968 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
970 /* Initialize them. */
971 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
975 re_node_set_init_empty (dfa
->edests
+ i
);
976 re_node_set_init_empty (dfa
->eclosures
+ i
);
977 re_node_set_init_empty (dfa
->inveclosures
+ i
);
980 ret
= analyze_tree (dfa
, dfa
->str_tree
);
981 if (BE (ret
== REG_NOERROR
, 1))
983 ret
= calc_eclosure (dfa
);
984 if (ret
== REG_NOERROR
)
985 calc_inveclosure (dfa
);
990 /* Helper functions for analyze.
991 This function calculate "first", "next", and "edest" for the subtree
992 whose root is NODE. */
995 analyze_tree (dfa
, node
)
1000 if (node
->first
== -1)
1001 calc_first (dfa
, node
);
1002 if (node
->next
== -1)
1003 calc_next (dfa
, node
);
1004 if (node
->eclosure
.nelem
== 0)
1005 calc_epsdest (dfa
, node
);
1006 /* Calculate "first" etc. for the left child. */
1007 if (node
->left
!= NULL
)
1009 ret
= analyze_tree (dfa
, node
->left
);
1010 if (BE (ret
!= REG_NOERROR
, 0))
1013 /* Calculate "first" etc. for the right child. */
1014 if (node
->right
!= NULL
)
1016 ret
= analyze_tree (dfa
, node
->right
);
1017 if (BE (ret
!= REG_NOERROR
, 0))
1023 /* Calculate "first" for the node NODE. */
1025 calc_first (dfa
, node
)
1030 idx
= node
->node_idx
;
1031 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1036 case OP_OPEN_BRACKET
:
1037 case OP_CLOSE_BRACKET
:
1038 case OP_OPEN_DUP_NUM
:
1039 case OP_CLOSE_DUP_NUM
:
1040 case OP_NON_MATCH_LIST
:
1041 case OP_OPEN_COLL_ELEM
:
1042 case OP_CLOSE_COLL_ELEM
:
1043 case OP_OPEN_EQUIV_CLASS
:
1044 case OP_CLOSE_EQUIV_CLASS
:
1045 case OP_OPEN_CHAR_CLASS
:
1046 case OP_CLOSE_CHAR_CLASS
:
1047 /* These must not be appeared here. */
1053 case OP_DUP_ASTERISK
:
1054 case OP_DUP_QUESTION
:
1055 #ifdef RE_ENABLE_I18N
1056 case COMPLEX_BRACKET
:
1057 #endif /* RE_ENABLE_I18N */
1058 case SIMPLE_BRACKET
:
1061 case OP_OPEN_SUBEXP
:
1062 case OP_CLOSE_SUBEXP
:
1067 assert (node
->left
!= NULL
);
1069 if (node
->left
->first
== -1)
1070 calc_first (dfa
, node
->left
);
1071 node
->first
= node
->left
->first
;
1076 /* else fall through */
1079 assert (node
->left
!= NULL
);
1081 if (node
->left
->first
== -1)
1082 calc_first (dfa
, node
->left
);
1083 node
->first
= node
->left
->first
;
1086 if (node
->type
== 0)
1087 dfa
->firsts
[idx
] = node
->first
;
1090 /* Calculate "next" for the node NODE. */
1093 calc_next (dfa
, node
)
1098 bin_tree_t
*parent
= node
->parent
;
1102 idx
= node
->node_idx
;
1103 if (node
->type
== 0)
1104 dfa
->nexts
[idx
] = node
->next
;
1108 idx
= parent
->node_idx
;
1109 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1113 case OP_DUP_ASTERISK
:
1118 if (parent
->left
== node
)
1120 if (parent
->right
->first
== -1)
1121 calc_first (dfa
, parent
->right
);
1122 node
->next
= parent
->right
->first
;
1125 /* else fall through */
1127 if (parent
->next
== -1)
1128 calc_next (dfa
, parent
);
1129 node
->next
= parent
->next
;
1132 idx
= node
->node_idx
;
1133 if (node
->type
== 0)
1134 dfa
->nexts
[idx
] = node
->next
;
1137 /* Calculate "edest" for the node NODE. */
1140 calc_epsdest (dfa
, node
)
1145 idx
= node
->node_idx
;
1146 if (node
->type
== 0)
1148 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1149 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1150 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1152 if (node
->left
->first
== -1)
1153 calc_first (dfa
, node
->left
);
1154 if (node
->next
== -1)
1155 calc_next (dfa
, node
);
1156 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1159 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1162 if (node
->left
!= NULL
)
1164 if (node
->left
->first
== -1)
1165 calc_first (dfa
, node
->left
);
1166 left
= node
->left
->first
;
1170 if (node
->next
== -1)
1171 calc_next (dfa
, node
);
1174 if (node
->right
!= NULL
)
1176 if (node
->right
->first
== -1)
1177 calc_first (dfa
, node
->right
);
1178 right
= node
->right
->first
;
1182 if (node
->next
== -1)
1183 calc_next (dfa
, node
);
1186 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1188 else if (dfa
->nodes
[idx
].type
== ANCHOR
1189 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1190 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
)
1191 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1195 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1196 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1197 otherwise return the error code. */
1199 static reg_errcode_t
1200 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1202 int *new_idx
, org_idx
;
1203 unsigned int constraint
;
1209 dup
.type
= OP_CONTEXT_NODE
;
1210 if (dfa
->nodes
[org_idx
].type
== OP_CONTEXT_NODE
)
1212 /* If the node whose index is ORG_IDX is the same as the intended
1214 if (dfa
->nodes
[org_idx
].constraint
== constraint
)
1219 dup
.constraint
= constraint
|
1220 dfa
->nodes
[org_idx
].constraint
;
1223 dup
.constraint
= constraint
;
1225 /* In case that `entity' points OP_CONTEXT_NODE,
1226 we correct `entity' to real entity in calc_inveclosures(). */
1227 dup
.opr
.ctx_info
= malloc (sizeof (*dup
.opr
.ctx_info
));
1228 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1229 if (BE (dup
.opr
.ctx_info
== NULL
|| dup_idx
== -1, 0))
1231 dup
.opr
.ctx_info
->entity
= org_idx
;
1232 dup
.opr
.ctx_info
->bkref_eclosure
= NULL
;
1234 dfa
->nodes
[dup_idx
].duplicated
= 1;
1235 dfa
->firsts
[dup_idx
] = dfa
->firsts
[org_idx
];
1236 dfa
->nexts
[dup_idx
] = dfa
->nexts
[org_idx
];
1237 err
= re_node_set_init_copy (dfa
->edests
+ dup_idx
, dfa
->edests
+ org_idx
);
1238 if (BE (err
!= REG_NOERROR
, 0))
1240 /* Since we don't duplicate epsilon nodes, epsilon closure have
1242 err
= re_node_set_init_1 (dfa
->eclosures
+ dup_idx
, dup_idx
);
1243 if (BE (err
!= REG_NOERROR
, 0))
1245 err
= re_node_set_init_1 (dfa
->inveclosures
+ dup_idx
, dup_idx
);
1246 if (BE (err
!= REG_NOERROR
, 0))
1248 /* Then we must update inveclosure for this node.
1249 We process them at last part of calc_eclosure(),
1250 since we don't complete to calculate them here. */
1257 calc_inveclosure (dfa
)
1260 int src
, idx
, dest
, entity
;
1261 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1263 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1265 dest
= dfa
->eclosures
[src
].elems
[idx
];
1266 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1270 while (dfa
->nodes
[entity
].type
== OP_CONTEXT_NODE
)
1272 entity
= dfa
->nodes
[entity
].opr
.ctx_info
->entity
;
1273 re_node_set_merge (dfa
->inveclosures
+ src
,
1274 dfa
->inveclosures
+ entity
);
1275 dfa
->nodes
[src
].opr
.ctx_info
->entity
= entity
;
1280 /* Calculate "eclosure" for all the node in DFA. */
1282 static reg_errcode_t
1286 int idx
, node_idx
, max
, incomplete
= 0;
1288 assert (dfa
->nodes_len
> 0);
1290 /* For each nodes, calculate epsilon closure. */
1291 for (node_idx
= 0, max
= dfa
->nodes_len
; ; ++node_idx
)
1294 re_node_set eclosure_elem
;
1295 if (node_idx
== max
)
1304 assert (dfa
->nodes
[node_idx
].type
!= OP_CONTEXT_NODE
);
1305 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1307 /* If we have already calculated, skip it. */
1308 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1310 /* Calculate epsilon closure of `node_idx'. */
1311 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1312 if (BE (err
!= REG_NOERROR
, 0))
1315 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1318 re_node_set_free (&eclosure_elem
);
1322 /* for duplicated nodes. */
1323 for (idx
= max
; idx
< dfa
->nodes_len
; ++idx
)
1325 int entity
, i
, constraint
;
1326 re_node_set
*bkref_eclosure
;
1327 entity
= dfa
->nodes
[idx
].opr
.ctx_info
->entity
;
1328 re_node_set_merge (dfa
->inveclosures
+ idx
, dfa
->inveclosures
+ entity
);
1329 if (dfa
->nodes
[entity
].type
!= OP_BACK_REF
)
1332 /* If the node is backreference, duplicate the epsilon closure of
1333 the next node. Since it may epsilon transit. */
1334 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1335 bkref_eclosure
= re_malloc (re_node_set
, 1);
1336 if (BE (bkref_eclosure
== NULL
, 0))
1338 re_node_set_init_empty (bkref_eclosure
);
1339 constraint
= dfa
->nodes
[idx
].constraint
;
1340 for (i
= 0; i
< dfa
->eclosures
[dfa
->nexts
[idx
]].nelem
; ++i
)
1342 int dest_node_idx
= dfa
->eclosures
[dfa
->nexts
[idx
]].elems
[i
];
1343 if (!IS_EPSILON_NODE (dfa
->nodes
[dest_node_idx
].type
))
1346 err
= duplicate_node (&dest_node_idx
, dfa
, dest_node_idx
,
1348 if (BE (err
!= REG_NOERROR
, 0))
1351 re_node_set_insert (bkref_eclosure
, dest_node_idx
);
1353 dfa
->nodes
[idx
].opr
.ctx_info
->bkref_eclosure
= bkref_eclosure
;
1359 /* Calculate epsilon closure of NODE. */
1361 static reg_errcode_t
1362 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1363 re_node_set
*new_set
;
1368 unsigned int constraint
;
1369 int i
, max
, incomplete
= 0;
1370 re_node_set eclosure
;
1371 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1372 if (BE (err
!= REG_NOERROR
, 0))
1375 /* This indicates that we are calculating this node now.
1376 We reference this value to avoid infinite loop. */
1377 dfa
->eclosures
[node
].nelem
= -1;
1379 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1380 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1382 /* Expand each epsilon destination nodes. */
1383 if (dfa
->edests
[node
].nelem
!= 0)
1384 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1386 re_node_set eclosure_elem
;
1387 int edest
= dfa
->edests
[node
].elems
[i
];
1388 /* If calculating the epsilon closure of `edest' is in progress,
1389 return intermediate result. */
1390 if (dfa
->eclosures
[edest
].nelem
== -1)
1395 /* If we haven't calculated the epsilon closure of `edest' yet,
1396 calculate now. Otherwise use calculated epsilon closure. */
1397 if (dfa
->eclosures
[edest
].nelem
== 0)
1399 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1400 if (BE (err
!= REG_NOERROR
, 0))
1404 eclosure_elem
= dfa
->eclosures
[edest
];
1405 /* Merge the epsilon closure of `edest'. */
1406 re_node_set_merge (&eclosure
, &eclosure_elem
);
1407 /* If the epsilon closure of `edest' is incomplete,
1408 the epsilon closure of this node is also incomplete. */
1409 if (dfa
->eclosures
[edest
].nelem
== 0)
1412 re_node_set_free (&eclosure_elem
);
1416 /* If the current node has constraints, duplicate all non-epsilon nodes.
1417 Since they must inherit the constraints. */
1419 for (i
= 0, max
= eclosure
.nelem
; i
< max
; ++i
)
1421 int dest
= eclosure
.elems
[i
];
1422 if (!IS_EPSILON_NODE (dfa
->nodes
[dest
].type
))
1426 err
= duplicate_node (&dup_dest
, dfa
, dest
, constraint
);
1427 if (BE (err
!= REG_NOERROR
, 0))
1429 if (dest
!= dup_dest
)
1431 re_node_set_remove_at (&eclosure
, i
--);
1432 re_node_set_insert (&eclosure
, dup_dest
);
1438 /* Epsilon closures include itself. */
1439 re_node_set_insert (&eclosure
, node
);
1440 if (incomplete
&& !root
)
1441 dfa
->eclosures
[node
].nelem
= 0;
1443 dfa
->eclosures
[node
] = eclosure
;
1444 *new_set
= eclosure
;
1448 /* Functions for token which are used in the parser. */
1450 /* Fetch a token from INPUT.
1451 We must not use this function inside bracket expressions. */
1454 fetch_token (input
, syntax
)
1456 reg_syntax_t syntax
;
1460 consumed_byte
= peek_token (&token
, input
, syntax
);
1461 re_string_skip_bytes (input
, consumed_byte
);
1465 /* Peek a token from INPUT, and return the length of the token.
1466 We must not use this function inside bracket expressions. */
1469 peek_token (token
, input
, syntax
)
1472 reg_syntax_t syntax
;
1476 if (re_string_eoi (input
))
1478 token
->type
= END_OF_RE
;
1482 c
= re_string_peek_byte (input
, 0);
1485 #ifdef RE_ENABLE_I18N
1486 token
->mb_partial
= 0;
1487 if (MB_CUR_MAX
> 1 &&
1488 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1490 token
->type
= CHARACTER
;
1491 token
->mb_partial
= 1;
1498 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1500 token
->type
= BACK_SLASH
;
1504 c2
= re_string_peek_byte_case (input
, 1);
1506 token
->type
= CHARACTER
;
1510 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1511 token
->type
= OP_ALT
;
1513 case '1': case '2': case '3': case '4': case '5':
1514 case '6': case '7': case '8': case '9':
1515 if (!(syntax
& RE_NO_BK_REFS
))
1517 token
->type
= OP_BACK_REF
;
1518 token
->opr
.idx
= c2
- '0';
1522 if (!(syntax
& RE_NO_GNU_OPS
))
1524 token
->type
= ANCHOR
;
1525 token
->opr
.idx
= WORD_FIRST
;
1529 if (!(syntax
& RE_NO_GNU_OPS
))
1531 token
->type
= ANCHOR
;
1532 token
->opr
.idx
= WORD_LAST
;
1536 if (!(syntax
& RE_NO_GNU_OPS
))
1538 token
->type
= ANCHOR
;
1539 token
->opr
.idx
= WORD_DELIM
;
1543 if (!(syntax
& RE_NO_GNU_OPS
))
1545 token
->type
= ANCHOR
;
1546 token
->opr
.idx
= INSIDE_WORD
;
1550 if (!(syntax
& RE_NO_GNU_OPS
))
1551 token
->type
= OP_WORD
;
1554 if (!(syntax
& RE_NO_GNU_OPS
))
1555 token
->type
= OP_NOTWORD
;
1558 if (!(syntax
& RE_NO_GNU_OPS
))
1560 token
->type
= ANCHOR
;
1561 token
->opr
.idx
= BUF_FIRST
;
1565 if (!(syntax
& RE_NO_GNU_OPS
))
1567 token
->type
= ANCHOR
;
1568 token
->opr
.idx
= BUF_LAST
;
1572 if (!(syntax
& RE_NO_BK_PARENS
))
1573 token
->type
= OP_OPEN_SUBEXP
;
1576 if (!(syntax
& RE_NO_BK_PARENS
))
1577 token
->type
= OP_CLOSE_SUBEXP
;
1580 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1581 token
->type
= OP_DUP_PLUS
;
1584 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1585 token
->type
= OP_DUP_QUESTION
;
1588 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1589 token
->type
= OP_OPEN_DUP_NUM
;
1592 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1593 token
->type
= OP_CLOSE_DUP_NUM
;
1601 token
->type
= CHARACTER
;
1605 if (syntax
& RE_NEWLINE_ALT
)
1606 token
->type
= OP_ALT
;
1609 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1610 token
->type
= OP_ALT
;
1613 token
->type
= OP_DUP_ASTERISK
;
1616 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1617 token
->type
= OP_DUP_PLUS
;
1620 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1621 token
->type
= OP_DUP_QUESTION
;
1624 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1625 token
->type
= OP_OPEN_DUP_NUM
;
1628 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1629 token
->type
= OP_CLOSE_DUP_NUM
;
1632 if (syntax
& RE_NO_BK_PARENS
)
1633 token
->type
= OP_OPEN_SUBEXP
;
1636 if (syntax
& RE_NO_BK_PARENS
)
1637 token
->type
= OP_CLOSE_SUBEXP
;
1640 token
->type
= OP_OPEN_BRACKET
;
1643 token
->type
= OP_PERIOD
;
1646 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1647 re_string_cur_idx (input
) != 0)
1649 char prev
= re_string_peek_byte (input
, -1);
1650 if (prev
!= '|' && prev
!= '(' &&
1651 (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n'))
1654 token
->type
= ANCHOR
;
1655 token
->opr
.idx
= LINE_FIRST
;
1658 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1659 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1662 re_string_skip_bytes (input
, 1);
1663 peek_token (&next
, input
, syntax
);
1664 re_string_skip_bytes (input
, -1);
1665 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1668 token
->type
= ANCHOR
;
1669 token
->opr
.idx
= LINE_LAST
;
1677 /* Peek a token from INPUT, and return the length of the token.
1678 We must not use this function out of bracket expressions. */
1681 peek_token_bracket (token
, input
, syntax
)
1684 reg_syntax_t syntax
;
1687 if (re_string_eoi (input
))
1689 token
->type
= END_OF_RE
;
1692 c
= re_string_peek_byte (input
, 0);
1695 #ifdef RE_ENABLE_I18N
1696 if (MB_CUR_MAX
> 1 &&
1697 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1699 token
->type
= CHARACTER
;
1702 #endif /* RE_ENABLE_I18N */
1704 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1706 /* In this case, '\' escape a character. */
1708 c2
= re_string_peek_byte (input
, 1);
1710 token
->type
= CHARACTER
;
1713 if (c
== '[') /* '[' is a special char in a bracket exps. */
1717 c2
= re_string_peek_byte (input
, 1);
1723 token
->type
= OP_OPEN_COLL_ELEM
;
1726 token
->type
= OP_OPEN_EQUIV_CLASS
;
1729 if (syntax
& RE_CHAR_CLASSES
)
1731 token
->type
= OP_OPEN_CHAR_CLASS
;
1734 /* else fall through. */
1736 token
->type
= CHARACTER
;
1746 token
->type
= OP_CHARSET_RANGE
;
1749 token
->type
= OP_CLOSE_BRACKET
;
1752 token
->type
= OP_NON_MATCH_LIST
;
1755 token
->type
= CHARACTER
;
1760 /* Functions for parser. */
1762 /* Entry point of the parser.
1763 Parse the regular expression REGEXP and return the structure tree.
1764 If an error is occured, ERR is set by error code, and return NULL.
1765 This function build the following tree, from regular expression <reg_exp>:
1771 CAT means concatenation.
1772 EOR means end of regular expression. */
1775 parse (regexp
, preg
, syntax
, err
)
1776 re_string_t
*regexp
;
1778 reg_syntax_t syntax
;
1781 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1782 bin_tree_t
*tree
, *eor
, *root
;
1783 re_token_t current_token
;
1785 current_token
= fetch_token (regexp
, syntax
);
1786 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1787 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1789 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1790 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1792 root
= create_tree (tree
, eor
, CONCAT
, 0);
1795 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1796 return *err
= REG_ESPACE
, NULL
;
1800 /* This function build the following tree, from regular expression
1801 <branch1>|<branch2>:
1807 ALT means alternative, which represents the operator `|'. */
1810 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1811 re_string_t
*regexp
;
1814 reg_syntax_t syntax
;
1818 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1819 bin_tree_t
*tree
, *branch
= NULL
;
1821 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1822 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1825 while (token
->type
== OP_ALT
)
1827 re_token_t alt_token
= *token
;
1828 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1829 *token
= fetch_token (regexp
, syntax
);
1830 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1831 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1833 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1834 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1836 free_bin_tree (tree
);
1842 tree
= create_tree (tree
, branch
, 0, new_idx
);
1843 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1844 return *err
= REG_ESPACE
, NULL
;
1845 dfa
->has_plural_match
= 1;
1850 /* This function build the following tree, from regular expression
1857 CAT means concatenation. */
1860 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1861 re_string_t
*regexp
;
1864 reg_syntax_t syntax
;
1868 bin_tree_t
*tree
, *exp
;
1869 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1870 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1873 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1874 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1876 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1877 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1879 free_bin_tree (tree
);
1882 if (tree
!= NULL
&& exp
!= NULL
)
1884 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1886 return *err
= REG_ESPACE
, NULL
;
1888 else if (tree
== NULL
)
1890 /* Otherwise exp == NULL, we don't need to create new tree. */
1895 /* This function build the following tree, from regular expression a*:
1902 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1903 re_string_t
*regexp
;
1906 reg_syntax_t syntax
;
1910 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1913 switch (token
->type
)
1916 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1917 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1918 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1919 return *err
= REG_ESPACE
, NULL
;
1920 #ifdef RE_ENABLE_I18N
1923 while (!re_string_eoi (regexp
)
1924 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1926 bin_tree_t
*mbc_remain
;
1927 *token
= fetch_token (regexp
, syntax
);
1928 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1929 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1930 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1931 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1932 return *err
= REG_ESPACE
, NULL
;
1937 case OP_OPEN_SUBEXP
:
1938 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
1939 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1942 case OP_OPEN_BRACKET
:
1943 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
1944 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1948 if (BE (preg
->re_nsub
< token
->opr
.idx
1949 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
1954 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1955 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1956 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1957 return *err
= REG_ESPACE
, NULL
;
1959 dfa
->has_mb_node
= 1;
1961 case OP_DUP_ASTERISK
:
1963 case OP_DUP_QUESTION
:
1964 case OP_OPEN_DUP_NUM
:
1965 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1966 return *err
= REG_BADRPT
, NULL
;
1967 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
1969 *token
= fetch_token (regexp
, syntax
);
1970 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1972 /* else fall through */
1973 case OP_CLOSE_SUBEXP
:
1974 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
1975 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
1976 return *err
= REG_ERPAREN
, NULL
;
1977 /* else fall through */
1978 case OP_CLOSE_DUP_NUM
:
1979 /* We treat it as a normal character. */
1981 /* Then we can these characters as normal characters. */
1982 token
->type
= CHARACTER
;
1983 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1984 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1985 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1986 return *err
= REG_ESPACE
, NULL
;
1989 if (dfa
->word_char
== NULL
)
1991 *err
= init_word_char (dfa
);
1992 if (BE (*err
!= REG_NOERROR
, 0))
1995 if (token
->opr
.ctx_type
== WORD_DELIM
)
1997 bin_tree_t
*tree_first
, *tree_last
;
1998 int idx_first
, idx_last
;
1999 token
->opr
.ctx_type
= WORD_FIRST
;
2000 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2001 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2002 token
->opr
.ctx_type
= WORD_LAST
;
2003 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2004 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2005 token
->type
= OP_ALT
;
2006 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2007 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2008 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2009 || tree_first
== NULL
|| tree_last
== NULL
2010 || tree
== NULL
, 0))
2011 return *err
= REG_ESPACE
, NULL
;
2015 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2016 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2017 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2018 return *err
= REG_ESPACE
, NULL
;
2020 /* We must return here, since ANCHORs can't be followed
2021 by repetition operators.
2022 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2023 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2024 *token
= fetch_token (regexp
, syntax
);
2027 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2028 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2029 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2030 return *err
= REG_ESPACE
, NULL
;
2032 dfa
->has_mb_node
= 1;
2035 tree
= build_word_op (dfa
, 0, err
);
2036 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2040 tree
= build_word_op (dfa
, 1, err
);
2041 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2051 /* Must not happen? */
2057 *token
= fetch_token (regexp
, syntax
);
2059 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2060 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2062 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2063 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2065 dfa
->has_plural_match
= 1;
2071 /* This function build the following tree, from regular expression
2079 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2080 re_string_t
*regexp
;
2083 reg_syntax_t syntax
;
2087 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2088 bin_tree_t
*tree
, *left_par
, *right_par
;
2091 cur_nsub
= preg
->re_nsub
++;
2092 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2094 re_subexp_t
*new_array
;
2095 dfa
->subexps_alloc
*= 2;
2096 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2097 if (BE (new_array
== NULL
, 0))
2099 dfa
->subexps_alloc
/= 2;
2103 dfa
->subexps
= new_array
;
2105 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2106 dfa
->subexps
[cur_nsub
].end
= -1;
2108 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2109 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2110 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2111 return *err
= REG_ESPACE
, NULL
;
2112 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2113 *token
= fetch_token (regexp
, syntax
);
2115 /* The subexpression may be a null string. */
2116 if (token
->type
== OP_CLOSE_SUBEXP
)
2120 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2121 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2124 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2126 free_bin_tree (tree
);
2130 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2131 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2132 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2133 tree
= ((tree
== NULL
) ? right_par
2134 : create_tree (tree
, right_par
, CONCAT
, 0));
2135 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2136 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2137 return *err
= REG_ESPACE
, NULL
;
2138 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2143 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2146 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2147 bin_tree_t
*dup_elem
;
2148 re_string_t
*regexp
;
2151 reg_syntax_t syntax
;
2154 re_token_t dup_token
;
2155 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2156 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2157 re_token_t start_token
= *token
;
2158 if (token
->type
== OP_OPEN_DUP_NUM
)
2162 int start
= fetch_number (regexp
, token
, syntax
);
2166 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2167 start
= 0; /* We treat "{,m}" as "{0,m}". */
2170 *err
= REG_BADBR
; /* <re>{} is invalid. */
2174 if (BE (start
!= -2, 1))
2176 /* We treat "{n}" as "{n,n}". */
2177 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2178 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2179 ? fetch_number (regexp
, token
, syntax
) : -2));
2181 if (BE (start
== -2 || end
== -2, 0))
2183 /* Invalid sequence. */
2184 if (token
->type
== OP_CLOSE_DUP_NUM
)
2185 goto parse_dup_op_invalid_interval
;
2187 goto parse_dup_op_ebrace
;
2189 if (BE (start
== 0 && end
== 0, 0))
2191 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2192 *token
= fetch_token (regexp
, syntax
);
2193 free_bin_tree (dup_elem
);
2197 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2199 for (i
= 0; i
< start
; ++i
)
2202 work_tree
= duplicate_tree (elem
, dfa
);
2203 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2204 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2205 goto parse_dup_op_espace
;
2210 /* We treat "<re>{0,}" as "<re>*". */
2211 dup_token
.type
= OP_DUP_ASTERISK
;
2214 elem
= duplicate_tree (elem
, dfa
);
2215 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2216 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2217 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2218 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2219 || tree
== NULL
, 0))
2220 goto parse_dup_op_espace
;
2224 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2225 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2226 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2227 goto parse_dup_op_espace
;
2230 else if (end
- start
> 0)
2232 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2233 dup_token
.type
= OP_DUP_QUESTION
;
2236 elem
= duplicate_tree (elem
, dfa
);
2237 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2238 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2239 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2240 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2241 goto parse_dup_op_espace
;
2245 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2246 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2247 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2248 goto parse_dup_op_espace
;
2250 for (i
= 1; i
< end
- start
; ++i
)
2252 work_tree
= duplicate_tree (elem
, dfa
);
2253 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2254 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2255 return *err
= REG_ESPACE
, NULL
;
2261 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2262 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2263 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2264 return *err
= REG_ESPACE
, NULL
;
2266 *token
= fetch_token (regexp
, syntax
);
2269 parse_dup_op_espace
:
2270 free_bin_tree (tree
);
2274 parse_dup_op_ebrace
:
2275 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2280 goto parse_dup_op_rollback
;
2281 parse_dup_op_invalid_interval
:
2282 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2287 parse_dup_op_rollback
:
2288 re_string_set_index (regexp
, start_idx
);
2289 *token
= start_token
;
2290 token
->type
= CHARACTER
;
2294 /* Size of the names for collating symbol/equivalence_class/character_class.
2295 I'm not sure, but maybe enough. */
2296 #define BRACKET_NAME_BUF_SIZE 32
2299 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2300 Build the range expression which starts from START_ELEM, and ends
2301 at END_ELEM. The result are written to MBCSET and SBCSET.
2302 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2303 mbcset->range_ends, is a pointer argument sinse we may
2306 static reg_errcode_t
2307 # ifdef RE_ENABLE_I18N
2308 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2309 re_charset_t
*mbcset
;
2311 # else /* not RE_ENABLE_I18N */
2312 build_range_exp (sbcset
, start_elem
, end_elem
)
2313 # endif /* not RE_ENABLE_I18N */
2314 re_bitset_ptr_t sbcset
;
2315 bracket_elem_t
*start_elem
, *end_elem
;
2317 unsigned int start_ch
, end_ch
;
2318 /* Equivalence Classes and Character Classes can't be a range start/end. */
2319 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2320 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2324 /* We can handle no multi character collating elements without libc
2326 if (BE ((start_elem
->type
== COLL_SYM
2327 && strlen ((char *) start_elem
->opr
.name
) > 1)
2328 || (end_elem
->type
== COLL_SYM
2329 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2330 return REG_ECOLLATE
;
2332 # ifdef RE_ENABLE_I18N
2334 wchar_t wc
, start_wc
, end_wc
;
2335 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2337 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2338 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2340 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2341 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2343 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2344 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2345 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2346 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2347 cmp_buf
[0] = start_wc
;
2348 cmp_buf
[4] = end_wc
;
2349 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2352 /* Check the space of the arrays. */
2353 if (*range_alloc
== mbcset
->nranges
)
2355 /* There are not enough space, need realloc. */
2356 wchar_t *new_array_start
, *new_array_end
;
2359 /* +1 in case of mbcset->nranges is 0. */
2360 new_nranges
= 2 * mbcset
->nranges
+ 1;
2361 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2362 are NULL if *range_alloc == 0. */
2363 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2365 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2368 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2371 mbcset
->range_starts
= new_array_start
;
2372 mbcset
->range_ends
= new_array_end
;
2373 *range_alloc
= new_nranges
;
2376 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2377 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2379 /* Build the table for single byte characters. */
2380 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2383 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2384 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2385 bitset_set (sbcset
, wc
);
2388 # else /* not RE_ENABLE_I18N */
2391 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2392 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2394 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2395 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2397 if (start_ch
> end_ch
)
2399 /* Build the table for single byte characters. */
2400 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2401 if (start_ch
<= ch
&& ch
<= end_ch
)
2402 bitset_set (sbcset
, ch
);
2404 # endif /* not RE_ENABLE_I18N */
2407 #endif /* not _LIBC */
2410 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2411 Build the collating element which is represented by NAME.
2412 The result are written to MBCSET and SBCSET.
2413 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2414 pointer argument since we may update it. */
2416 static reg_errcode_t
2417 # ifdef RE_ENABLE_I18N
2418 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2419 re_charset_t
*mbcset
;
2420 int *coll_sym_alloc
;
2421 # else /* not RE_ENABLE_I18N */
2422 build_collating_symbol (sbcset
, name
)
2423 # endif /* not RE_ENABLE_I18N */
2424 re_bitset_ptr_t sbcset
;
2425 const unsigned char *name
;
2427 size_t name_len
= strlen ((const char *) name
);
2428 if (BE (name_len
!= 1, 0))
2429 return REG_ECOLLATE
;
2432 bitset_set (sbcset
, name
[0]);
2436 #endif /* not _LIBC */
2438 /* This function parse bracket expression like "[abc]", "[a-c]",
2442 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2443 re_string_t
*regexp
;
2446 reg_syntax_t syntax
;
2450 const unsigned char *collseqmb
;
2451 const char *collseqwc
;
2454 const int32_t *symb_table
;
2455 const unsigned char *extra
;
2457 /* Local function for parse_bracket_exp used in _LIBC environement.
2458 Seek the collating symbol entry correspondings to NAME.
2459 Return the index of the symbol in the SYMB_TABLE. */
2461 static inline int32_t
2462 seek_collating_symbol_entry (name
, name_len
)
2463 const unsigned char *name
;
2466 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2467 int32_t elem
= hash
% table_size
;
2468 int32_t second
= hash
% (table_size
- 2);
2469 while (symb_table
[2 * elem
] != 0)
2471 /* First compare the hashing value. */
2472 if (symb_table
[2 * elem
] == hash
2473 /* Compare the length of the name. */
2474 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2475 /* Compare the name. */
2476 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2479 /* Yep, this is the entry. */
2489 /* Local function for parse_bracket_exp used in _LIBC environement.
2490 Look up the collation sequence value of BR_ELEM.
2491 Return the value if succeeded, UINT_MAX otherwise. */
2493 static inline unsigned int
2494 lookup_collation_sequence_value (br_elem
)
2495 bracket_elem_t
*br_elem
;
2497 if (br_elem
->type
== SB_CHAR
)
2500 if (MB_CUR_MAX == 1)
2503 return collseqmb
[br_elem
->opr
.ch
];
2506 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2507 return collseq_table_lookup (collseqwc
, wc
);
2510 else if (br_elem
->type
== MB_CHAR
)
2512 return collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2514 else if (br_elem
->type
== COLL_SYM
)
2516 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2520 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2522 if (symb_table
[2 * elem
] != 0)
2524 /* We found the entry. */
2525 idx
= symb_table
[2 * elem
+ 1];
2526 /* Skip the name of collating element name. */
2527 idx
+= 1 + extra
[idx
];
2528 /* Skip the byte sequence of the collating element. */
2529 idx
+= 1 + extra
[idx
];
2530 /* Adjust for the alignment. */
2531 idx
= (idx
+ 3) & ~3;
2532 /* Skip the multibyte collation sequence value. */
2533 idx
+= sizeof (unsigned int);
2534 /* Skip the wide char sequence of the collating element. */
2535 idx
+= sizeof (unsigned int) *
2536 (1 + *(unsigned int *) (extra
+ idx
));
2537 /* Return the collation sequence value. */
2538 return *(unsigned int *) (extra
+ idx
);
2540 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2542 /* No valid character. Match it as a single byte
2544 return collseqmb
[br_elem
->opr
.name
[0]];
2547 else if (sym_name_len
== 1)
2548 return collseqmb
[br_elem
->opr
.name
[0]];
2553 /* Local function for parse_bracket_exp used in _LIBC environement.
2554 Build the range expression which starts from START_ELEM, and ends
2555 at END_ELEM. The result are written to MBCSET and SBCSET.
2556 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2557 mbcset->range_ends, is a pointer argument sinse we may
2560 static inline reg_errcode_t
2561 # ifdef RE_ENABLE_I18N
2562 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2563 re_charset_t
*mbcset
;
2565 # else /* not RE_ENABLE_I18N */
2566 build_range_exp (sbcset
, start_elem
, end_elem
)
2567 # endif /* not RE_ENABLE_I18N */
2568 re_bitset_ptr_t sbcset
;
2569 bracket_elem_t
*start_elem
, *end_elem
;
2572 uint32_t start_collseq
;
2573 uint32_t end_collseq
;
2575 # ifdef RE_ENABLE_I18N
2576 /* Check the space of the arrays. */
2577 if (*range_alloc
== mbcset
->nranges
)
2579 /* There are not enough space, need realloc. */
2580 uint32_t *new_array_start
;
2581 uint32_t *new_array_end
;
2584 /* +1 in case of mbcset->nranges is 0. */
2585 new_nranges
= 2 * mbcset
->nranges
+ 1;
2586 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2587 are NULL if *range_alloc == 0. */
2588 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2590 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2593 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2596 mbcset
->range_starts
= new_array_start
;
2597 mbcset
->range_ends
= new_array_end
;
2598 *range_alloc
= new_nranges
;
2600 # endif /* RE_ENABLE_I18N */
2602 /* Equivalence Classes and Character Classes can't be a range
2604 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2605 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2609 start_collseq
= lookup_collation_sequence_value (start_elem
);
2610 end_collseq
= lookup_collation_sequence_value (end_elem
);
2611 /* Check start/end collation sequence values. */
2612 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2613 return REG_ECOLLATE
;
2614 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2617 # ifdef RE_ENABLE_I18N
2618 /* Got valid collation sequence values, add them as a new entry. */
2619 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2620 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2621 # endif /* RE_ENABLE_I18N */
2623 /* Build the table for single byte characters. */
2624 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2626 uint32_t ch_collseq
;
2628 if (MB_CUR_MAX == 1)
2631 ch_collseq
= collseqmb
[ch
];
2633 ch_collseq
= collseq_table_lookup (collseqwc
, __btowc (ch
));
2634 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2635 bitset_set (sbcset
, ch
);
2640 /* Local function for parse_bracket_exp used in _LIBC environement.
2641 Build the collating element which is represented by NAME.
2642 The result are written to MBCSET and SBCSET.
2643 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2644 pointer argument sinse we may update it. */
2646 static inline reg_errcode_t
2647 # ifdef RE_ENABLE_I18N
2648 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2649 re_charset_t
*mbcset
;
2650 int *coll_sym_alloc
;
2651 # else /* not RE_ENABLE_I18N */
2652 build_collating_symbol (sbcset
, name
)
2653 # endif /* not RE_ENABLE_I18N */
2654 re_bitset_ptr_t sbcset
;
2655 const unsigned char *name
;
2658 size_t name_len
= strlen ((const char *) name
);
2661 elem
= seek_collating_symbol_entry (name
, name_len
);
2662 if (symb_table
[2 * elem
] != 0)
2664 /* We found the entry. */
2665 idx
= symb_table
[2 * elem
+ 1];
2666 /* Skip the name of collating element name. */
2667 idx
+= 1 + extra
[idx
];
2669 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2671 /* No valid character, treat it as a normal
2673 bitset_set (sbcset
, name
[0]);
2677 return REG_ECOLLATE
;
2679 # ifdef RE_ENABLE_I18N
2680 /* Got valid collation sequence, add it as a new entry. */
2681 /* Check the space of the arrays. */
2682 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2684 /* Not enough, realloc it. */
2685 /* +1 in case of mbcset->ncoll_syms is 0. */
2686 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2687 /* Use realloc since mbcset->coll_syms is NULL
2689 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2691 if (BE (mbcset
->coll_syms
== NULL
, 0))
2694 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2695 # endif /* RE_ENABLE_I18N */
2700 if (BE (name_len
!= 1, 0))
2701 return REG_ECOLLATE
;
2704 bitset_set (sbcset
, name
[0]);
2711 re_token_t br_token
;
2712 re_bitset_ptr_t sbcset
;
2713 #ifdef RE_ENABLE_I18N
2714 re_charset_t
*mbcset
;
2715 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2716 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2717 #else /* not RE_ENABLE_I18N */
2719 #endif /* not RE_ENABLE_I18N */
2720 bin_tree_t
*work_tree
;
2721 int token_len
, new_idx
;
2723 collseqmb
= (const unsigned char *)
2724 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2725 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2731 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2732 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2733 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2734 _NL_COLLATE_SYMB_TABLEMB
);
2735 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2736 _NL_COLLATE_SYMB_EXTRAMB
);
2739 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2740 #ifdef RE_ENABLE_I18N
2741 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2742 #endif /* RE_ENABLE_I18N */
2743 #ifdef RE_ENABLE_I18N
2744 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2746 if (BE (sbcset
== NULL
, 0))
2747 #endif /* RE_ENABLE_I18N */
2753 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2754 if (BE (token
->type
== END_OF_RE
, 0))
2757 goto parse_bracket_exp_free_return
;
2759 if (token
->type
== OP_NON_MATCH_LIST
)
2761 #ifdef RE_ENABLE_I18N
2763 mbcset
->non_match
= 1;
2764 #else /* not RE_ENABLE_I18N */
2766 #endif /* not RE_ENABLE_I18N */
2767 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2768 bitset_set (sbcset
, '\0');
2769 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2770 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2771 if (BE (token
->type
== END_OF_RE
, 0))
2774 goto parse_bracket_exp_free_return
;
2776 #ifdef RE_ENABLE_I18N
2778 for (i
= 0; i
< SBC_MAX
; ++i
)
2779 if (__btowc (i
) == WEOF
)
2780 bitset_set (sbcset
, i
);
2781 #endif /* RE_ENABLE_I18N */
2784 /* We treat the first ']' as a normal character. */
2785 if (token
->type
== OP_CLOSE_BRACKET
)
2786 token
->type
= CHARACTER
;
2790 bracket_elem_t start_elem
, end_elem
;
2791 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2792 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2794 int token_len2
= 0, is_range_exp
= 0;
2797 start_elem
.opr
.name
= start_name_buf
;
2798 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2800 if (BE (ret
!= REG_NOERROR
, 0))
2803 goto parse_bracket_exp_free_return
;
2806 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2807 if (BE (token
->type
== END_OF_RE
, 0))
2810 goto parse_bracket_exp_free_return
;
2812 if (token
->type
== OP_CHARSET_RANGE
)
2814 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2815 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2816 if (BE (token
->type
== END_OF_RE
, 0))
2819 goto parse_bracket_exp_free_return
;
2821 if (token2
.type
== OP_CLOSE_BRACKET
)
2823 /* We treat the last '-' as a normal character. */
2824 re_string_skip_bytes (regexp
, -token_len
);
2825 token
->type
= CHARACTER
;
2831 if (is_range_exp
== 1)
2833 end_elem
.opr
.name
= end_name_buf
;
2834 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2836 if (BE (ret
!= REG_NOERROR
, 0))
2839 goto parse_bracket_exp_free_return
;
2842 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2843 if (BE (token
->type
== END_OF_RE
, 0))
2846 goto parse_bracket_exp_free_return
;
2848 *err
= build_range_exp (sbcset
,
2849 #ifdef RE_ENABLE_I18N
2850 mbcset
, &range_alloc
,
2851 #endif /* RE_ENABLE_I18N */
2852 &start_elem
, &end_elem
);
2853 if (BE (*err
!= REG_NOERROR
, 0))
2854 goto parse_bracket_exp_free_return
;
2858 switch (start_elem
.type
)
2861 bitset_set (sbcset
, start_elem
.opr
.ch
);
2863 #ifdef RE_ENABLE_I18N
2865 /* Check whether the array has enough space. */
2866 if (mbchar_alloc
== mbcset
->nmbchars
)
2868 /* Not enough, realloc it. */
2869 /* +1 in case of mbcset->nmbchars is 0. */
2870 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2871 /* Use realloc since array is NULL if *alloc == 0. */
2872 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2874 if (BE (mbcset
->mbchars
== NULL
, 0))
2875 goto parse_bracket_exp_espace
;
2877 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2879 #endif /* RE_ENABLE_I18N */
2881 *err
= build_equiv_class (sbcset
,
2882 #ifdef RE_ENABLE_I18N
2883 mbcset
, &equiv_class_alloc
,
2884 #endif /* RE_ENABLE_I18N */
2885 start_elem
.opr
.name
);
2886 if (BE (*err
!= REG_NOERROR
, 0))
2887 goto parse_bracket_exp_free_return
;
2890 *err
= build_collating_symbol (sbcset
,
2891 #ifdef RE_ENABLE_I18N
2892 mbcset
, &coll_sym_alloc
,
2893 #endif /* RE_ENABLE_I18N */
2894 start_elem
.opr
.name
);
2895 if (BE (*err
!= REG_NOERROR
, 0))
2896 goto parse_bracket_exp_free_return
;
2899 ret
= build_charclass (sbcset
,
2900 #ifdef RE_ENABLE_I18N
2901 mbcset
, &char_class_alloc
,
2902 #endif /* RE_ENABLE_I18N */
2903 start_elem
.opr
.name
, syntax
);
2904 if (BE (ret
!= REG_NOERROR
, 0))
2905 goto parse_bracket_exp_espace
;
2912 if (token
->type
== OP_CLOSE_BRACKET
)
2916 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2918 /* If it is non-matching list. */
2919 #ifdef RE_ENABLE_I18N
2920 if (mbcset
->non_match
)
2921 #else /* not RE_ENABLE_I18N */
2923 #endif /* not RE_ENABLE_I18N */
2924 bitset_not (sbcset
);
2926 /* Build a tree for simple bracket. */
2927 br_token
.type
= SIMPLE_BRACKET
;
2928 br_token
.opr
.sbcset
= sbcset
;
2929 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2930 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2931 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
2932 goto parse_bracket_exp_espace
;
2934 #ifdef RE_ENABLE_I18N
2935 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
2936 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
2937 || mbcset
->non_match
)))
2939 re_token_t alt_token
;
2940 bin_tree_t
*mbc_tree
;
2941 /* Build a tree for complex bracket. */
2942 br_token
.type
= COMPLEX_BRACKET
;
2943 br_token
.opr
.mbcset
= mbcset
;
2944 dfa
->has_mb_node
= 1;
2945 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2946 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2947 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
2948 goto parse_bracket_exp_espace
;
2949 /* Then join them by ALT node. */
2950 dfa
->has_plural_match
= 1;
2951 alt_token
.type
= OP_ALT
;
2952 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
2953 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
2954 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
2959 free_charset (mbcset
);
2962 #else /* not RE_ENABLE_I18N */
2964 #endif /* not RE_ENABLE_I18N */
2966 parse_bracket_exp_espace
:
2968 parse_bracket_exp_free_return
:
2970 #ifdef RE_ENABLE_I18N
2971 free_charset (mbcset
);
2972 #endif /* RE_ENABLE_I18N */
2976 /* Parse an element in the bracket expression. */
2978 static reg_errcode_t
2979 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
2980 bracket_elem_t
*elem
;
2981 re_string_t
*regexp
;
2985 reg_syntax_t syntax
;
2987 #ifdef RE_ENABLE_I18N
2989 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
2990 if (cur_char_size
> 1)
2992 elem
->type
= MB_CHAR
;
2993 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
2994 re_string_skip_bytes (regexp
, cur_char_size
);
2997 #endif /* RE_ENABLE_I18N */
2998 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2999 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3000 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3001 return parse_bracket_symbol (elem
, regexp
, token
);
3002 elem
->type
= SB_CHAR
;
3003 elem
->opr
.ch
= token
->opr
.c
;
3007 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3008 such as [:<character_class>:], [.<collating_element>.], and
3009 [=<equivalent_class>=]. */
3011 static reg_errcode_t
3012 parse_bracket_symbol (elem
, regexp
, token
)
3013 bracket_elem_t
*elem
;
3014 re_string_t
*regexp
;
3017 unsigned char ch
, delim
= token
->opr
.c
;
3021 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3023 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3024 ch
= re_string_fetch_byte_case (regexp
);
3026 ch
= re_string_fetch_byte (regexp
);
3027 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3029 elem
->opr
.name
[i
] = ch
;
3031 re_string_skip_bytes (regexp
, 1);
3032 elem
->opr
.name
[i
] = '\0';
3033 switch (token
->type
)
3035 case OP_OPEN_COLL_ELEM
:
3036 elem
->type
= COLL_SYM
;
3038 case OP_OPEN_EQUIV_CLASS
:
3039 elem
->type
= EQUIV_CLASS
;
3041 case OP_OPEN_CHAR_CLASS
:
3042 elem
->type
= CHAR_CLASS
;
3050 /* Helper function for parse_bracket_exp.
3051 Build the equivalence class which is represented by NAME.
3052 The result are written to MBCSET and SBCSET.
3053 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3054 is a pointer argument sinse we may update it. */
3056 static reg_errcode_t
3057 #ifdef RE_ENABLE_I18N
3058 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3059 re_charset_t
*mbcset
;
3060 int *equiv_class_alloc
;
3061 #else /* not RE_ENABLE_I18N */
3062 build_equiv_class (sbcset
, name
)
3063 #endif /* not RE_ENABLE_I18N */
3064 re_bitset_ptr_t sbcset
;
3065 const unsigned char *name
;
3067 #if defined _LIBC && defined RE_ENABLE_I18N
3068 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3071 const int32_t *table
, *indirect
;
3072 const unsigned char *weights
, *extra
, *cp
;
3073 unsigned char char_buf
[2];
3077 /* This #include defines a local function! */
3078 # include <locale/weight.h>
3079 /* Calculate the index for equivalence class. */
3081 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3082 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3083 _NL_COLLATE_WEIGHTMB
);
3084 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3085 _NL_COLLATE_EXTRAMB
);
3086 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3087 _NL_COLLATE_INDIRECTMB
);
3088 idx1
= findidx (&cp
);
3089 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3090 /* This isn't a valid character. */
3091 return REG_ECOLLATE
;
3093 /* Build single byte matcing table for this equivalence class. */
3094 char_buf
[1] = (unsigned char) '\0';
3095 len
= weights
[idx1
];
3096 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3100 idx2
= findidx (&cp
);
3105 /* This isn't a valid character. */
3107 if (len
== weights
[idx2
])
3110 while (cnt
<= len
&&
3111 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3115 bitset_set (sbcset
, ch
);
3118 /* Check whether the array has enough space. */
3119 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3121 /* Not enough, realloc it. */
3122 /* +1 in case of mbcset->nequiv_classes is 0. */
3123 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3124 /* Use realloc since the array is NULL if *alloc == 0. */
3125 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3126 *equiv_class_alloc
);
3127 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3130 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3133 #endif /* _LIBC && RE_ENABLE_I18N */
3135 if (BE (strlen ((const char *) name
) != 1, 0))
3136 return REG_ECOLLATE
;
3137 bitset_set (sbcset
, *name
);
3142 /* Helper function for parse_bracket_exp.
3143 Build the character class which is represented by NAME.
3144 The result are written to MBCSET and SBCSET.
3145 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3146 is a pointer argument sinse we may update it. */
3148 static reg_errcode_t
3149 #ifdef RE_ENABLE_I18N
3150 build_charclass (sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3151 re_charset_t
*mbcset
;
3152 int *char_class_alloc
;
3153 #else /* not RE_ENABLE_I18N */
3154 build_charclass (sbcset
, class_name
, syntax
)
3155 #endif /* not RE_ENABLE_I18N */
3156 re_bitset_ptr_t sbcset
;
3157 const unsigned char *class_name
;
3158 reg_syntax_t syntax
;
3161 const char *name
= (const char *) class_name
;
3163 /* In case of REG_ICASE "upper" and "lower" match the both of
3164 upper and lower cases. */
3165 if ((syntax
& RE_ICASE
)
3166 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3169 #ifdef RE_ENABLE_I18N
3170 /* Check the space of the arrays. */
3171 if (*char_class_alloc
== mbcset
->nchar_classes
)
3173 /* Not enough, realloc it. */
3174 /* +1 in case of mbcset->nchar_classes is 0. */
3175 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3176 /* Use realloc since array is NULL if *alloc == 0. */
3177 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3179 if (BE (mbcset
->char_classes
== NULL
, 0))
3182 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3183 #endif /* RE_ENABLE_I18N */
3185 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3186 for (i = 0; i < SBC_MAX; ++i) \
3188 if (ctype_func (i)) \
3189 bitset_set (sbcset, i); \
3192 if (strcmp (name
, "alnum") == 0)
3193 BUILD_CHARCLASS_LOOP (isalnum
)
3194 else if (strcmp (name
, "cntrl") == 0)
3195 BUILD_CHARCLASS_LOOP (iscntrl
)
3196 else if (strcmp (name
, "lower") == 0)
3197 BUILD_CHARCLASS_LOOP (islower
)
3198 else if (strcmp (name
, "space") == 0)
3199 BUILD_CHARCLASS_LOOP (isspace
)
3200 else if (strcmp (name
, "alpha") == 0)
3201 BUILD_CHARCLASS_LOOP (isalpha
)
3202 else if (strcmp (name
, "digit") == 0)
3203 BUILD_CHARCLASS_LOOP (isdigit
)
3204 else if (strcmp (name
, "print") == 0)
3205 BUILD_CHARCLASS_LOOP (isprint
)
3206 else if (strcmp (name
, "upper") == 0)
3207 BUILD_CHARCLASS_LOOP (isupper
)
3208 else if (strcmp (name
, "blank") == 0)
3209 BUILD_CHARCLASS_LOOP (isblank
)
3210 else if (strcmp (name
, "graph") == 0)
3211 BUILD_CHARCLASS_LOOP (isgraph
)
3212 else if (strcmp (name
, "punct") == 0)
3213 BUILD_CHARCLASS_LOOP (ispunct
)
3214 else if (strcmp (name
, "xdigit") == 0)
3215 BUILD_CHARCLASS_LOOP (isxdigit
)
3223 build_word_op (dfa
, not, err
)
3228 re_bitset_ptr_t sbcset
;
3229 #ifdef RE_ENABLE_I18N
3230 re_charset_t
*mbcset
;
3232 #else /* not RE_ENABLE_I18N */
3234 #endif /* not RE_ENABLE_I18N */
3236 re_token_t br_token
;
3240 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3241 #ifdef RE_ENABLE_I18N
3242 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3243 #endif /* RE_ENABLE_I18N */
3245 #ifdef RE_ENABLE_I18N
3246 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3247 #else /* not RE_ENABLE_I18N */
3248 if (BE (sbcset
== NULL
, 0))
3249 #endif /* not RE_ENABLE_I18N */
3257 #ifdef RE_ENABLE_I18N
3260 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3261 bitset_set(cset->sbcset, '\0');
3263 mbcset
->non_match
= 1;
3265 for (i
= 0; i
< SBC_MAX
; ++i
)
3266 if (__btowc (i
) == WEOF
)
3267 bitset_set (sbcset
, i
);
3268 #else /* not RE_ENABLE_I18N */
3270 #endif /* not RE_ENABLE_I18N */
3273 /* We don't care the syntax in this case. */
3274 ret
= build_charclass (sbcset
,
3275 #ifdef RE_ENABLE_I18N
3277 #endif /* RE_ENABLE_I18N */
3278 (const unsigned char *) "alpha", 0);
3280 if (BE (ret
!= REG_NOERROR
, 0))
3283 #ifdef RE_ENABLE_I18N
3284 free_charset (mbcset
);
3285 #endif /* RE_ENABLE_I18N */
3289 /* \w match '_' also. */
3290 bitset_set (sbcset
, '_');
3292 /* If it is non-matching list. */
3293 #ifdef RE_ENABLE_I18N
3294 if (mbcset
->non_match
)
3295 #else /* not RE_ENABLE_I18N */
3297 #endif /* not RE_ENABLE_I18N */
3298 bitset_not (sbcset
);
3300 /* Build a tree for simple bracket. */
3301 br_token
.type
= SIMPLE_BRACKET
;
3302 br_token
.opr
.sbcset
= sbcset
;
3303 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3304 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3305 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3306 goto build_word_op_espace
;
3308 #ifdef RE_ENABLE_I18N
3311 re_token_t alt_token
;
3312 bin_tree_t
*mbc_tree
;
3313 /* Build a tree for complex bracket. */
3314 br_token
.type
= COMPLEX_BRACKET
;
3315 br_token
.opr
.mbcset
= mbcset
;
3316 dfa
->has_mb_node
= 1;
3317 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3318 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3319 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3320 goto build_word_op_espace
;
3321 /* Then join them by ALT node. */
3322 alt_token
.type
= OP_ALT
;
3323 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3324 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3325 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3330 free_charset (mbcset
);
3333 #else /* not RE_ENABLE_I18N */
3335 #endif /* not RE_ENABLE_I18N */
3337 build_word_op_espace
:
3339 #ifdef RE_ENABLE_I18N
3340 free_charset (mbcset
);
3341 #endif /* RE_ENABLE_I18N */
3346 /* This is intended for the expressions like "a{1,3}".
3347 Fetch a number from `input', and return the number.
3348 Return -1, if the number field is empty like "{,1}".
3349 Return -2, If an error is occured. */
3352 fetch_number (input
, token
, syntax
)
3355 reg_syntax_t syntax
;
3361 *token
= fetch_token (input
, syntax
);
3363 if (BE (token
->type
== END_OF_RE
, 0))
3365 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3367 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3368 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3369 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3374 #ifdef RE_ENABLE_I18N
3376 free_charset (re_charset_t
*cset
)
3378 re_free (cset
->mbchars
);
3380 re_free (cset
->coll_syms
);
3381 re_free (cset
->equiv_classes
);
3382 re_free (cset
->range_starts
);
3383 re_free (cset
->range_ends
);
3385 re_free (cset
->char_classes
);
3388 #endif /* RE_ENABLE_I18N */
3390 /* Functions for binary tree operation. */
3392 /* Create a node of tree.
3393 Note: This function automatically free left and right if malloc fails. */
3396 create_tree (left
, right
, type
, index
)
3399 re_token_type_t type
;
3403 tree
= re_malloc (bin_tree_t
, 1);
3404 if (BE (tree
== NULL
, 0))
3406 free_bin_tree (left
);
3407 free_bin_tree (right
);
3410 tree
->parent
= NULL
;
3412 tree
->right
= right
;
3414 tree
->node_idx
= index
;
3417 re_node_set_init_empty (&tree
->eclosure
);
3420 left
->parent
= tree
;
3422 right
->parent
= tree
;
3426 /* Free the sub tree pointed by TREE. */
3429 free_bin_tree (tree
)
3434 /*re_node_set_free (&tree->eclosure);*/
3435 free_bin_tree (tree
->left
);
3436 free_bin_tree (tree
->right
);
3440 /* Duplicate the node SRC, and return new node. */
3443 duplicate_tree (src
, dfa
)
3444 const bin_tree_t
*src
;
3447 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3449 /* Since node indies must be according to Post-order of the tree,
3450 we must duplicate the left at first. */
3451 if (src
->left
!= NULL
)
3453 left
= duplicate_tree (src
->left
, dfa
);
3458 /* Secondaly, duplicate the right. */
3459 if (src
->right
!= NULL
)
3461 right
= duplicate_tree (src
->right
, dfa
);
3464 free_bin_tree (left
);
3469 /* At last, duplicate itself. */
3470 if (src
->type
== NON_TYPE
)
3472 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3473 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3474 if (BE (new_node_idx
== -1, 0))
3476 free_bin_tree (left
);
3477 free_bin_tree (right
);
3482 new_node_idx
= src
->type
;
3484 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3485 if (BE (new_tree
== NULL
, 0))
3487 free_bin_tree (left
);
3488 free_bin_tree (right
);