1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
72 int length
, reg_syntax_t syntax
);
73 static void re_compile_fastmap_iter (regex_t
*bufp
,
74 const re_dfastate_t
*init_state
,
76 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
77 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
79 static void free_charset (re_charset_t
*cset
);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t
*preg
);
82 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
83 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
84 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
85 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
86 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
87 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
88 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
89 int top_clone_node
, int root_node
,
90 unsigned int constraint
);
91 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
92 unsigned int constraint
);
93 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
94 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
96 static void calc_inveclosure (re_dfa_t
*dfa
);
97 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
99 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
100 static int peek_token (re_token_t
*token
, re_string_t
*input
,
101 reg_syntax_t syntax
);
102 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
103 reg_syntax_t syntax
);
104 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
105 reg_syntax_t syntax
, reg_errcode_t
*err
);
106 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
107 re_token_t
*token
, reg_syntax_t syntax
,
108 int nest
, reg_errcode_t
*err
);
109 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
110 re_token_t
*token
, reg_syntax_t syntax
,
111 int nest
, reg_errcode_t
*err
);
112 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
113 re_token_t
*token
, reg_syntax_t syntax
,
114 int nest
, reg_errcode_t
*err
);
115 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
116 re_token_t
*token
, reg_syntax_t syntax
,
117 int nest
, reg_errcode_t
*err
);
118 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
119 re_dfa_t
*dfa
, re_token_t
*token
,
120 reg_syntax_t syntax
, reg_errcode_t
*err
);
121 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
122 re_token_t
*token
, reg_syntax_t syntax
,
124 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
126 re_token_t
*token
, int token_len
,
128 reg_syntax_t syntax
);
129 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
135 re_charset_t
*mbcset
, int *range_alloc
,
136 bracket_elem_t
*start_elem
,
137 bracket_elem_t
*end_elem
);
138 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
139 re_charset_t
*mbcset
,
141 const unsigned char *name
);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
144 bracket_elem_t
*start_elem
,
145 bracket_elem_t
*end_elem
);
146 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
147 const unsigned char *name
);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
152 re_charset_t
*mbcset
,
153 int *equiv_class_alloc
,
154 const unsigned char *name
);
155 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
156 re_charset_t
*mbcset
,
157 int *char_class_alloc
,
158 const unsigned char *class_name
,
159 reg_syntax_t syntax
);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
162 const unsigned char *name
);
163 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
164 const unsigned char *class_name
,
165 reg_syntax_t syntax
);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, int not, reg_errcode_t
*err
);
168 static void free_bin_tree (bin_tree_t
*tree
);
169 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
170 re_token_type_t type
, int index
);
171 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid
[] attribute_hidden
=
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx
[] attribute_hidden
=
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
263 re_compile_pattern (pattern
, length
, bufp
)
266 struct re_pattern_buffer
*bufp
;
270 /* And GNU code determines whether or not to get register information
271 by passing null for the REGS argument to re_match, etc., not by
275 /* Match anchors at newline. */
276 bufp
->newline_anchor
= 1;
278 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
282 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
285 weak_alias (__re_compile_pattern
, re_compile_pattern
)
288 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
289 also be assigned to arbitrarily: each pattern buffer stores its own
290 syntax, so it can be changed between regex compilations. */
291 /* This has no initializer because initialized variables in Emacs
292 become read-only after dumping. */
293 reg_syntax_t re_syntax_options
;
296 /* Specify the precise syntax of regexps for compilation. This provides
297 for compatibility for various utilities which historically have
298 different, incompatible syntaxes.
300 The argument SYNTAX is a bit mask comprised of the various bits
301 defined in regex.h. We return the old syntax. */
304 re_set_syntax (syntax
)
307 reg_syntax_t ret
= re_syntax_options
;
309 re_syntax_options
= syntax
;
313 weak_alias (__re_set_syntax
, re_set_syntax
)
317 re_compile_fastmap (bufp
)
318 struct re_pattern_buffer
*bufp
;
320 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
321 char *fastmap
= bufp
->fastmap
;
323 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
324 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
325 if (dfa
->init_state
!= dfa
->init_state_word
)
326 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
327 if (dfa
->init_state
!= dfa
->init_state_nl
)
328 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
329 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
330 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
331 bufp
->fastmap_accurate
= 1;
335 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
339 re_set_fastmap (char *fastmap
, int icase
, int ch
)
343 fastmap
[tolower (ch
)] = 1;
346 /* Helper function for re_compile_fastmap.
347 Compile fastmap for the initial_state INIT_STATE. */
350 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
352 const re_dfastate_t
*init_state
;
355 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
357 int icase
= (MB_CUR_MAX
== 1 && (bufp
->syntax
& RE_ICASE
));
358 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
360 int node
= init_state
->nodes
.elems
[node_cnt
];
361 re_token_type_t type
= dfa
->nodes
[node
].type
;
363 if (type
== CHARACTER
)
364 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
365 else if (type
== SIMPLE_BRACKET
)
368 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
369 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
370 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
371 re_set_fastmap (fastmap
, icase
, ch
);
373 #ifdef RE_ENABLE_I18N
374 else if (type
== COMPLEX_BRACKET
)
377 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
378 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
379 || cset
->nranges
|| cset
->nchar_classes
)
382 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
391 const int32_t *table
= (const int32_t *)
392 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
393 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
394 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
396 re_set_fastmap (fastmap
, icase
, ch
);
400 for (i
= 0; i
< SBC_MAX
; ++i
)
401 if (__btowc (i
) == WEOF
)
402 re_set_fastmap (fastmap
, icase
, i
);
403 # endif /* not _LIBC */
405 for (i
= 0; i
< cset
->nmbchars
; ++i
)
409 memset (&state
, '\0', sizeof (state
));
410 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
411 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
414 #endif /* RE_ENABLE_I18N */
415 else if (type
== END_OF_RE
|| type
== OP_PERIOD
)
417 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
418 if (type
== END_OF_RE
)
419 bufp
->can_be_null
= 1;
425 /* Entry point for POSIX code. */
426 /* regcomp takes a regular expression as a string and compiles it.
428 PREG is a regex_t *. We do not expect any fields to be initialized,
429 since POSIX says we shouldn't. Thus, we set
431 `buffer' to the compiled pattern;
432 `used' to the length of the compiled pattern;
433 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
434 REG_EXTENDED bit in CFLAGS is set; otherwise, to
435 RE_SYNTAX_POSIX_BASIC;
436 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
437 `fastmap' to an allocated space for the fastmap;
438 `fastmap_accurate' to zero;
439 `re_nsub' to the number of subexpressions in PATTERN.
441 PATTERN is the address of the pattern string.
443 CFLAGS is a series of bits which affect compilation.
445 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
446 use POSIX basic syntax.
448 If REG_NEWLINE is set, then . and [^...] don't match newline.
449 Also, regexec will try a match beginning after every newline.
451 If REG_ICASE is set, then we considers upper- and lowercase
452 versions of letters to be equivalent when matching.
454 If REG_NOSUB is set, then when PREG is passed to regexec, that
455 routine will report only success or failure, and nothing about the
458 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
459 the return codes and their meanings.) */
462 regcomp (preg
, pattern
, cflags
)
463 regex_t
*__restrict preg
;
464 const char *__restrict pattern
;
468 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
469 : RE_SYNTAX_POSIX_BASIC
);
475 /* Try to allocate space for the fastmap. */
476 preg
->fastmap
= re_malloc (char, SBC_MAX
);
477 if (BE (preg
->fastmap
== NULL
, 0))
480 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
482 /* If REG_NEWLINE is set, newlines are treated differently. */
483 if (cflags
& REG_NEWLINE
)
484 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
485 syntax
&= ~RE_DOT_NEWLINE
;
486 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
487 /* It also changes the matching behavior. */
488 preg
->newline_anchor
= 1;
491 preg
->newline_anchor
= 0;
492 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
493 preg
->translate
= NULL
;
495 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
497 /* POSIX doesn't distinguish between an unmatched open-group and an
498 unmatched close-group: both are REG_EPAREN. */
499 if (ret
== REG_ERPAREN
)
502 /* We have already checked preg->fastmap != NULL. */
503 if (BE (ret
== REG_NOERROR
, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
505 buffer. This function nevers fails in this implementation. */
506 (void) __re_compile_fastmap (preg
);
509 /* Some error occurred while compiling the expression. */
510 re_free (preg
->fastmap
);
511 preg
->fastmap
= NULL
;
517 weak_alias (__regcomp
, regcomp
)
520 /* Returns a message corresponding to an error code, ERRCODE, returned
521 from either regcomp or regexec. We don't use PREG here. */
524 regerror (errcode
, preg
, errbuf
, errbuf_size
)
534 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
535 / sizeof (__re_error_msgid_idx
[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
542 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
544 msg_size
= strlen (msg
) + 1; /* Includes the null. */
546 if (BE (errbuf_size
!= 0, 1))
548 if (BE (msg_size
> errbuf_size
, 0))
550 #if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
553 memcpy (errbuf
, msg
, errbuf_size
- 1);
554 errbuf
[errbuf_size
- 1] = 0;
558 memcpy (errbuf
, msg
, msg_size
);
564 weak_alias (__regerror
, regerror
)
569 free_dfa_content (re_dfa_t
*dfa
)
573 re_free (dfa
->subexps
);
575 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
577 re_token_t
*node
= dfa
->nodes
+ i
;
578 #ifdef RE_ENABLE_I18N
579 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
580 free_charset (node
->opr
.mbcset
);
582 #endif /* RE_ENABLE_I18N */
583 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
584 re_free (node
->opr
.sbcset
);
586 re_free (dfa
->nexts
);
587 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
589 if (dfa
->eclosures
!= NULL
)
590 re_node_set_free (dfa
->eclosures
+ i
);
591 if (dfa
->inveclosures
!= NULL
)
592 re_node_set_free (dfa
->inveclosures
+ i
);
593 if (dfa
->edests
!= NULL
)
594 re_node_set_free (dfa
->edests
+ i
);
596 re_free (dfa
->edests
);
597 re_free (dfa
->eclosures
);
598 re_free (dfa
->inveclosures
);
599 re_free (dfa
->nodes
);
601 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
603 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
604 for (j
= 0; j
< entry
->num
; ++j
)
606 re_dfastate_t
*state
= entry
->array
[j
];
609 re_free (entry
->array
);
611 re_free (dfa
->state_table
);
613 if (dfa
->word_char
!= NULL
)
614 re_free (dfa
->word_char
);
616 re_free (dfa
->re_str
);
623 /* Free dynamically allocated space used by PREG. */
629 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
630 if (BE (dfa
!= NULL
, 1))
631 free_dfa_content (dfa
);
633 re_free (preg
->fastmap
);
636 weak_alias (__regfree
, regfree
)
639 /* Entry points compatible with 4.2 BSD regex library. We don't define
640 them unless specifically requested. */
642 #if defined _REGEX_RE_COMP || defined _LIBC
644 /* BSD has one and only one pattern buffer. */
645 static struct re_pattern_buffer re_comp_buf
;
649 /* Make these definitions weak in libc, so POSIX programs can redefine
650 these names if they don't use our functions, and still use
651 regcomp/regexec above without link errors. */
662 if (!re_comp_buf
.buffer
)
663 return gettext ("No previous regular expression");
667 if (re_comp_buf
.buffer
)
669 fastmap
= re_comp_buf
.fastmap
;
670 re_comp_buf
.fastmap
= NULL
;
671 __regfree (&re_comp_buf
);
672 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
673 re_comp_buf
.fastmap
= fastmap
;
676 if (re_comp_buf
.fastmap
== NULL
)
678 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
679 if (re_comp_buf
.fastmap
== NULL
)
680 return (char *) gettext (__re_error_msgid
681 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
684 /* Since `re_exec' always passes NULL for the `regs' argument, we
685 don't need to initialize the pattern buffer fields which affect it. */
687 /* Match anchors at newlines. */
688 re_comp_buf
.newline_anchor
= 1;
690 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
695 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
696 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
700 libc_freeres_fn (free_mem
)
702 __regfree (&re_comp_buf
);
706 #endif /* _REGEX_RE_COMP */
708 /* Internal entry point.
709 Compile the regular expression PATTERN, whose length is LENGTH.
710 SYNTAX indicate regular expression's syntax. */
713 re_compile_internal (preg
, pattern
, length
, syntax
)
715 const char * pattern
;
719 reg_errcode_t err
= REG_NOERROR
;
723 /* Initialize the pattern buffer. */
724 preg
->fastmap_accurate
= 0;
725 preg
->syntax
= syntax
;
726 preg
->not_bol
= preg
->not_eol
= 0;
729 preg
->can_be_null
= 0;
730 preg
->regs_allocated
= REGS_UNALLOCATED
;
732 /* Initialize the dfa. */
733 dfa
= (re_dfa_t
*) preg
->buffer
;
734 if (preg
->allocated
< sizeof (re_dfa_t
))
736 /* If zero allocated, but buffer is non-null, try to realloc
737 enough space. This loses if buffer's address is bogus, but
738 that is the user's responsibility. If ->buffer is NULL this
739 is a simple allocation. */
740 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
743 preg
->allocated
= sizeof (re_dfa_t
);
745 preg
->buffer
= (unsigned char *) dfa
;
746 preg
->used
= sizeof (re_dfa_t
);
748 err
= init_dfa (dfa
, length
);
749 if (BE (err
!= REG_NOERROR
, 0))
756 dfa
->re_str
= re_malloc (char, length
+ 1);
757 strncpy (dfa
->re_str
, pattern
, length
+ 1);
760 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
762 if (BE (err
!= REG_NOERROR
, 0))
769 /* Parse the regular expression, and build a structure tree. */
771 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
772 if (BE (dfa
->str_tree
== NULL
, 0))
773 goto re_compile_internal_free_return
;
775 /* Analyze the tree and collect information which is necessary to
778 if (BE (err
!= REG_NOERROR
, 0))
779 goto re_compile_internal_free_return
;
781 /* Then create the initial state of the dfa. */
782 err
= create_initial_state (dfa
);
784 /* Release work areas. */
785 free_workarea_compile (preg
);
786 re_string_destruct (®exp
);
788 if (BE (err
!= REG_NOERROR
, 0))
790 re_compile_internal_free_return
:
791 free_dfa_content (dfa
);
798 /* Initialize DFA. We use the length of the regular expression PAT_LEN
799 as the initial length of some arrays. */
802 init_dfa (dfa
, pat_len
)
808 memset (dfa
, '\0', sizeof (re_dfa_t
));
810 dfa
->nodes_alloc
= pat_len
+ 1;
811 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
813 dfa
->states_alloc
= pat_len
+ 1;
815 /* table_size = 2 ^ ceil(log pat_len) */
816 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
817 if (table_size
> pat_len
)
820 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
821 dfa
->state_hash_mask
= table_size
- 1;
823 dfa
->subexps_alloc
= 1;
824 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
825 dfa
->word_char
= NULL
;
827 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
828 || dfa
->subexps
== NULL
, 0))
830 /* We don't bother to free anything which was allocated. Very
831 soon the process will go down anyway. */
833 dfa
->state_table
= NULL
;
840 /* Initialize WORD_CHAR table, which indicate which character is
841 "word". In this case "word" means that it is the word construction
842 character used by some operators like "\<", "\>", etc. */
849 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
850 if (BE (dfa
->word_char
== NULL
, 0))
852 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
853 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
854 if (isalnum (ch
) || ch
== '_')
855 dfa
->word_char
[i
] |= 1 << j
;
859 /* Free the work area which are only used while compiling. */
862 free_workarea_compile (preg
)
865 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
866 free_bin_tree (dfa
->str_tree
);
867 dfa
->str_tree
= NULL
;
870 /* Create initial states for all contexts. */
873 create_initial_state (dfa
)
878 re_node_set init_nodes
;
880 /* Initial states have the epsilon closure of the node which is
881 the first node of the regular expression. */
882 first
= dfa
->str_tree
->first
;
883 dfa
->init_node
= first
;
884 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
885 if (BE (err
!= REG_NOERROR
, 0))
888 /* The back-references which are in initial states can epsilon transit,
889 since in this case all of the subexpressions can be null.
890 Then we add epsilon closures of the nodes which are the next nodes of
891 the back-references. */
892 if (dfa
->nbackref
> 0)
893 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
895 int node_idx
= init_nodes
.elems
[i
];
896 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
899 if (type
!= OP_BACK_REF
)
901 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
903 re_token_t
*clexp_node
;
904 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
905 if (clexp_node
->type
== OP_CLOSE_SUBEXP
906 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
909 if (clexp_idx
== init_nodes
.nelem
)
912 if (type
== OP_BACK_REF
)
914 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
915 if (!re_node_set_contains (&init_nodes
, dest_idx
))
917 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
923 /* It must be the first time to invoke acquire_state. */
924 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
925 /* We don't check ERR here, since the initial state must not be NULL. */
926 if (BE (dfa
->init_state
== NULL
, 0))
928 if (dfa
->init_state
->has_constraint
)
930 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
932 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
934 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
938 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
939 || dfa
->init_state_begbuf
== NULL
, 0))
943 dfa
->init_state_word
= dfa
->init_state_nl
944 = dfa
->init_state_begbuf
= dfa
->init_state
;
946 re_node_set_free (&init_nodes
);
950 /* Analyze the structure tree, and calculate "first", "next", "edest",
951 "eclosure", and "inveclosure". */
960 /* Allocate arrays. */
961 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
962 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
963 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
964 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
965 if (BE (dfa
->nexts
== NULL
|| dfa
->edests
== NULL
966 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
968 /* Initialize them. */
969 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
972 re_node_set_init_empty (dfa
->edests
+ i
);
973 re_node_set_init_empty (dfa
->eclosures
+ i
);
974 re_node_set_init_empty (dfa
->inveclosures
+ i
);
977 ret
= analyze_tree (dfa
, dfa
->str_tree
);
978 if (BE (ret
== REG_NOERROR
, 1))
980 ret
= calc_eclosure (dfa
);
981 if (ret
== REG_NOERROR
)
982 calc_inveclosure (dfa
);
987 /* Helper functions for analyze.
988 This function calculate "first", "next", and "edest" for the subtree
989 whose root is NODE. */
992 analyze_tree (dfa
, node
)
997 if (node
->first
== -1)
998 calc_first (dfa
, node
);
999 if (node
->next
== -1)
1000 calc_next (dfa
, node
);
1001 if (node
->eclosure
.nelem
== 0)
1002 calc_epsdest (dfa
, node
);
1003 /* Calculate "first" etc. for the left child. */
1004 if (node
->left
!= NULL
)
1006 ret
= analyze_tree (dfa
, node
->left
);
1007 if (BE (ret
!= REG_NOERROR
, 0))
1010 /* Calculate "first" etc. for the right child. */
1011 if (node
->right
!= NULL
)
1013 ret
= analyze_tree (dfa
, node
->right
);
1014 if (BE (ret
!= REG_NOERROR
, 0))
1020 /* Calculate "first" for the node NODE. */
1022 calc_first (dfa
, node
)
1027 idx
= node
->node_idx
;
1028 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1033 case OP_OPEN_BRACKET
:
1034 case OP_CLOSE_BRACKET
:
1035 case OP_OPEN_DUP_NUM
:
1036 case OP_CLOSE_DUP_NUM
:
1037 case OP_NON_MATCH_LIST
:
1038 case OP_OPEN_COLL_ELEM
:
1039 case OP_CLOSE_COLL_ELEM
:
1040 case OP_OPEN_EQUIV_CLASS
:
1041 case OP_CLOSE_EQUIV_CLASS
:
1042 case OP_OPEN_CHAR_CLASS
:
1043 case OP_CLOSE_CHAR_CLASS
:
1044 /* These must not be appeared here. */
1050 case OP_DUP_ASTERISK
:
1051 case OP_DUP_QUESTION
:
1052 #ifdef RE_ENABLE_I18N
1053 case COMPLEX_BRACKET
:
1054 #endif /* RE_ENABLE_I18N */
1055 case SIMPLE_BRACKET
:
1058 case OP_OPEN_SUBEXP
:
1059 case OP_CLOSE_SUBEXP
:
1064 assert (node
->left
!= NULL
);
1066 if (node
->left
->first
== -1)
1067 calc_first (dfa
, node
->left
);
1068 node
->first
= node
->left
->first
;
1073 /* else fall through */
1076 assert (node
->left
!= NULL
);
1078 if (node
->left
->first
== -1)
1079 calc_first (dfa
, node
->left
);
1080 node
->first
= node
->left
->first
;
1085 /* Calculate "next" for the node NODE. */
1088 calc_next (dfa
, node
)
1093 bin_tree_t
*parent
= node
->parent
;
1097 idx
= node
->node_idx
;
1098 if (node
->type
== 0)
1099 dfa
->nexts
[idx
] = node
->next
;
1103 idx
= parent
->node_idx
;
1104 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1108 case OP_DUP_ASTERISK
:
1113 if (parent
->left
== node
)
1115 if (parent
->right
->first
== -1)
1116 calc_first (dfa
, parent
->right
);
1117 node
->next
= parent
->right
->first
;
1120 /* else fall through */
1122 if (parent
->next
== -1)
1123 calc_next (dfa
, parent
);
1124 node
->next
= parent
->next
;
1127 idx
= node
->node_idx
;
1128 if (node
->type
== 0)
1129 dfa
->nexts
[idx
] = node
->next
;
1132 /* Calculate "edest" for the node NODE. */
1135 calc_epsdest (dfa
, node
)
1140 idx
= node
->node_idx
;
1141 if (node
->type
== 0)
1143 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1144 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1145 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1147 if (node
->left
->first
== -1)
1148 calc_first (dfa
, node
->left
);
1149 if (node
->next
== -1)
1150 calc_next (dfa
, node
);
1151 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1154 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1157 if (node
->left
!= NULL
)
1159 if (node
->left
->first
== -1)
1160 calc_first (dfa
, node
->left
);
1161 left
= node
->left
->first
;
1165 if (node
->next
== -1)
1166 calc_next (dfa
, node
);
1169 if (node
->right
!= NULL
)
1171 if (node
->right
->first
== -1)
1172 calc_first (dfa
, node
->right
);
1173 right
= node
->right
->first
;
1177 if (node
->next
== -1)
1178 calc_next (dfa
, node
);
1181 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1183 else if (dfa
->nodes
[idx
].type
== ANCHOR
1184 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1185 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1186 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1187 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1191 /* Duplicate the epsilon closure of the node ROOT_NODE.
1192 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1193 to their own constraint. */
1195 static reg_errcode_t
1196 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1199 int top_org_node
, top_clone_node
, root_node
;
1200 unsigned int init_constraint
;
1203 int org_node
, clone_node
, ret
;
1204 unsigned int constraint
= init_constraint
;
1205 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1207 int org_dest
, clone_dest
;
1208 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1210 /* If the back reference epsilon-transit, its destination must
1211 also have the constraint. Then duplicate the epsilon closure
1212 of the destination of the back reference, and store it in
1213 edests of the back reference. */
1214 org_dest
= dfa
->nexts
[org_node
];
1215 re_node_set_empty (dfa
->edests
+ clone_node
);
1216 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1217 if (BE (err
!= REG_NOERROR
, 0))
1219 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1220 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1221 if (BE (ret
< 0, 0))
1224 else if (dfa
->edests
[org_node
].nelem
== 0)
1226 /* In case of the node can't epsilon-transit, don't duplicate the
1227 destination and store the original destination as the
1228 destination of the node. */
1229 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1232 else if (dfa
->edests
[org_node
].nelem
== 1)
1234 /* In case of the node can epsilon-transit, and it has only one
1236 org_dest
= dfa
->edests
[org_node
].elems
[0];
1237 re_node_set_empty (dfa
->edests
+ clone_node
);
1238 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1240 /* In case of the node has another constraint, append it. */
1241 if (org_node
== root_node
&& clone_node
!= org_node
)
1243 /* ...but if the node is root_node itself, it means the
1244 epsilon closure have a loop, then tie it to the
1245 destination of the root_node. */
1246 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1248 if (BE (ret
< 0, 0))
1252 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1254 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1255 if (BE (err
!= REG_NOERROR
, 0))
1257 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1258 if (BE (ret
< 0, 0))
1261 else /* dfa->edests[org_node].nelem == 2 */
1263 /* In case of the node can epsilon-transit, and it has two
1265 org_dest
= dfa
->edests
[org_node
].elems
[0];
1266 re_node_set_empty (dfa
->edests
+ clone_node
);
1267 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1268 if (BE (err
!= REG_NOERROR
, 0))
1270 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1271 if (BE (ret
< 0, 0))
1274 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
, root_node
,
1276 if (BE (err
!= REG_NOERROR
, 0))
1279 org_dest
= dfa
->edests
[org_node
].elems
[1];
1280 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1281 if (BE (err
!= REG_NOERROR
, 0))
1283 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1284 if (BE (ret
< 0, 0))
1287 org_node
= org_dest
;
1288 clone_node
= clone_dest
;
1293 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1294 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1295 otherwise return the error code. */
1297 static reg_errcode_t
1298 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1300 int *new_idx
, org_idx
;
1301 unsigned int constraint
;
1306 dup
= dfa
->nodes
[org_idx
];
1307 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1308 if (BE (dup_idx
== -1, 0))
1310 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1311 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1312 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1313 dfa
->nodes
[dup_idx
].duplicated
= 1;
1314 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1315 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1316 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1323 calc_inveclosure (dfa
)
1327 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1329 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1331 dest
= dfa
->eclosures
[src
].elems
[idx
];
1332 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1337 /* Calculate "eclosure" for all the node in DFA. */
1339 static reg_errcode_t
1343 int node_idx
, incomplete
;
1345 assert (dfa
->nodes_len
> 0);
1348 /* For each nodes, calculate epsilon closure. */
1349 for (node_idx
= 0; ; ++node_idx
)
1352 re_node_set eclosure_elem
;
1353 if (node_idx
== dfa
->nodes_len
)
1362 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1364 /* If we have already calculated, skip it. */
1365 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1367 /* Calculate epsilon closure of `node_idx'. */
1368 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1369 if (BE (err
!= REG_NOERROR
, 0))
1372 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1375 re_node_set_free (&eclosure_elem
);
1381 /* Calculate epsilon closure of NODE. */
1383 static reg_errcode_t
1384 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1385 re_node_set
*new_set
;
1390 unsigned int constraint
;
1392 re_node_set eclosure
;
1394 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1395 if (BE (err
!= REG_NOERROR
, 0))
1398 /* This indicates that we are calculating this node now.
1399 We reference this value to avoid infinite loop. */
1400 dfa
->eclosures
[node
].nelem
= -1;
1402 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1403 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1404 /* If the current node has constraints, duplicate all nodes.
1405 Since they must inherit the constraints. */
1406 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1408 int org_node
, cur_node
;
1409 org_node
= cur_node
= node
;
1410 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1411 if (BE (err
!= REG_NOERROR
, 0))
1415 /* Expand each epsilon destination nodes. */
1416 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1417 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1419 re_node_set eclosure_elem
;
1420 int edest
= dfa
->edests
[node
].elems
[i
];
1421 /* If calculating the epsilon closure of `edest' is in progress,
1422 return intermediate result. */
1423 if (dfa
->eclosures
[edest
].nelem
== -1)
1428 /* If we haven't calculated the epsilon closure of `edest' yet,
1429 calculate now. Otherwise use calculated epsilon closure. */
1430 if (dfa
->eclosures
[edest
].nelem
== 0)
1432 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1433 if (BE (err
!= REG_NOERROR
, 0))
1437 eclosure_elem
= dfa
->eclosures
[edest
];
1438 /* Merge the epsilon closure of `edest'. */
1439 re_node_set_merge (&eclosure
, &eclosure_elem
);
1440 /* If the epsilon closure of `edest' is incomplete,
1441 the epsilon closure of this node is also incomplete. */
1442 if (dfa
->eclosures
[edest
].nelem
== 0)
1445 re_node_set_free (&eclosure_elem
);
1449 /* Epsilon closures include itself. */
1450 re_node_set_insert (&eclosure
, node
);
1451 if (incomplete
&& !root
)
1452 dfa
->eclosures
[node
].nelem
= 0;
1454 dfa
->eclosures
[node
] = eclosure
;
1455 *new_set
= eclosure
;
1459 /* Functions for token which are used in the parser. */
1461 /* Fetch a token from INPUT.
1462 We must not use this function inside bracket expressions. */
1465 fetch_token (input
, syntax
)
1467 reg_syntax_t syntax
;
1471 consumed_byte
= peek_token (&token
, input
, syntax
);
1472 re_string_skip_bytes (input
, consumed_byte
);
1476 /* Peek a token from INPUT, and return the length of the token.
1477 We must not use this function inside bracket expressions. */
1480 peek_token (token
, input
, syntax
)
1483 reg_syntax_t syntax
;
1487 if (re_string_eoi (input
))
1489 token
->type
= END_OF_RE
;
1493 c
= re_string_peek_byte (input
, 0);
1496 #ifdef RE_ENABLE_I18N
1497 token
->mb_partial
= 0;
1498 if (MB_CUR_MAX
> 1 &&
1499 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1501 token
->type
= CHARACTER
;
1502 token
->mb_partial
= 1;
1509 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1511 token
->type
= BACK_SLASH
;
1515 c2
= re_string_peek_byte_case (input
, 1);
1517 token
->type
= CHARACTER
;
1521 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1522 token
->type
= OP_ALT
;
1524 case '1': case '2': case '3': case '4': case '5':
1525 case '6': case '7': case '8': case '9':
1526 if (!(syntax
& RE_NO_BK_REFS
))
1528 token
->type
= OP_BACK_REF
;
1529 token
->opr
.idx
= c2
- '0';
1533 if (!(syntax
& RE_NO_GNU_OPS
))
1535 token
->type
= ANCHOR
;
1536 token
->opr
.idx
= WORD_FIRST
;
1540 if (!(syntax
& RE_NO_GNU_OPS
))
1542 token
->type
= ANCHOR
;
1543 token
->opr
.idx
= WORD_LAST
;
1547 if (!(syntax
& RE_NO_GNU_OPS
))
1549 token
->type
= ANCHOR
;
1550 token
->opr
.idx
= WORD_DELIM
;
1554 if (!(syntax
& RE_NO_GNU_OPS
))
1556 token
->type
= ANCHOR
;
1557 token
->opr
.idx
= INSIDE_WORD
;
1561 if (!(syntax
& RE_NO_GNU_OPS
))
1562 token
->type
= OP_WORD
;
1565 if (!(syntax
& RE_NO_GNU_OPS
))
1566 token
->type
= OP_NOTWORD
;
1569 if (!(syntax
& RE_NO_GNU_OPS
))
1571 token
->type
= ANCHOR
;
1572 token
->opr
.idx
= BUF_FIRST
;
1576 if (!(syntax
& RE_NO_GNU_OPS
))
1578 token
->type
= ANCHOR
;
1579 token
->opr
.idx
= BUF_LAST
;
1583 if (!(syntax
& RE_NO_BK_PARENS
))
1584 token
->type
= OP_OPEN_SUBEXP
;
1587 if (!(syntax
& RE_NO_BK_PARENS
))
1588 token
->type
= OP_CLOSE_SUBEXP
;
1591 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1592 token
->type
= OP_DUP_PLUS
;
1595 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1596 token
->type
= OP_DUP_QUESTION
;
1599 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1600 token
->type
= OP_OPEN_DUP_NUM
;
1603 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1604 token
->type
= OP_CLOSE_DUP_NUM
;
1612 token
->type
= CHARACTER
;
1616 if (syntax
& RE_NEWLINE_ALT
)
1617 token
->type
= OP_ALT
;
1620 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1621 token
->type
= OP_ALT
;
1624 token
->type
= OP_DUP_ASTERISK
;
1627 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1628 token
->type
= OP_DUP_PLUS
;
1631 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1632 token
->type
= OP_DUP_QUESTION
;
1635 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1636 token
->type
= OP_OPEN_DUP_NUM
;
1639 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1640 token
->type
= OP_CLOSE_DUP_NUM
;
1643 if (syntax
& RE_NO_BK_PARENS
)
1644 token
->type
= OP_OPEN_SUBEXP
;
1647 if (syntax
& RE_NO_BK_PARENS
)
1648 token
->type
= OP_CLOSE_SUBEXP
;
1651 token
->type
= OP_OPEN_BRACKET
;
1654 token
->type
= OP_PERIOD
;
1657 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1658 re_string_cur_idx (input
) != 0)
1660 char prev
= re_string_peek_byte (input
, -1);
1661 if (prev
!= '|' && prev
!= '(' &&
1662 (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n'))
1665 token
->type
= ANCHOR
;
1666 token
->opr
.idx
= LINE_FIRST
;
1669 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1670 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1673 re_string_skip_bytes (input
, 1);
1674 peek_token (&next
, input
, syntax
);
1675 re_string_skip_bytes (input
, -1);
1676 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1679 token
->type
= ANCHOR
;
1680 token
->opr
.idx
= LINE_LAST
;
1688 /* Peek a token from INPUT, and return the length of the token.
1689 We must not use this function out of bracket expressions. */
1692 peek_token_bracket (token
, input
, syntax
)
1695 reg_syntax_t syntax
;
1698 if (re_string_eoi (input
))
1700 token
->type
= END_OF_RE
;
1703 c
= re_string_peek_byte (input
, 0);
1706 #ifdef RE_ENABLE_I18N
1707 if (MB_CUR_MAX
> 1 &&
1708 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1710 token
->type
= CHARACTER
;
1713 #endif /* RE_ENABLE_I18N */
1715 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1717 /* In this case, '\' escape a character. */
1719 re_string_skip_bytes (input
, 1);
1720 c2
= re_string_peek_byte (input
, 0);
1722 token
->type
= CHARACTER
;
1725 if (c
== '[') /* '[' is a special char in a bracket exps. */
1729 c2
= re_string_peek_byte (input
, 1);
1735 token
->type
= OP_OPEN_COLL_ELEM
;
1738 token
->type
= OP_OPEN_EQUIV_CLASS
;
1741 if (syntax
& RE_CHAR_CLASSES
)
1743 token
->type
= OP_OPEN_CHAR_CLASS
;
1746 /* else fall through. */
1748 token
->type
= CHARACTER
;
1758 token
->type
= OP_CHARSET_RANGE
;
1761 token
->type
= OP_CLOSE_BRACKET
;
1764 token
->type
= OP_NON_MATCH_LIST
;
1767 token
->type
= CHARACTER
;
1772 /* Functions for parser. */
1774 /* Entry point of the parser.
1775 Parse the regular expression REGEXP and return the structure tree.
1776 If an error is occured, ERR is set by error code, and return NULL.
1777 This function build the following tree, from regular expression <reg_exp>:
1783 CAT means concatenation.
1784 EOR means end of regular expression. */
1787 parse (regexp
, preg
, syntax
, err
)
1788 re_string_t
*regexp
;
1790 reg_syntax_t syntax
;
1793 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1794 bin_tree_t
*tree
, *eor
, *root
;
1795 re_token_t current_token
;
1797 current_token
= fetch_token (regexp
, syntax
);
1798 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1799 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1801 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1802 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1804 root
= create_tree (tree
, eor
, CONCAT
, 0);
1807 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1815 /* This function build the following tree, from regular expression
1816 <branch1>|<branch2>:
1822 ALT means alternative, which represents the operator `|'. */
1825 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1826 re_string_t
*regexp
;
1829 reg_syntax_t syntax
;
1833 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1834 bin_tree_t
*tree
, *branch
= NULL
;
1836 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1837 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1840 while (token
->type
== OP_ALT
)
1842 re_token_t alt_token
= *token
;
1843 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1844 *token
= fetch_token (regexp
, syntax
);
1845 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1846 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1848 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1849 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1851 free_bin_tree (tree
);
1857 tree
= create_tree (tree
, branch
, 0, new_idx
);
1858 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1863 dfa
->has_plural_match
= 1;
1868 /* This function build the following tree, from regular expression
1875 CAT means concatenation. */
1878 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1879 re_string_t
*regexp
;
1882 reg_syntax_t syntax
;
1886 bin_tree_t
*tree
, *exp
;
1887 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1888 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1891 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1892 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1894 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1895 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1897 free_bin_tree (tree
);
1900 if (tree
!= NULL
&& exp
!= NULL
)
1902 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1909 else if (tree
== NULL
)
1911 /* Otherwise exp == NULL, we don't need to create new tree. */
1916 /* This function build the following tree, from regular expression a*:
1923 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1924 re_string_t
*regexp
;
1927 reg_syntax_t syntax
;
1931 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1934 switch (token
->type
)
1937 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1938 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1939 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1944 #ifdef RE_ENABLE_I18N
1947 while (!re_string_eoi (regexp
)
1948 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1950 bin_tree_t
*mbc_remain
;
1951 *token
= fetch_token (regexp
, syntax
);
1952 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1953 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1954 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1955 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1956 return *err
= REG_ESPACE
, NULL
;
1961 case OP_OPEN_SUBEXP
:
1962 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
1963 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1966 case OP_OPEN_BRACKET
:
1967 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
1968 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1972 if (BE (preg
->re_nsub
< token
->opr
.idx
1973 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
1978 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1979 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1980 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1986 dfa
->has_mb_node
= 1;
1988 case OP_DUP_ASTERISK
:
1990 case OP_DUP_QUESTION
:
1991 case OP_OPEN_DUP_NUM
:
1992 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1997 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
1999 *token
= fetch_token (regexp
, syntax
);
2000 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2002 /* else fall through */
2003 case OP_CLOSE_SUBEXP
:
2004 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2005 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2010 /* else fall through */
2011 case OP_CLOSE_DUP_NUM
:
2012 /* We treat it as a normal character. */
2014 /* Then we can these characters as normal characters. */
2015 token
->type
= CHARACTER
;
2016 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2017 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2018 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2025 if (dfa
->word_char
== NULL
)
2027 *err
= init_word_char (dfa
);
2028 if (BE (*err
!= REG_NOERROR
, 0))
2031 if (token
->opr
.ctx_type
== WORD_DELIM
)
2033 bin_tree_t
*tree_first
, *tree_last
;
2034 int idx_first
, idx_last
;
2035 token
->opr
.ctx_type
= WORD_FIRST
;
2036 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2037 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2038 token
->opr
.ctx_type
= WORD_LAST
;
2039 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2040 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2041 token
->type
= OP_ALT
;
2042 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2043 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2044 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2045 || tree_first
== NULL
|| tree_last
== NULL
2046 || tree
== NULL
, 0))
2054 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2055 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2056 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2057 return *err
= REG_ESPACE
, NULL
;
2059 /* We must return here, since ANCHORs can't be followed
2060 by repetition operators.
2061 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2062 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2063 *token
= fetch_token (regexp
, syntax
);
2066 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2067 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2068 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2074 dfa
->has_mb_node
= 1;
2077 tree
= build_word_op (dfa
, 0, err
);
2078 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2082 tree
= build_word_op (dfa
, 1, err
);
2083 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2093 /* Must not happen? */
2099 *token
= fetch_token (regexp
, syntax
);
2101 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2102 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2104 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2105 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2107 dfa
->has_plural_match
= 1;
2113 /* This function build the following tree, from regular expression
2121 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2122 re_string_t
*regexp
;
2125 reg_syntax_t syntax
;
2129 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2130 bin_tree_t
*tree
, *left_par
, *right_par
;
2133 cur_nsub
= preg
->re_nsub
++;
2134 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2136 re_subexp_t
*new_array
;
2137 dfa
->subexps_alloc
*= 2;
2138 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2139 if (BE (new_array
== NULL
, 0))
2141 dfa
->subexps_alloc
/= 2;
2145 dfa
->subexps
= new_array
;
2147 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2148 dfa
->subexps
[cur_nsub
].end
= -1;
2150 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2151 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2152 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2157 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2158 *token
= fetch_token (regexp
, syntax
);
2160 /* The subexpression may be a null string. */
2161 if (token
->type
== OP_CLOSE_SUBEXP
)
2165 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2166 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2169 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2171 free_bin_tree (tree
);
2175 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2176 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2177 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2178 tree
= ((tree
== NULL
) ? right_par
2179 : create_tree (tree
, right_par
, CONCAT
, 0));
2180 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2181 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2186 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2191 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2194 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2195 bin_tree_t
*dup_elem
;
2196 re_string_t
*regexp
;
2199 reg_syntax_t syntax
;
2202 re_token_t dup_token
;
2203 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2204 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2205 re_token_t start_token
= *token
;
2206 if (token
->type
== OP_OPEN_DUP_NUM
)
2210 int start
= fetch_number (regexp
, token
, syntax
);
2214 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2215 start
= 0; /* We treat "{,m}" as "{0,m}". */
2218 *err
= REG_BADBR
; /* <re>{} is invalid. */
2222 if (BE (start
!= -2, 1))
2224 /* We treat "{n}" as "{n,n}". */
2225 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2226 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2227 ? fetch_number (regexp
, token
, syntax
) : -2));
2229 if (BE (start
== -2 || end
== -2, 0))
2231 /* Invalid sequence. */
2232 if (token
->type
== OP_CLOSE_DUP_NUM
)
2233 goto parse_dup_op_invalid_interval
;
2235 goto parse_dup_op_ebrace
;
2237 if (BE (start
== 0 && end
== 0, 0))
2239 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2240 *token
= fetch_token (regexp
, syntax
);
2241 free_bin_tree (dup_elem
);
2245 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2247 for (i
= 0; i
< start
; ++i
)
2250 work_tree
= duplicate_tree (elem
, dfa
);
2251 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2252 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2253 goto parse_dup_op_espace
;
2258 /* We treat "<re>{0,}" as "<re>*". */
2259 dup_token
.type
= OP_DUP_ASTERISK
;
2262 elem
= duplicate_tree (elem
, dfa
);
2263 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2264 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2265 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2266 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2267 || tree
== NULL
, 0))
2268 goto parse_dup_op_espace
;
2272 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2273 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2274 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2275 goto parse_dup_op_espace
;
2278 else if (end
- start
> 0)
2280 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2281 dup_token
.type
= OP_DUP_QUESTION
;
2284 elem
= duplicate_tree (elem
, dfa
);
2285 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2286 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2287 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2288 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2289 goto parse_dup_op_espace
;
2293 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2294 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2295 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2296 goto parse_dup_op_espace
;
2298 for (i
= 1; i
< end
- start
; ++i
)
2300 work_tree
= duplicate_tree (elem
, dfa
);
2301 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2302 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2312 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2313 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2314 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2320 *token
= fetch_token (regexp
, syntax
);
2323 parse_dup_op_espace
:
2324 free_bin_tree (tree
);
2328 parse_dup_op_ebrace
:
2329 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2334 goto parse_dup_op_rollback
;
2335 parse_dup_op_invalid_interval
:
2336 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2341 parse_dup_op_rollback
:
2342 re_string_set_index (regexp
, start_idx
);
2343 *token
= start_token
;
2344 token
->type
= CHARACTER
;
2348 /* Size of the names for collating symbol/equivalence_class/character_class.
2349 I'm not sure, but maybe enough. */
2350 #define BRACKET_NAME_BUF_SIZE 32
2353 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2354 Build the range expression which starts from START_ELEM, and ends
2355 at END_ELEM. The result are written to MBCSET and SBCSET.
2356 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2357 mbcset->range_ends, is a pointer argument sinse we may
2360 static reg_errcode_t
2361 # ifdef RE_ENABLE_I18N
2362 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2363 re_charset_t
*mbcset
;
2365 # else /* not RE_ENABLE_I18N */
2366 build_range_exp (sbcset
, start_elem
, end_elem
)
2367 # endif /* not RE_ENABLE_I18N */
2368 re_bitset_ptr_t sbcset
;
2369 bracket_elem_t
*start_elem
, *end_elem
;
2371 unsigned int start_ch
, end_ch
;
2372 /* Equivalence Classes and Character Classes can't be a range start/end. */
2373 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2374 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2378 /* We can handle no multi character collating elements without libc
2380 if (BE ((start_elem
->type
== COLL_SYM
2381 && strlen ((char *) start_elem
->opr
.name
) > 1)
2382 || (end_elem
->type
== COLL_SYM
2383 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2384 return REG_ECOLLATE
;
2386 # ifdef RE_ENABLE_I18N
2388 wchar_t wc
, start_wc
, end_wc
;
2389 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2391 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2392 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2394 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2395 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2397 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2398 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2399 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2400 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2401 cmp_buf
[0] = start_wc
;
2402 cmp_buf
[4] = end_wc
;
2403 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2406 /* Check the space of the arrays. */
2407 if (*range_alloc
== mbcset
->nranges
)
2409 /* There are not enough space, need realloc. */
2410 wchar_t *new_array_start
, *new_array_end
;
2413 /* +1 in case of mbcset->nranges is 0. */
2414 new_nranges
= 2 * mbcset
->nranges
+ 1;
2415 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2416 are NULL if *range_alloc == 0. */
2417 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2419 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2422 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2425 mbcset
->range_starts
= new_array_start
;
2426 mbcset
->range_ends
= new_array_end
;
2427 *range_alloc
= new_nranges
;
2430 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2431 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2433 /* Build the table for single byte characters. */
2434 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2437 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2438 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2439 bitset_set (sbcset
, wc
);
2442 # else /* not RE_ENABLE_I18N */
2445 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2446 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2448 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2449 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2451 if (start_ch
> end_ch
)
2453 /* Build the table for single byte characters. */
2454 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2455 if (start_ch
<= ch
&& ch
<= end_ch
)
2456 bitset_set (sbcset
, ch
);
2458 # endif /* not RE_ENABLE_I18N */
2461 #endif /* not _LIBC */
2464 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2465 Build the collating element which is represented by NAME.
2466 The result are written to MBCSET and SBCSET.
2467 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2468 pointer argument since we may update it. */
2470 static reg_errcode_t
2471 # ifdef RE_ENABLE_I18N
2472 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2473 re_charset_t
*mbcset
;
2474 int *coll_sym_alloc
;
2475 # else /* not RE_ENABLE_I18N */
2476 build_collating_symbol (sbcset
, name
)
2477 # endif /* not RE_ENABLE_I18N */
2478 re_bitset_ptr_t sbcset
;
2479 const unsigned char *name
;
2481 size_t name_len
= strlen ((const char *) name
);
2482 if (BE (name_len
!= 1, 0))
2483 return REG_ECOLLATE
;
2486 bitset_set (sbcset
, name
[0]);
2490 #endif /* not _LIBC */
2492 /* This function parse bracket expression like "[abc]", "[a-c]",
2496 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2497 re_string_t
*regexp
;
2500 reg_syntax_t syntax
;
2504 const unsigned char *collseqmb
;
2505 const char *collseqwc
;
2508 const int32_t *symb_table
;
2509 const unsigned char *extra
;
2511 /* Local function for parse_bracket_exp used in _LIBC environement.
2512 Seek the collating symbol entry correspondings to NAME.
2513 Return the index of the symbol in the SYMB_TABLE. */
2515 static inline int32_t
2516 seek_collating_symbol_entry (name
, name_len
)
2517 const unsigned char *name
;
2520 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2521 int32_t elem
= hash
% table_size
;
2522 int32_t second
= hash
% (table_size
- 2);
2523 while (symb_table
[2 * elem
] != 0)
2525 /* First compare the hashing value. */
2526 if (symb_table
[2 * elem
] == hash
2527 /* Compare the length of the name. */
2528 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2529 /* Compare the name. */
2530 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2533 /* Yep, this is the entry. */
2543 /* Local function for parse_bracket_exp used in _LIBC environement.
2544 Look up the collation sequence value of BR_ELEM.
2545 Return the value if succeeded, UINT_MAX otherwise. */
2547 static inline unsigned int
2548 lookup_collation_sequence_value (br_elem
)
2549 bracket_elem_t
*br_elem
;
2551 if (br_elem
->type
== SB_CHAR
)
2554 if (MB_CUR_MAX == 1)
2557 return collseqmb
[br_elem
->opr
.ch
];
2560 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2561 return collseq_table_lookup (collseqwc
, wc
);
2564 else if (br_elem
->type
== MB_CHAR
)
2566 return collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2568 else if (br_elem
->type
== COLL_SYM
)
2570 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2574 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2576 if (symb_table
[2 * elem
] != 0)
2578 /* We found the entry. */
2579 idx
= symb_table
[2 * elem
+ 1];
2580 /* Skip the name of collating element name. */
2581 idx
+= 1 + extra
[idx
];
2582 /* Skip the byte sequence of the collating element. */
2583 idx
+= 1 + extra
[idx
];
2584 /* Adjust for the alignment. */
2585 idx
= (idx
+ 3) & ~3;
2586 /* Skip the multibyte collation sequence value. */
2587 idx
+= sizeof (unsigned int);
2588 /* Skip the wide char sequence of the collating element. */
2589 idx
+= sizeof (unsigned int) *
2590 (1 + *(unsigned int *) (extra
+ idx
));
2591 /* Return the collation sequence value. */
2592 return *(unsigned int *) (extra
+ idx
);
2594 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2596 /* No valid character. Match it as a single byte
2598 return collseqmb
[br_elem
->opr
.name
[0]];
2601 else if (sym_name_len
== 1)
2602 return collseqmb
[br_elem
->opr
.name
[0]];
2607 /* Local function for parse_bracket_exp used in _LIBC environement.
2608 Build the range expression which starts from START_ELEM, and ends
2609 at END_ELEM. The result are written to MBCSET and SBCSET.
2610 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2611 mbcset->range_ends, is a pointer argument sinse we may
2614 static inline reg_errcode_t
2615 # ifdef RE_ENABLE_I18N
2616 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2617 re_charset_t
*mbcset
;
2619 # else /* not RE_ENABLE_I18N */
2620 build_range_exp (sbcset
, start_elem
, end_elem
)
2621 # endif /* not RE_ENABLE_I18N */
2622 re_bitset_ptr_t sbcset
;
2623 bracket_elem_t
*start_elem
, *end_elem
;
2626 uint32_t start_collseq
;
2627 uint32_t end_collseq
;
2629 # ifdef RE_ENABLE_I18N
2630 /* Check the space of the arrays. */
2631 if (*range_alloc
== mbcset
->nranges
)
2633 /* There are not enough space, need realloc. */
2634 uint32_t *new_array_start
;
2635 uint32_t *new_array_end
;
2638 /* +1 in case of mbcset->nranges is 0. */
2639 new_nranges
= 2 * mbcset
->nranges
+ 1;
2640 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641 are NULL if *range_alloc == 0. */
2642 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2644 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2647 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2650 mbcset
->range_starts
= new_array_start
;
2651 mbcset
->range_ends
= new_array_end
;
2652 *range_alloc
= new_nranges
;
2654 # endif /* RE_ENABLE_I18N */
2656 /* Equivalence Classes and Character Classes can't be a range
2658 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2659 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2663 start_collseq
= lookup_collation_sequence_value (start_elem
);
2664 end_collseq
= lookup_collation_sequence_value (end_elem
);
2665 /* Check start/end collation sequence values. */
2666 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2667 return REG_ECOLLATE
;
2668 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2671 # ifdef RE_ENABLE_I18N
2672 /* Got valid collation sequence values, add them as a new entry. */
2673 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2674 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2675 # endif /* RE_ENABLE_I18N */
2677 /* Build the table for single byte characters. */
2678 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2680 uint32_t ch_collseq
;
2682 if (MB_CUR_MAX == 1)
2685 ch_collseq
= collseqmb
[ch
];
2687 ch_collseq
= collseq_table_lookup (collseqwc
, __btowc (ch
));
2688 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2689 bitset_set (sbcset
, ch
);
2694 /* Local function for parse_bracket_exp used in _LIBC environement.
2695 Build the collating element which is represented by NAME.
2696 The result are written to MBCSET and SBCSET.
2697 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698 pointer argument sinse we may update it. */
2700 static inline reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2703 re_charset_t
*mbcset
;
2704 int *coll_sym_alloc
;
2705 # else /* not RE_ENABLE_I18N */
2706 build_collating_symbol (sbcset
, name
)
2707 # endif /* not RE_ENABLE_I18N */
2708 re_bitset_ptr_t sbcset
;
2709 const unsigned char *name
;
2712 size_t name_len
= strlen ((const char *) name
);
2715 elem
= seek_collating_symbol_entry (name
, name_len
);
2716 if (symb_table
[2 * elem
] != 0)
2718 /* We found the entry. */
2719 idx
= symb_table
[2 * elem
+ 1];
2720 /* Skip the name of collating element name. */
2721 idx
+= 1 + extra
[idx
];
2723 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2725 /* No valid character, treat it as a normal
2727 bitset_set (sbcset
, name
[0]);
2731 return REG_ECOLLATE
;
2733 # ifdef RE_ENABLE_I18N
2734 /* Got valid collation sequence, add it as a new entry. */
2735 /* Check the space of the arrays. */
2736 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2738 /* Not enough, realloc it. */
2739 /* +1 in case of mbcset->ncoll_syms is 0. */
2740 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2741 /* Use realloc since mbcset->coll_syms is NULL
2743 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2745 if (BE (mbcset
->coll_syms
== NULL
, 0))
2748 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2749 # endif /* RE_ENABLE_I18N */
2754 if (BE (name_len
!= 1, 0))
2755 return REG_ECOLLATE
;
2758 bitset_set (sbcset
, name
[0]);
2765 re_token_t br_token
;
2766 re_bitset_ptr_t sbcset
;
2767 #ifdef RE_ENABLE_I18N
2768 re_charset_t
*mbcset
;
2769 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2770 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2771 #else /* not RE_ENABLE_I18N */
2773 #endif /* not RE_ENABLE_I18N */
2774 bin_tree_t
*work_tree
;
2775 int token_len
, new_idx
;
2777 collseqmb
= (const unsigned char *)
2778 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2779 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2785 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2786 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2787 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2788 _NL_COLLATE_SYMB_TABLEMB
);
2789 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2790 _NL_COLLATE_SYMB_EXTRAMB
);
2793 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2794 #ifdef RE_ENABLE_I18N
2795 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2796 #endif /* RE_ENABLE_I18N */
2797 #ifdef RE_ENABLE_I18N
2798 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2800 if (BE (sbcset
== NULL
, 0))
2801 #endif /* RE_ENABLE_I18N */
2807 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2808 if (BE (token
->type
== END_OF_RE
, 0))
2811 goto parse_bracket_exp_free_return
;
2813 if (token
->type
== OP_NON_MATCH_LIST
)
2815 #ifdef RE_ENABLE_I18N
2817 mbcset
->non_match
= 1;
2818 #else /* not RE_ENABLE_I18N */
2820 #endif /* not RE_ENABLE_I18N */
2821 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2822 bitset_set (sbcset
, '\0');
2823 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2824 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2825 if (BE (token
->type
== END_OF_RE
, 0))
2828 goto parse_bracket_exp_free_return
;
2830 #ifdef RE_ENABLE_I18N
2832 for (i
= 0; i
< SBC_MAX
; ++i
)
2833 if (__btowc (i
) == WEOF
)
2834 bitset_set (sbcset
, i
);
2835 #endif /* RE_ENABLE_I18N */
2838 /* We treat the first ']' as a normal character. */
2839 if (token
->type
== OP_CLOSE_BRACKET
)
2840 token
->type
= CHARACTER
;
2844 bracket_elem_t start_elem
, end_elem
;
2845 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2846 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2848 int token_len2
= 0, is_range_exp
= 0;
2851 start_elem
.opr
.name
= start_name_buf
;
2852 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2854 if (BE (ret
!= REG_NOERROR
, 0))
2857 goto parse_bracket_exp_free_return
;
2860 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2861 if (BE (token
->type
== END_OF_RE
, 0))
2864 goto parse_bracket_exp_free_return
;
2866 if (token
->type
== OP_CHARSET_RANGE
)
2868 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2869 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2870 if (BE (token
->type
== END_OF_RE
, 0))
2873 goto parse_bracket_exp_free_return
;
2875 if (token2
.type
== OP_CLOSE_BRACKET
)
2877 /* We treat the last '-' as a normal character. */
2878 re_string_skip_bytes (regexp
, -token_len
);
2879 token
->type
= CHARACTER
;
2885 if (is_range_exp
== 1)
2887 end_elem
.opr
.name
= end_name_buf
;
2888 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2890 if (BE (ret
!= REG_NOERROR
, 0))
2893 goto parse_bracket_exp_free_return
;
2896 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2897 if (BE (token
->type
== END_OF_RE
, 0))
2900 goto parse_bracket_exp_free_return
;
2902 *err
= build_range_exp (sbcset
,
2903 #ifdef RE_ENABLE_I18N
2904 mbcset
, &range_alloc
,
2905 #endif /* RE_ENABLE_I18N */
2906 &start_elem
, &end_elem
);
2907 if (BE (*err
!= REG_NOERROR
, 0))
2908 goto parse_bracket_exp_free_return
;
2912 switch (start_elem
.type
)
2915 bitset_set (sbcset
, start_elem
.opr
.ch
);
2917 #ifdef RE_ENABLE_I18N
2919 /* Check whether the array has enough space. */
2920 if (mbchar_alloc
== mbcset
->nmbchars
)
2922 /* Not enough, realloc it. */
2923 /* +1 in case of mbcset->nmbchars is 0. */
2924 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2925 /* Use realloc since array is NULL if *alloc == 0. */
2926 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2928 if (BE (mbcset
->mbchars
== NULL
, 0))
2929 goto parse_bracket_exp_espace
;
2931 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2933 #endif /* RE_ENABLE_I18N */
2935 *err
= build_equiv_class (sbcset
,
2936 #ifdef RE_ENABLE_I18N
2937 mbcset
, &equiv_class_alloc
,
2938 #endif /* RE_ENABLE_I18N */
2939 start_elem
.opr
.name
);
2940 if (BE (*err
!= REG_NOERROR
, 0))
2941 goto parse_bracket_exp_free_return
;
2944 *err
= build_collating_symbol (sbcset
,
2945 #ifdef RE_ENABLE_I18N
2946 mbcset
, &coll_sym_alloc
,
2947 #endif /* RE_ENABLE_I18N */
2948 start_elem
.opr
.name
);
2949 if (BE (*err
!= REG_NOERROR
, 0))
2950 goto parse_bracket_exp_free_return
;
2953 ret
= build_charclass (sbcset
,
2954 #ifdef RE_ENABLE_I18N
2955 mbcset
, &char_class_alloc
,
2956 #endif /* RE_ENABLE_I18N */
2957 start_elem
.opr
.name
, syntax
);
2958 if (BE (ret
!= REG_NOERROR
, 0))
2959 goto parse_bracket_exp_espace
;
2966 if (token
->type
== OP_CLOSE_BRACKET
)
2970 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2972 /* If it is non-matching list. */
2973 #ifdef RE_ENABLE_I18N
2974 if (mbcset
->non_match
)
2975 #else /* not RE_ENABLE_I18N */
2977 #endif /* not RE_ENABLE_I18N */
2978 bitset_not (sbcset
);
2980 /* Build a tree for simple bracket. */
2981 br_token
.type
= SIMPLE_BRACKET
;
2982 br_token
.opr
.sbcset
= sbcset
;
2983 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2984 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2985 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
2986 goto parse_bracket_exp_espace
;
2988 #ifdef RE_ENABLE_I18N
2989 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
2990 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
2991 || mbcset
->non_match
)))
2993 re_token_t alt_token
;
2994 bin_tree_t
*mbc_tree
;
2995 /* Build a tree for complex bracket. */
2996 br_token
.type
= COMPLEX_BRACKET
;
2997 br_token
.opr
.mbcset
= mbcset
;
2998 dfa
->has_mb_node
= 1;
2999 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3000 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3001 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3002 goto parse_bracket_exp_espace
;
3003 /* Then join them by ALT node. */
3004 dfa
->has_plural_match
= 1;
3005 alt_token
.type
= OP_ALT
;
3006 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3007 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
3008 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3013 free_charset (mbcset
);
3016 #else /* not RE_ENABLE_I18N */
3018 #endif /* not RE_ENABLE_I18N */
3020 parse_bracket_exp_espace
:
3022 parse_bracket_exp_free_return
:
3024 #ifdef RE_ENABLE_I18N
3025 free_charset (mbcset
);
3026 #endif /* RE_ENABLE_I18N */
3030 /* Parse an element in the bracket expression. */
3032 static reg_errcode_t
3033 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
3034 bracket_elem_t
*elem
;
3035 re_string_t
*regexp
;
3039 reg_syntax_t syntax
;
3041 #ifdef RE_ENABLE_I18N
3043 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3044 if (cur_char_size
> 1)
3046 elem
->type
= MB_CHAR
;
3047 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3048 re_string_skip_bytes (regexp
, cur_char_size
);
3051 #endif /* RE_ENABLE_I18N */
3052 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3053 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3054 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3055 return parse_bracket_symbol (elem
, regexp
, token
);
3056 elem
->type
= SB_CHAR
;
3057 elem
->opr
.ch
= token
->opr
.c
;
3061 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3062 such as [:<character_class>:], [.<collating_element>.], and
3063 [=<equivalent_class>=]. */
3065 static reg_errcode_t
3066 parse_bracket_symbol (elem
, regexp
, token
)
3067 bracket_elem_t
*elem
;
3068 re_string_t
*regexp
;
3071 unsigned char ch
, delim
= token
->opr
.c
;
3075 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3077 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3078 ch
= re_string_fetch_byte_case (regexp
);
3080 ch
= re_string_fetch_byte (regexp
);
3081 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3083 elem
->opr
.name
[i
] = ch
;
3085 re_string_skip_bytes (regexp
, 1);
3086 elem
->opr
.name
[i
] = '\0';
3087 switch (token
->type
)
3089 case OP_OPEN_COLL_ELEM
:
3090 elem
->type
= COLL_SYM
;
3092 case OP_OPEN_EQUIV_CLASS
:
3093 elem
->type
= EQUIV_CLASS
;
3095 case OP_OPEN_CHAR_CLASS
:
3096 elem
->type
= CHAR_CLASS
;
3104 /* Helper function for parse_bracket_exp.
3105 Build the equivalence class which is represented by NAME.
3106 The result are written to MBCSET and SBCSET.
3107 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3108 is a pointer argument sinse we may update it. */
3110 static reg_errcode_t
3111 #ifdef RE_ENABLE_I18N
3112 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3113 re_charset_t
*mbcset
;
3114 int *equiv_class_alloc
;
3115 #else /* not RE_ENABLE_I18N */
3116 build_equiv_class (sbcset
, name
)
3117 #endif /* not RE_ENABLE_I18N */
3118 re_bitset_ptr_t sbcset
;
3119 const unsigned char *name
;
3121 #if defined _LIBC && defined RE_ENABLE_I18N
3122 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3125 const int32_t *table
, *indirect
;
3126 const unsigned char *weights
, *extra
, *cp
;
3127 unsigned char char_buf
[2];
3131 /* This #include defines a local function! */
3132 # include <locale/weight.h>
3133 /* Calculate the index for equivalence class. */
3135 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3136 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3137 _NL_COLLATE_WEIGHTMB
);
3138 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3139 _NL_COLLATE_EXTRAMB
);
3140 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3141 _NL_COLLATE_INDIRECTMB
);
3142 idx1
= findidx (&cp
);
3143 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3144 /* This isn't a valid character. */
3145 return REG_ECOLLATE
;
3147 /* Build single byte matcing table for this equivalence class. */
3148 char_buf
[1] = (unsigned char) '\0';
3149 len
= weights
[idx1
];
3150 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3154 idx2
= findidx (&cp
);
3159 /* This isn't a valid character. */
3161 if (len
== weights
[idx2
])
3164 while (cnt
<= len
&&
3165 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3169 bitset_set (sbcset
, ch
);
3172 /* Check whether the array has enough space. */
3173 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3175 /* Not enough, realloc it. */
3176 /* +1 in case of mbcset->nequiv_classes is 0. */
3177 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3178 /* Use realloc since the array is NULL if *alloc == 0. */
3179 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3180 *equiv_class_alloc
);
3181 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3184 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3187 #endif /* _LIBC && RE_ENABLE_I18N */
3189 if (BE (strlen ((const char *) name
) != 1, 0))
3190 return REG_ECOLLATE
;
3191 bitset_set (sbcset
, *name
);
3196 /* Helper function for parse_bracket_exp.
3197 Build the character class which is represented by NAME.
3198 The result are written to MBCSET and SBCSET.
3199 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3200 is a pointer argument sinse we may update it. */
3202 static reg_errcode_t
3203 #ifdef RE_ENABLE_I18N
3204 build_charclass (sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3205 re_charset_t
*mbcset
;
3206 int *char_class_alloc
;
3207 #else /* not RE_ENABLE_I18N */
3208 build_charclass (sbcset
, class_name
, syntax
)
3209 #endif /* not RE_ENABLE_I18N */
3210 re_bitset_ptr_t sbcset
;
3211 const unsigned char *class_name
;
3212 reg_syntax_t syntax
;
3215 const char *name
= (const char *) class_name
;
3217 /* In case of REG_ICASE "upper" and "lower" match the both of
3218 upper and lower cases. */
3219 if ((syntax
& RE_ICASE
)
3220 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3223 #ifdef RE_ENABLE_I18N
3224 /* Check the space of the arrays. */
3225 if (*char_class_alloc
== mbcset
->nchar_classes
)
3227 /* Not enough, realloc it. */
3228 /* +1 in case of mbcset->nchar_classes is 0. */
3229 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3230 /* Use realloc since array is NULL if *alloc == 0. */
3231 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3233 if (BE (mbcset
->char_classes
== NULL
, 0))
3236 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3237 #endif /* RE_ENABLE_I18N */
3239 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3240 for (i = 0; i < SBC_MAX; ++i) \
3242 if (ctype_func (i)) \
3243 bitset_set (sbcset, i); \
3246 if (strcmp (name
, "alnum") == 0)
3247 BUILD_CHARCLASS_LOOP (isalnum
)
3248 else if (strcmp (name
, "cntrl") == 0)
3249 BUILD_CHARCLASS_LOOP (iscntrl
)
3250 else if (strcmp (name
, "lower") == 0)
3251 BUILD_CHARCLASS_LOOP (islower
)
3252 else if (strcmp (name
, "space") == 0)
3253 BUILD_CHARCLASS_LOOP (isspace
)
3254 else if (strcmp (name
, "alpha") == 0)
3255 BUILD_CHARCLASS_LOOP (isalpha
)
3256 else if (strcmp (name
, "digit") == 0)
3257 BUILD_CHARCLASS_LOOP (isdigit
)
3258 else if (strcmp (name
, "print") == 0)
3259 BUILD_CHARCLASS_LOOP (isprint
)
3260 else if (strcmp (name
, "upper") == 0)
3261 BUILD_CHARCLASS_LOOP (isupper
)
3262 else if (strcmp (name
, "blank") == 0)
3263 BUILD_CHARCLASS_LOOP (isblank
)
3264 else if (strcmp (name
, "graph") == 0)
3265 BUILD_CHARCLASS_LOOP (isgraph
)
3266 else if (strcmp (name
, "punct") == 0)
3267 BUILD_CHARCLASS_LOOP (ispunct
)
3268 else if (strcmp (name
, "xdigit") == 0)
3269 BUILD_CHARCLASS_LOOP (isxdigit
)
3277 build_word_op (dfa
, not, err
)
3282 re_bitset_ptr_t sbcset
;
3283 #ifdef RE_ENABLE_I18N
3284 re_charset_t
*mbcset
;
3286 #else /* not RE_ENABLE_I18N */
3288 #endif /* not RE_ENABLE_I18N */
3290 re_token_t br_token
;
3294 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3295 #ifdef RE_ENABLE_I18N
3296 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3297 #endif /* RE_ENABLE_I18N */
3299 #ifdef RE_ENABLE_I18N
3300 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3301 #else /* not RE_ENABLE_I18N */
3302 if (BE (sbcset
== NULL
, 0))
3303 #endif /* not RE_ENABLE_I18N */
3311 #ifdef RE_ENABLE_I18N
3314 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3315 bitset_set(cset->sbcset, '\0');
3317 mbcset
->non_match
= 1;
3319 for (i
= 0; i
< SBC_MAX
; ++i
)
3320 if (__btowc (i
) == WEOF
)
3321 bitset_set (sbcset
, i
);
3322 #else /* not RE_ENABLE_I18N */
3324 #endif /* not RE_ENABLE_I18N */
3327 /* We don't care the syntax in this case. */
3328 ret
= build_charclass (sbcset
,
3329 #ifdef RE_ENABLE_I18N
3331 #endif /* RE_ENABLE_I18N */
3332 (const unsigned char *) "alpha", 0);
3334 if (BE (ret
!= REG_NOERROR
, 0))
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset
);
3339 #endif /* RE_ENABLE_I18N */
3343 /* \w match '_' also. */
3344 bitset_set (sbcset
, '_');
3346 /* If it is non-matching list. */
3347 #ifdef RE_ENABLE_I18N
3348 if (mbcset
->non_match
)
3349 #else /* not RE_ENABLE_I18N */
3351 #endif /* not RE_ENABLE_I18N */
3352 bitset_not (sbcset
);
3354 /* Build a tree for simple bracket. */
3355 br_token
.type
= SIMPLE_BRACKET
;
3356 br_token
.opr
.sbcset
= sbcset
;
3357 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3358 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3359 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3360 goto build_word_op_espace
;
3362 #ifdef RE_ENABLE_I18N
3365 re_token_t alt_token
;
3366 bin_tree_t
*mbc_tree
;
3367 /* Build a tree for complex bracket. */
3368 br_token
.type
= COMPLEX_BRACKET
;
3369 br_token
.opr
.mbcset
= mbcset
;
3370 dfa
->has_mb_node
= 1;
3371 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3372 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3373 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3374 goto build_word_op_espace
;
3375 /* Then join them by ALT node. */
3376 alt_token
.type
= OP_ALT
;
3377 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3378 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3379 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3384 free_charset (mbcset
);
3387 #else /* not RE_ENABLE_I18N */
3389 #endif /* not RE_ENABLE_I18N */
3391 build_word_op_espace
:
3393 #ifdef RE_ENABLE_I18N
3394 free_charset (mbcset
);
3395 #endif /* RE_ENABLE_I18N */
3400 /* This is intended for the expressions like "a{1,3}".
3401 Fetch a number from `input', and return the number.
3402 Return -1, if the number field is empty like "{,1}".
3403 Return -2, If an error is occured. */
3406 fetch_number (input
, token
, syntax
)
3409 reg_syntax_t syntax
;
3415 *token
= fetch_token (input
, syntax
);
3417 if (BE (token
->type
== END_OF_RE
, 0))
3419 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3421 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3422 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3423 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3428 #ifdef RE_ENABLE_I18N
3430 free_charset (re_charset_t
*cset
)
3432 re_free (cset
->mbchars
);
3434 re_free (cset
->coll_syms
);
3435 re_free (cset
->equiv_classes
);
3436 re_free (cset
->range_starts
);
3437 re_free (cset
->range_ends
);
3439 re_free (cset
->char_classes
);
3442 #endif /* RE_ENABLE_I18N */
3444 /* Functions for binary tree operation. */
3446 /* Create a node of tree.
3447 Note: This function automatically free left and right if malloc fails. */
3450 create_tree (left
, right
, type
, index
)
3453 re_token_type_t type
;
3457 tree
= re_malloc (bin_tree_t
, 1);
3458 if (BE (tree
== NULL
, 0))
3460 free_bin_tree (left
);
3461 free_bin_tree (right
);
3464 tree
->parent
= NULL
;
3466 tree
->right
= right
;
3468 tree
->node_idx
= index
;
3471 re_node_set_init_empty (&tree
->eclosure
);
3474 left
->parent
= tree
;
3476 right
->parent
= tree
;
3480 /* Free the sub tree pointed by TREE. */
3483 free_bin_tree (tree
)
3488 /*re_node_set_free (&tree->eclosure);*/
3489 free_bin_tree (tree
->left
);
3490 free_bin_tree (tree
->right
);
3494 /* Duplicate the node SRC, and return new node. */
3497 duplicate_tree (src
, dfa
)
3498 const bin_tree_t
*src
;
3501 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3503 /* Since node indies must be according to Post-order of the tree,
3504 we must duplicate the left at first. */
3505 if (src
->left
!= NULL
)
3507 left
= duplicate_tree (src
->left
, dfa
);
3512 /* Secondaly, duplicate the right. */
3513 if (src
->right
!= NULL
)
3515 right
= duplicate_tree (src
->right
, dfa
);
3518 free_bin_tree (left
);
3523 /* At last, duplicate itself. */
3524 if (src
->type
== NON_TYPE
)
3526 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3527 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3528 if (BE (new_node_idx
== -1, 0))
3530 free_bin_tree (left
);
3531 free_bin_tree (right
);
3536 new_node_idx
= src
->type
;
3538 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3539 if (BE (new_tree
== NULL
, 0))
3541 free_bin_tree (left
);
3542 free_bin_tree (right
);