1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
72 int length
, reg_syntax_t syntax
);
73 static void re_compile_fastmap_iter (regex_t
*bufp
,
74 const re_dfastate_t
*init_state
,
76 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
77 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
79 static void free_charset (re_charset_t
*cset
);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t
*preg
);
82 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
83 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
84 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
85 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
86 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
87 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
88 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
89 unsigned int constraint
);
90 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
91 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
93 static void calc_inveclosure (re_dfa_t
*dfa
);
94 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
96 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
97 static int peek_token (re_token_t
*token
, re_string_t
*input
,
99 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
100 reg_syntax_t syntax
);
101 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
102 reg_syntax_t syntax
, reg_errcode_t
*err
);
103 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
104 re_token_t
*token
, reg_syntax_t syntax
,
105 int nest
, reg_errcode_t
*err
);
106 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
107 re_token_t
*token
, reg_syntax_t syntax
,
108 int nest
, reg_errcode_t
*err
);
109 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
110 re_token_t
*token
, reg_syntax_t syntax
,
111 int nest
, reg_errcode_t
*err
);
112 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
113 re_token_t
*token
, reg_syntax_t syntax
,
114 int nest
, reg_errcode_t
*err
);
115 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
116 re_dfa_t
*dfa
, re_token_t
*token
,
117 reg_syntax_t syntax
, reg_errcode_t
*err
);
118 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
119 re_token_t
*token
, reg_syntax_t syntax
,
121 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
123 re_token_t
*token
, int token_len
,
125 reg_syntax_t syntax
);
126 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
130 # ifdef RE_ENABLE_I18N
131 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
132 re_charset_t
*mbcset
, int *range_alloc
,
133 bracket_elem_t
*start_elem
,
134 bracket_elem_t
*end_elem
);
135 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
136 re_charset_t
*mbcset
,
138 const unsigned char *name
);
139 # else /* not RE_ENABLE_I18N */
140 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
141 bracket_elem_t
*start_elem
,
142 bracket_elem_t
*end_elem
);
143 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
144 const unsigned char *name
);
145 # endif /* not RE_ENABLE_I18N */
146 #endif /* not _LIBC */
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
149 re_charset_t
*mbcset
,
150 int *equiv_class_alloc
,
151 const unsigned char *name
);
152 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
153 re_charset_t
*mbcset
,
154 int *char_class_alloc
,
155 const unsigned char *class_name
,
156 reg_syntax_t syntax
);
157 #else /* not RE_ENABLE_I18N */
158 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
159 const unsigned char *name
);
160 static reg_errcode_t
build_charclass (re_bitset_ptr_t sbcset
,
161 const unsigned char *class_name
,
162 reg_syntax_t syntax
);
163 #endif /* not RE_ENABLE_I18N */
164 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, int not, reg_errcode_t
*err
);
165 static void free_bin_tree (bin_tree_t
*tree
);
166 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
167 re_token_type_t type
, int index
);
168 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
170 /* This table gives an error message for each of the error codes listed
171 in regex.h. Obviously the order here has to be same as there.
172 POSIX doesn't require that we do anything for REG_NOERROR,
173 but why not be nice? */
175 const char __re_error_msgid
[] attribute_hidden
=
177 #define REG_NOERROR_IDX 0
178 gettext_noop ("Success") /* REG_NOERROR */
180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
181 gettext_noop ("No match") /* REG_NOMATCH */
183 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
184 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
187 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
189 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
190 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
193 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
196 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
198 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
199 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
201 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
202 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
204 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
205 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
207 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
208 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
210 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
211 gettext_noop ("Invalid range end") /* REG_ERANGE */
213 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
214 gettext_noop ("Memory exhausted") /* REG_ESPACE */
216 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
217 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
219 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
220 gettext_noop ("Premature end of regular expression") /* REG_EEND */
222 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
223 gettext_noop ("Regular expression too big") /* REG_ESIZE */
225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
226 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
229 const size_t __re_error_msgid_idx
[] attribute_hidden
=
250 /* Entry points for GNU code. */
252 /* re_compile_pattern is the GNU regular expression compiler: it
253 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
254 Returns 0 if the pattern was valid, otherwise an error string.
256 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
257 are set in BUFP on entry. */
260 re_compile_pattern (pattern
, length
, bufp
)
263 struct re_pattern_buffer
*bufp
;
267 /* GNU code is written to assume at least RE_NREGS registers will be set
268 (and at least one extra will be -1). */
269 bufp
->regs_allocated
= REGS_UNALLOCATED
;
271 /* And GNU code determines whether or not to get register information
272 by passing null for the REGS argument to re_match, etc., not by
276 /* Match anchors at newline. */
277 bufp
->newline_anchor
= 1;
279 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
283 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
286 weak_alias (__re_compile_pattern
, re_compile_pattern
)
289 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
290 also be assigned to arbitrarily: each pattern buffer stores its own
291 syntax, so it can be changed between regex compilations. */
292 /* This has no initializer because initialized variables in Emacs
293 become read-only after dumping. */
294 reg_syntax_t re_syntax_options
;
297 /* Specify the precise syntax of regexps for compilation. This provides
298 for compatibility for various utilities which historically have
299 different, incompatible syntaxes.
301 The argument SYNTAX is a bit mask comprised of the various bits
302 defined in regex.h. We return the old syntax. */
305 re_set_syntax (syntax
)
308 reg_syntax_t ret
= re_syntax_options
;
310 re_syntax_options
= syntax
;
314 weak_alias (__re_set_syntax
, re_set_syntax
)
318 re_compile_fastmap (bufp
)
319 struct re_pattern_buffer
*bufp
;
321 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
322 char *fastmap
= bufp
->fastmap
;
324 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
325 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
326 if (dfa
->init_state
!= dfa
->init_state_word
)
327 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
328 if (dfa
->init_state
!= dfa
->init_state_nl
)
329 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
330 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
331 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
332 bufp
->fastmap_accurate
= 1;
336 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
339 /* Helper function for re_compile_fastmap.
340 Compile fastmap for the initial_state INIT_STATE. */
343 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
345 const re_dfastate_t
*init_state
;
348 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
350 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
352 int node
= init_state
->nodes
.elems
[node_cnt
];
353 re_token_type_t type
= dfa
->nodes
[node
].type
;
354 if (type
== OP_CONTEXT_NODE
)
356 node
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
357 type
= dfa
->nodes
[node
].type
;
360 if (type
== CHARACTER
)
361 fastmap
[dfa
->nodes
[node
].opr
.c
] = 1;
362 else if (type
== SIMPLE_BRACKET
)
365 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
366 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
367 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
370 #ifdef RE_ENABLE_I18N
371 else if (type
== COMPLEX_BRACKET
)
374 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
375 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
376 || cset
->nranges
|| cset
->nchar_classes
)
379 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
381 /* In this case we want to catch the bytes which are
382 the first byte of any collation elements.
383 e.g. In da_DK, we want to catch 'a' since "aa"
384 is a valid collation element, and don't catch
385 'b' since 'b' is the only collation element
386 which starts from 'b'. */
388 const int32_t *table
= (const int32_t *)
389 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
390 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
391 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
397 for (i
= 0; i
< SBC_MAX
; ++i
)
398 if (__btowc (i
) == WEOF
)
400 # endif /* not _LIBC */
402 for (i
= 0; i
< cset
->nmbchars
; ++i
)
405 wctomb (buf
, cset
->mbchars
[i
]);
406 fastmap
[*(unsigned char *) buf
] = 1;
409 #endif /* RE_ENABLE_I18N */
410 else if (type
== END_OF_RE
|| type
== OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type
== COMPLEX_BRACKET
413 #endif /* RE_ENABLE_I18N */
416 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
417 if (type
== END_OF_RE
)
418 bufp
->can_be_null
= 1;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 `buffer' to the compiled pattern;
431 `used' to the length of the compiled pattern;
432 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 `fastmap' to an allocated space for the fastmap;
437 `fastmap_accurate' to zero;
438 `re_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (preg
, pattern
, cflags
)
462 regex_t
*__restrict preg
;
463 const char *__restrict pattern
;
467 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC
);
474 /* Try to allocate space for the fastmap. */
475 preg
->fastmap
= re_malloc (char, SBC_MAX
);
476 if (BE (preg
->fastmap
== NULL
, 0))
479 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags
& REG_NEWLINE
)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax
&= ~RE_DOT_NEWLINE
;
485 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
486 /* It also changes the matching behavior. */
487 preg
->newline_anchor
= 1;
490 preg
->newline_anchor
= 0;
491 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
492 preg
->translate
= NULL
;
494 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret
== REG_ERPAREN
)
501 /* We have already checked preg->fastmap != NULL. */
502 if (BE (ret
== REG_NOERROR
, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
506 if (BE (re_compile_fastmap (preg
) == -2, 0))
508 /* Some error occurred while computing the fastmap, just forget
510 re_free (preg
->fastmap
);
511 preg
->fastmap
= NULL
;
518 weak_alias (__regcomp
, regcomp
)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (errcode
, preg
, errbuf
, errbuf_size
)
535 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
536 / sizeof (__re_error_msgid_idx
[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
545 msg_size
= strlen (msg
) + 1; /* Includes the null. */
547 if (BE (errbuf_size
!= 0, 1))
549 if (BE (msg_size
> errbuf_size
, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
554 memcpy (errbuf
, msg
, errbuf_size
- 1);
555 errbuf
[errbuf_size
- 1] = 0;
559 memcpy (errbuf
, msg
, msg_size
);
565 weak_alias (__regerror
, regerror
)
568 /* Free dynamically allocated space used by PREG. */
575 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
576 if (BE (dfa
!= NULL
, 1))
578 re_free (dfa
->subexps
);
580 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
582 re_token_t
*node
= dfa
->nodes
+ i
;
583 #ifdef RE_ENABLE_I18N
584 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
585 free_charset (node
->opr
.mbcset
);
587 #endif /* RE_ENABLE_I18N */
588 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
589 re_free (node
->opr
.sbcset
);
590 else if (node
->type
== OP_CONTEXT_NODE
)
592 if (dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
== OP_BACK_REF
)
594 if (node
->opr
.ctx_info
->bkref_eclosure
!= NULL
)
595 re_node_set_free (node
->opr
.ctx_info
->bkref_eclosure
);
596 re_free (node
->opr
.ctx_info
->bkref_eclosure
);
598 re_free (node
->opr
.ctx_info
);
601 re_free (dfa
->firsts
);
602 re_free (dfa
->nexts
);
603 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
605 if (dfa
->eclosures
!= NULL
)
606 re_node_set_free (dfa
->eclosures
+ i
);
607 if (dfa
->inveclosures
!= NULL
)
608 re_node_set_free (dfa
->inveclosures
+ i
);
609 if (dfa
->edests
!= NULL
)
610 re_node_set_free (dfa
->edests
+ i
);
612 re_free (dfa
->edests
);
613 re_free (dfa
->eclosures
);
614 re_free (dfa
->inveclosures
);
615 re_free (dfa
->nodes
);
617 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
619 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
620 for (j
= 0; j
< entry
->num
; ++j
)
622 re_dfastate_t
*state
= entry
->array
[j
];
623 if (state
->entrance_nodes
!= &state
->nodes
)
625 re_node_set_free (state
->entrance_nodes
);
626 re_free (state
->entrance_nodes
);
628 re_node_set_free (&state
->nodes
);
629 re_free (state
->trtable
);
630 re_free (state
->trtable_search
);
633 re_free (entry
->array
);
635 re_free (dfa
->state_table
);
637 if (dfa
->word_char
!= NULL
)
638 re_free (dfa
->word_char
);
641 re_free (preg
->fastmap
);
644 weak_alias (__regfree
, regfree
)
647 /* Entry points compatible with 4.2 BSD regex library. We don't define
648 them unless specifically requested. */
650 #if defined _REGEX_RE_COMP || defined _LIBC
652 /* BSD has one and only one pattern buffer. */
653 static struct re_pattern_buffer re_comp_buf
;
657 /* Make these definitions weak in libc, so POSIX programs can redefine
658 these names if they don't use our functions, and still use
659 regcomp/regexec above without link errors. */
669 if (!re_comp_buf
.buffer
)
670 return gettext ("No previous regular expression");
674 if (!re_comp_buf
.buffer
)
676 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
677 if (re_comp_buf
.fastmap
== NULL
)
678 return (char *) gettext (__re_error_msgid
679 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
682 /* Since `re_exec' always passes NULL for the `regs' argument, we
683 don't need to initialize the pattern buffer fields which affect it. */
685 /* Match anchors at newlines. */
686 re_comp_buf
.newline_anchor
= 1;
688 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
693 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
694 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
696 #endif /* _REGEX_RE_COMP */
698 /* Internal entry point.
699 Compile the regular expression PATTERN, whose length is LENGTH.
700 SYNTAX indicate regular expression's syntax. */
703 re_compile_internal (preg
, pattern
, length
, syntax
)
705 const char * pattern
;
709 reg_errcode_t err
= REG_NOERROR
;
713 /* Initialize the pattern buffer. */
714 preg
->fastmap_accurate
= 0;
715 preg
->syntax
= syntax
;
716 preg
->not_bol
= preg
->not_eol
= 0;
720 /* Initialize the dfa. */
721 dfa
= (re_dfa_t
*) preg
->buffer
;
722 if (preg
->allocated
< sizeof (re_dfa_t
))
724 /* If zero allocated, but buffer is non-null, try to realloc
725 enough space. This loses if buffer's address is bogus, but
726 that is the user's responsibility. If ->buffer is NULL this
727 is a simple allocation. */
728 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
731 preg
->allocated
= sizeof (re_dfa_t
);
733 preg
->buffer
= (unsigned char *) dfa
;
734 preg
->used
= sizeof (re_dfa_t
);
736 err
= init_dfa (dfa
, length
);
737 if (BE (err
!= REG_NOERROR
, 0))
744 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
746 if (BE (err
!= REG_NOERROR
, 0))
753 /* Parse the regular expression, and build a structure tree. */
755 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
756 if (BE (dfa
->str_tree
== NULL
, 0))
757 goto re_compile_internal_free_return
;
759 /* Analyze the tree and collect information which is necessary to
762 if (BE (err
!= REG_NOERROR
, 0))
763 goto re_compile_internal_free_return
;
765 /* Then create the initial state of the dfa. */
766 err
= create_initial_state (dfa
);
767 if (BE (err
!= REG_NOERROR
, 0))
768 goto re_compile_internal_free_return
;
770 re_compile_internal_free_return
:
771 /* Release work areas. */
772 free_workarea_compile (preg
);
773 re_string_destruct (®exp
);
778 /* Initialize DFA. We use the length of the regular expression PAT_LEN
779 as the initial length of some arrays. */
782 init_dfa (dfa
, pat_len
)
788 memset (dfa
, '\0', sizeof (re_dfa_t
));
790 dfa
->nodes_alloc
= pat_len
+ 1;
791 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
793 dfa
->states_alloc
= pat_len
+ 1;
795 /* table_size = 2 ^ ceil(log pat_len) */
796 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
797 if (table_size
> pat_len
)
800 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
801 dfa
->state_hash_mask
= table_size
- 1;
803 dfa
->subexps_alloc
= 1;
804 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
805 dfa
->word_char
= NULL
;
807 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
808 || dfa
->subexps
== NULL
, 0))
810 /* We don't bother to free anything which was allocated. Very
811 soon the process will go down anyway. */
813 dfa
->state_table
= NULL
;
820 /* Initialize WORD_CHAR table, which indicate which character is
821 "word". In this case "word" means that it is the word construction
822 character used by some operators like "\<", "\>", etc. */
829 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
830 if (BE (dfa
->word_char
== NULL
, 0))
832 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
833 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
834 if (isalnum (ch
) || ch
== '_')
835 dfa
->word_char
[i
] |= 1 << j
;
839 /* Free the work area which are only used while compiling. */
842 free_workarea_compile (preg
)
845 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
846 free_bin_tree (dfa
->str_tree
);
847 dfa
->str_tree
= NULL
;
850 /* Create initial states for all contexts. */
853 create_initial_state (dfa
)
858 re_node_set init_nodes
;
860 /* Initial states have the epsilon closure of the node which is
861 the first node of the regular expression. */
862 first
= dfa
->str_tree
->first
;
863 dfa
->init_node
= first
;
864 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
865 if (BE (err
!= REG_NOERROR
, 0))
868 /* The back-references which are in initial states can epsilon transit,
869 since in this case all of the subexpressions can be null.
870 Then we add epsilon closures of the nodes which are the next nodes of
871 the back-references. */
872 if (dfa
->nbackref
> 0)
873 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
875 int node_idx
= init_nodes
.elems
[i
];
876 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
877 if (type
== OP_CONTEXT_NODE
878 && (dfa
->nodes
[dfa
->nodes
[node_idx
].opr
.ctx_info
->entity
].type
881 int prev_nelem
= init_nodes
.nelem
;
882 re_node_set_merge (&init_nodes
,
883 dfa
->nodes
[node_idx
].opr
.ctx_info
->bkref_eclosure
);
884 if (prev_nelem
< init_nodes
.nelem
)
887 else if (type
== OP_BACK_REF
)
889 int next_idx
= dfa
->nexts
[node_idx
];
890 if (!re_node_set_contains (&init_nodes
, next_idx
))
892 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ next_idx
);
898 /* It must be the first time to invoke acquire_state. */
899 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
900 /* We don't check ERR here, since the initial state must not be NULL. */
901 if (BE (dfa
->init_state
== NULL
, 0))
903 if (dfa
->init_state
->has_constraint
)
905 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
907 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
909 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
913 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
914 || dfa
->init_state_begbuf
== NULL
, 0))
918 dfa
->init_state_word
= dfa
->init_state_nl
919 = dfa
->init_state_begbuf
= dfa
->init_state
;
921 re_node_set_free (&init_nodes
);
925 /* Analyze the structure tree, and calculate "first", "next", "edest",
926 "eclosure", and "inveclosure". */
935 /* Allocate arrays. */
936 dfa
->firsts
= re_malloc (int, dfa
->nodes_alloc
);
937 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
938 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
939 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
940 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
941 if (BE (dfa
->firsts
== NULL
|| dfa
->nexts
== NULL
|| dfa
->edests
== NULL
942 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
944 /* Initialize them. */
945 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
949 re_node_set_init_empty (dfa
->edests
+ i
);
950 re_node_set_init_empty (dfa
->eclosures
+ i
);
951 re_node_set_init_empty (dfa
->inveclosures
+ i
);
954 ret
= analyze_tree (dfa
, dfa
->str_tree
);
955 if (BE (ret
== REG_NOERROR
, 1))
957 ret
= calc_eclosure (dfa
);
958 if (ret
== REG_NOERROR
)
959 calc_inveclosure (dfa
);
964 /* Helper functions for analyze.
965 This function calculate "first", "next", and "edest" for the subtree
966 whose root is NODE. */
969 analyze_tree (dfa
, node
)
974 if (node
->first
== -1)
975 calc_first (dfa
, node
);
976 if (node
->next
== -1)
977 calc_next (dfa
, node
);
978 if (node
->eclosure
.nelem
== 0)
979 calc_epsdest (dfa
, node
);
980 /* Calculate "first" etc. for the left child. */
981 if (node
->left
!= NULL
)
983 ret
= analyze_tree (dfa
, node
->left
);
984 if (BE (ret
!= REG_NOERROR
, 0))
987 /* Calculate "first" etc. for the right child. */
988 if (node
->right
!= NULL
)
990 ret
= analyze_tree (dfa
, node
->right
);
991 if (BE (ret
!= REG_NOERROR
, 0))
997 /* Calculate "first" for the node NODE. */
999 calc_first (dfa
, node
)
1004 idx
= node
->node_idx
;
1005 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1010 case OP_OPEN_BRACKET
:
1011 case OP_CLOSE_BRACKET
:
1012 case OP_OPEN_DUP_NUM
:
1013 case OP_CLOSE_DUP_NUM
:
1014 case OP_NON_MATCH_LIST
:
1015 case OP_OPEN_COLL_ELEM
:
1016 case OP_CLOSE_COLL_ELEM
:
1017 case OP_OPEN_EQUIV_CLASS
:
1018 case OP_CLOSE_EQUIV_CLASS
:
1019 case OP_OPEN_CHAR_CLASS
:
1020 case OP_CLOSE_CHAR_CLASS
:
1021 /* These must not be appeared here. */
1027 case OP_DUP_ASTERISK
:
1028 case OP_DUP_QUESTION
:
1029 #ifdef RE_ENABLE_I18N
1030 case COMPLEX_BRACKET
:
1031 #endif /* RE_ENABLE_I18N */
1032 case SIMPLE_BRACKET
:
1035 case OP_OPEN_SUBEXP
:
1036 case OP_CLOSE_SUBEXP
:
1041 assert (node
->left
!= NULL
);
1043 if (node
->left
->first
== -1)
1044 calc_first (dfa
, node
->left
);
1045 node
->first
= node
->left
->first
;
1050 /* else fall through */
1053 assert (node
->left
!= NULL
);
1055 if (node
->left
->first
== -1)
1056 calc_first (dfa
, node
->left
);
1057 node
->first
= node
->left
->first
;
1060 if (node
->type
== 0)
1061 dfa
->firsts
[idx
] = node
->first
;
1064 /* Calculate "next" for the node NODE. */
1067 calc_next (dfa
, node
)
1072 bin_tree_t
*parent
= node
->parent
;
1076 idx
= node
->node_idx
;
1077 if (node
->type
== 0)
1078 dfa
->nexts
[idx
] = node
->next
;
1082 idx
= parent
->node_idx
;
1083 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1087 case OP_DUP_ASTERISK
:
1092 if (parent
->left
== node
)
1094 if (parent
->right
->first
== -1)
1095 calc_first (dfa
, parent
->right
);
1096 node
->next
= parent
->right
->first
;
1099 /* else fall through */
1101 if (parent
->next
== -1)
1102 calc_next (dfa
, parent
);
1103 node
->next
= parent
->next
;
1106 idx
= node
->node_idx
;
1107 if (node
->type
== 0)
1108 dfa
->nexts
[idx
] = node
->next
;
1111 /* Calculate "edest" for the node NODE. */
1114 calc_epsdest (dfa
, node
)
1119 idx
= node
->node_idx
;
1120 if (node
->type
== 0)
1122 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1123 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1124 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1126 if (node
->left
->first
== -1)
1127 calc_first (dfa
, node
->left
);
1128 if (node
->next
== -1)
1129 calc_next (dfa
, node
);
1130 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1133 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1136 if (node
->left
!= NULL
)
1138 if (node
->left
->first
== -1)
1139 calc_first (dfa
, node
->left
);
1140 left
= node
->left
->first
;
1144 if (node
->next
== -1)
1145 calc_next (dfa
, node
);
1148 if (node
->right
!= NULL
)
1150 if (node
->right
->first
== -1)
1151 calc_first (dfa
, node
->right
);
1152 right
= node
->right
->first
;
1156 if (node
->next
== -1)
1157 calc_next (dfa
, node
);
1160 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1162 else if (dfa
->nodes
[idx
].type
== ANCHOR
1163 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1164 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
)
1165 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1169 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1170 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1171 otherwise return the error code. */
1173 static reg_errcode_t
1174 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1176 int *new_idx
, org_idx
;
1177 unsigned int constraint
;
1183 dup
.type
= OP_CONTEXT_NODE
;
1184 if (dfa
->nodes
[org_idx
].type
== OP_CONTEXT_NODE
)
1186 /* If the node whose index is ORG_IDX is the same as the intended
1188 if (dfa
->nodes
[org_idx
].constraint
== constraint
)
1193 dup
.constraint
= constraint
|
1194 dfa
->nodes
[org_idx
].constraint
;
1197 dup
.constraint
= constraint
;
1199 /* In case that `entity' points OP_CONTEXT_NODE,
1200 we correct `entity' to real entity in calc_inveclosures(). */
1201 dup
.opr
.ctx_info
= malloc (sizeof (*dup
.opr
.ctx_info
));
1202 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1203 if (BE (dup
.opr
.ctx_info
== NULL
|| dup_idx
== -1, 0))
1205 dup
.opr
.ctx_info
->entity
= org_idx
;
1206 dup
.opr
.ctx_info
->bkref_eclosure
= NULL
;
1208 dfa
->nodes
[dup_idx
].duplicated
= 1;
1209 dfa
->firsts
[dup_idx
] = dfa
->firsts
[org_idx
];
1210 dfa
->nexts
[dup_idx
] = dfa
->nexts
[org_idx
];
1211 err
= re_node_set_init_copy (dfa
->edests
+ dup_idx
, dfa
->edests
+ org_idx
);
1212 if (BE (err
!= REG_NOERROR
, 0))
1214 /* Since we don't duplicate epsilon nodes, epsilon closure have
1216 err
= re_node_set_init_1 (dfa
->eclosures
+ dup_idx
, dup_idx
);
1217 if (BE (err
!= REG_NOERROR
, 0))
1219 err
= re_node_set_init_1 (dfa
->inveclosures
+ dup_idx
, dup_idx
);
1220 if (BE (err
!= REG_NOERROR
, 0))
1222 /* Then we must update inveclosure for this node.
1223 We process them at last part of calc_eclosure(),
1224 since we don't complete to calculate them here. */
1231 calc_inveclosure (dfa
)
1234 int src
, idx
, dest
, entity
;
1235 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1237 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1239 dest
= dfa
->eclosures
[src
].elems
[idx
];
1240 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1244 while (dfa
->nodes
[entity
].type
== OP_CONTEXT_NODE
)
1246 entity
= dfa
->nodes
[entity
].opr
.ctx_info
->entity
;
1247 re_node_set_merge (dfa
->inveclosures
+ src
,
1248 dfa
->inveclosures
+ entity
);
1249 dfa
->nodes
[src
].opr
.ctx_info
->entity
= entity
;
1254 /* Calculate "eclosure" for all the node in DFA. */
1256 static reg_errcode_t
1260 int idx
, node_idx
, max
, incomplete
= 0;
1262 assert (dfa
->nodes_len
> 0);
1264 /* For each nodes, calculate epsilon closure. */
1265 for (node_idx
= 0, max
= dfa
->nodes_len
; ; ++node_idx
)
1268 re_node_set eclosure_elem
;
1269 if (node_idx
== max
)
1278 assert (dfa
->nodes
[node_idx
].type
!= OP_CONTEXT_NODE
);
1279 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1281 /* If we have already calculated, skip it. */
1282 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1284 /* Calculate epsilon closure of `node_idx'. */
1285 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1286 if (BE (err
!= REG_NOERROR
, 0))
1289 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1292 re_node_set_free (&eclosure_elem
);
1296 /* for duplicated nodes. */
1297 for (idx
= max
; idx
< dfa
->nodes_len
; ++idx
)
1299 int entity
, i
, constraint
;
1300 re_node_set
*bkref_eclosure
;
1301 entity
= dfa
->nodes
[idx
].opr
.ctx_info
->entity
;
1302 re_node_set_merge (dfa
->inveclosures
+ idx
, dfa
->inveclosures
+ entity
);
1303 if (dfa
->nodes
[entity
].type
!= OP_BACK_REF
)
1306 /* If the node is backreference, duplicate the epsilon closure of
1307 the next node. Since it may epsilon transit. */
1308 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1309 bkref_eclosure
= re_malloc (re_node_set
, 1);
1310 if (BE (bkref_eclosure
== NULL
, 0))
1312 re_node_set_init_empty (bkref_eclosure
);
1313 constraint
= dfa
->nodes
[idx
].constraint
;
1314 for (i
= 0; i
< dfa
->eclosures
[dfa
->nexts
[idx
]].nelem
; ++i
)
1316 int dest_node_idx
= dfa
->eclosures
[dfa
->nexts
[idx
]].elems
[i
];
1317 if (!IS_EPSILON_NODE (dfa
->nodes
[dest_node_idx
].type
))
1320 err
= duplicate_node (&dest_node_idx
, dfa
, dest_node_idx
,
1322 if (BE (err
!= REG_NOERROR
, 0))
1325 re_node_set_insert (bkref_eclosure
, dest_node_idx
);
1327 dfa
->nodes
[idx
].opr
.ctx_info
->bkref_eclosure
= bkref_eclosure
;
1333 /* Calculate epsilon closure of NODE. */
1335 static reg_errcode_t
1336 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1337 re_node_set
*new_set
;
1342 unsigned int constraint
;
1343 int i
, max
, incomplete
= 0;
1344 re_node_set eclosure
;
1345 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1346 if (BE (err
!= REG_NOERROR
, 0))
1349 /* This indicates that we are calculating this node now.
1350 We reference this value to avoid infinite loop. */
1351 dfa
->eclosures
[node
].nelem
= -1;
1353 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1354 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1356 /* Expand each epsilon destination nodes. */
1357 if (dfa
->edests
[node
].nelem
!= 0)
1358 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1360 re_node_set eclosure_elem
;
1361 int edest
= dfa
->edests
[node
].elems
[i
];
1362 /* If calculating the epsilon closure of `edest' is in progress,
1363 return intermediate result. */
1364 if (dfa
->eclosures
[edest
].nelem
== -1)
1369 /* If we haven't calculated the epsilon closure of `edest' yet,
1370 calculate now. Otherwise use calculated epsilon closure. */
1371 if (dfa
->eclosures
[edest
].nelem
== 0)
1373 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1374 if (BE (err
!= REG_NOERROR
, 0))
1378 eclosure_elem
= dfa
->eclosures
[edest
];
1379 /* Merge the epsilon closure of `edest'. */
1380 re_node_set_merge (&eclosure
, &eclosure_elem
);
1381 /* If the epsilon closure of `edest' is incomplete,
1382 the epsilon closure of this node is also incomplete. */
1383 if (dfa
->eclosures
[edest
].nelem
== 0)
1386 re_node_set_free (&eclosure_elem
);
1390 /* If the current node has constraints, duplicate all non-epsilon nodes.
1391 Since they must inherit the constraints. */
1393 for (i
= 0, max
= eclosure
.nelem
; i
< max
; ++i
)
1395 int dest
= eclosure
.elems
[i
];
1396 if (!IS_EPSILON_NODE (dfa
->nodes
[dest
].type
))
1400 err
= duplicate_node (&dup_dest
, dfa
, dest
, constraint
);
1401 if (BE (err
!= REG_NOERROR
, 0))
1403 if (dest
!= dup_dest
)
1405 re_node_set_remove_at (&eclosure
, i
--);
1406 re_node_set_insert (&eclosure
, dup_dest
);
1412 /* Epsilon closures include itself. */
1413 re_node_set_insert (&eclosure
, node
);
1414 if (incomplete
&& !root
)
1415 dfa
->eclosures
[node
].nelem
= 0;
1417 dfa
->eclosures
[node
] = eclosure
;
1418 *new_set
= eclosure
;
1422 /* Functions for token which are used in the parser. */
1424 /* Fetch a token from INPUT.
1425 We must not use this function inside bracket expressions. */
1428 fetch_token (input
, syntax
)
1430 reg_syntax_t syntax
;
1434 consumed_byte
= peek_token (&token
, input
, syntax
);
1435 re_string_skip_bytes (input
, consumed_byte
);
1439 /* Peek a token from INPUT, and return the length of the token.
1440 We must not use this function inside bracket expressions. */
1443 peek_token (token
, input
, syntax
)
1446 reg_syntax_t syntax
;
1450 if (re_string_eoi (input
))
1452 token
->type
= END_OF_RE
;
1456 c
= re_string_peek_byte (input
, 0);
1459 #ifdef RE_ENABLE_I18N
1460 token
->mb_partial
= 0;
1461 if (MB_CUR_MAX
> 1 &&
1462 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1464 token
->type
= CHARACTER
;
1465 token
->mb_partial
= 1;
1472 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1474 token
->type
= BACK_SLASH
;
1478 c2
= re_string_peek_byte_case (input
, 1);
1480 token
->type
= CHARACTER
;
1484 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1485 token
->type
= OP_ALT
;
1487 case '1': case '2': case '3': case '4': case '5':
1488 case '6': case '7': case '8': case '9':
1489 if (!(syntax
& RE_NO_BK_REFS
))
1491 token
->type
= OP_BACK_REF
;
1492 token
->opr
.idx
= c2
- '0';
1496 if (!(syntax
& RE_NO_GNU_OPS
))
1498 token
->type
= ANCHOR
;
1499 token
->opr
.idx
= WORD_FIRST
;
1503 if (!(syntax
& RE_NO_GNU_OPS
))
1505 token
->type
= ANCHOR
;
1506 token
->opr
.idx
= WORD_LAST
;
1510 if (!(syntax
& RE_NO_GNU_OPS
))
1512 token
->type
= ANCHOR
;
1513 token
->opr
.idx
= WORD_DELIM
;
1517 if (!(syntax
& RE_NO_GNU_OPS
))
1519 token
->type
= ANCHOR
;
1520 token
->opr
.idx
= INSIDE_WORD
;
1524 if (!(syntax
& RE_NO_GNU_OPS
))
1525 token
->type
= OP_WORD
;
1528 if (!(syntax
& RE_NO_GNU_OPS
))
1529 token
->type
= OP_NOTWORD
;
1532 if (!(syntax
& RE_NO_GNU_OPS
))
1534 token
->type
= ANCHOR
;
1535 token
->opr
.idx
= BUF_FIRST
;
1539 if (!(syntax
& RE_NO_GNU_OPS
))
1541 token
->type
= ANCHOR
;
1542 token
->opr
.idx
= BUF_LAST
;
1546 if (!(syntax
& RE_NO_BK_PARENS
))
1547 token
->type
= OP_OPEN_SUBEXP
;
1550 if (!(syntax
& RE_NO_BK_PARENS
))
1551 token
->type
= OP_CLOSE_SUBEXP
;
1554 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1555 token
->type
= OP_DUP_PLUS
;
1558 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1559 token
->type
= OP_DUP_QUESTION
;
1562 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1563 token
->type
= OP_OPEN_DUP_NUM
;
1566 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1567 token
->type
= OP_CLOSE_DUP_NUM
;
1575 token
->type
= CHARACTER
;
1579 if (syntax
& RE_NEWLINE_ALT
)
1580 token
->type
= OP_ALT
;
1583 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1584 token
->type
= OP_ALT
;
1587 token
->type
= OP_DUP_ASTERISK
;
1590 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1591 token
->type
= OP_DUP_PLUS
;
1594 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1595 token
->type
= OP_DUP_QUESTION
;
1598 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1599 token
->type
= OP_OPEN_DUP_NUM
;
1602 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1603 token
->type
= OP_CLOSE_DUP_NUM
;
1606 if (syntax
& RE_NO_BK_PARENS
)
1607 token
->type
= OP_OPEN_SUBEXP
;
1610 if (syntax
& RE_NO_BK_PARENS
)
1611 token
->type
= OP_CLOSE_SUBEXP
;
1614 token
->type
= OP_OPEN_BRACKET
;
1617 token
->type
= OP_PERIOD
;
1620 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1621 re_string_cur_idx (input
) != 0)
1623 char prev
= re_string_peek_byte (input
, -1);
1624 if (prev
!= '|' && prev
!= '(' &&
1625 (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n'))
1628 token
->type
= ANCHOR
;
1629 token
->opr
.idx
= LINE_FIRST
;
1632 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1633 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1636 re_string_skip_bytes (input
, 1);
1637 peek_token (&next
, input
, syntax
);
1638 re_string_skip_bytes (input
, -1);
1639 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1642 token
->type
= ANCHOR
;
1643 token
->opr
.idx
= LINE_LAST
;
1651 /* Peek a token from INPUT, and return the length of the token.
1652 We must not use this function out of bracket expressions. */
1655 peek_token_bracket (token
, input
, syntax
)
1658 reg_syntax_t syntax
;
1661 if (re_string_eoi (input
))
1663 token
->type
= END_OF_RE
;
1666 c
= re_string_peek_byte (input
, 0);
1669 #ifdef RE_ENABLE_I18N
1670 if (MB_CUR_MAX
> 1 &&
1671 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1673 token
->type
= CHARACTER
;
1676 #endif /* RE_ENABLE_I18N */
1678 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1680 /* In this case, '\' escape a character. */
1682 c2
= re_string_peek_byte (input
, 1);
1684 token
->type
= CHARACTER
;
1687 if (c
== '[') /* '[' is a special char in a bracket exps. */
1691 c2
= re_string_peek_byte (input
, 1);
1697 token
->type
= OP_OPEN_COLL_ELEM
;
1700 token
->type
= OP_OPEN_EQUIV_CLASS
;
1703 if (syntax
& RE_CHAR_CLASSES
)
1705 token
->type
= OP_OPEN_CHAR_CLASS
;
1708 /* else fall through. */
1710 token
->type
= CHARACTER
;
1720 token
->type
= OP_CHARSET_RANGE
;
1723 token
->type
= OP_CLOSE_BRACKET
;
1726 token
->type
= OP_NON_MATCH_LIST
;
1729 token
->type
= CHARACTER
;
1734 /* Functions for parser. */
1736 /* Entry point of the parser.
1737 Parse the regular expression REGEXP and return the structure tree.
1738 If an error is occured, ERR is set by error code, and return NULL.
1739 This function build the following tree, from regular expression <reg_exp>:
1745 CAT means concatenation.
1746 EOR means end of regular expression. */
1749 parse (regexp
, preg
, syntax
, err
)
1750 re_string_t
*regexp
;
1752 reg_syntax_t syntax
;
1755 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1756 bin_tree_t
*tree
, *eor
, *root
;
1757 re_token_t current_token
;
1759 current_token
= fetch_token (regexp
, syntax
);
1760 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1761 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1763 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1764 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1766 root
= create_tree (tree
, eor
, CONCAT
, 0);
1769 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1770 return *err
= REG_ESPACE
, NULL
;
1774 /* This function build the following tree, from regular expression
1775 <branch1>|<branch2>:
1781 ALT means alternative, which represents the operator `|'. */
1784 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1785 re_string_t
*regexp
;
1788 reg_syntax_t syntax
;
1792 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1793 bin_tree_t
*tree
, *branch
= NULL
;
1795 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1796 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1799 while (token
->type
== OP_ALT
)
1801 re_token_t alt_token
= *token
;
1802 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1803 *token
= fetch_token (regexp
, syntax
);
1804 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1805 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1807 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1808 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1810 free_bin_tree (tree
);
1816 tree
= create_tree (tree
, branch
, 0, new_idx
);
1817 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1818 return *err
= REG_ESPACE
, NULL
;
1823 /* This function build the following tree, from regular expression
1830 CAT means concatenation. */
1833 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1834 re_string_t
*regexp
;
1837 reg_syntax_t syntax
;
1841 bin_tree_t
*tree
, *exp
;
1842 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1843 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1846 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1847 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1849 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1850 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1852 free_bin_tree (tree
);
1855 if (tree
!= NULL
&& exp
!= NULL
)
1857 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1859 return *err
= REG_ESPACE
, NULL
;
1861 else if (tree
== NULL
)
1863 /* Otherwise exp == NULL, we don't need to create new tree. */
1868 /* This function build the following tree, from regular expression a*:
1875 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1876 re_string_t
*regexp
;
1879 reg_syntax_t syntax
;
1883 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1886 switch (token
->type
)
1889 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1890 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1891 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1892 return *err
= REG_ESPACE
, NULL
;
1893 #ifdef RE_ENABLE_I18N
1896 while (!re_string_eoi (regexp
)
1897 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1899 bin_tree_t
*mbc_remain
;
1900 *token
= fetch_token (regexp
, syntax
);
1901 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1902 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1903 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1904 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1905 return *err
= REG_ESPACE
, NULL
;
1910 case OP_OPEN_SUBEXP
:
1911 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
1912 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1915 case OP_OPEN_BRACKET
:
1916 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
1917 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1921 if (BE (preg
->re_nsub
< token
->opr
.idx
1922 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
1927 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1928 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1929 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1930 return *err
= REG_ESPACE
, NULL
;
1932 dfa
->has_mb_node
= 1;
1934 case OP_DUP_ASTERISK
:
1936 case OP_DUP_QUESTION
:
1937 case OP_OPEN_DUP_NUM
:
1938 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1939 return *err
= REG_BADRPT
, NULL
;
1940 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
1942 *token
= fetch_token (regexp
, syntax
);
1943 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1945 /* else fall through */
1946 case OP_CLOSE_SUBEXP
:
1947 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
1948 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
1949 return *err
= REG_ERPAREN
, NULL
;
1950 /* else fall through */
1951 case OP_CLOSE_DUP_NUM
:
1952 /* We treat it as a normal character. */
1954 /* Then we can these characters as normal characters. */
1955 token
->type
= CHARACTER
;
1956 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1957 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1958 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1959 return *err
= REG_ESPACE
, NULL
;
1962 if (dfa
->word_char
== NULL
)
1964 *err
= init_word_char (dfa
);
1965 if (BE (*err
!= REG_NOERROR
, 0))
1968 if (token
->opr
.ctx_type
== WORD_DELIM
)
1970 bin_tree_t
*tree_first
, *tree_last
;
1971 int idx_first
, idx_last
;
1972 token
->opr
.ctx_type
= WORD_FIRST
;
1973 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
1974 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
1975 token
->opr
.ctx_type
= WORD_LAST
;
1976 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
1977 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
1978 token
->type
= OP_ALT
;
1979 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1980 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
1981 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
1982 || tree_first
== NULL
|| tree_last
== NULL
1983 || tree
== NULL
, 0))
1984 return *err
= REG_ESPACE
, NULL
;
1988 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1989 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1990 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1991 return *err
= REG_ESPACE
, NULL
;
1993 /* We must return here, since ANCHORs can't be followed
1994 by repetition operators.
1995 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
1996 it must not be "<ANCHOR(^)><REPEAT(*)>". */
1997 *token
= fetch_token (regexp
, syntax
);
2000 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2001 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2002 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2003 return *err
= REG_ESPACE
, NULL
;
2005 dfa
->has_mb_node
= 1;
2008 tree
= build_word_op (dfa
, 0, err
);
2009 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2013 tree
= build_word_op (dfa
, 1, err
);
2014 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2024 /* Must not happen? */
2030 *token
= fetch_token (regexp
, syntax
);
2032 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2033 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2035 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2036 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2043 /* This function build the following tree, from regular expression
2051 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2052 re_string_t
*regexp
;
2055 reg_syntax_t syntax
;
2059 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2060 bin_tree_t
*tree
, *left_par
, *right_par
;
2063 cur_nsub
= preg
->re_nsub
++;
2064 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2066 re_subexp_t
*new_array
;
2067 dfa
->subexps_alloc
*= 2;
2068 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2069 if (BE (new_array
== NULL
, 0))
2071 dfa
->subexps_alloc
/= 2;
2075 dfa
->subexps
= new_array
;
2077 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2078 dfa
->subexps
[cur_nsub
].end
= -1;
2080 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2081 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2082 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2083 return *err
= REG_ESPACE
, NULL
;
2084 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2085 *token
= fetch_token (regexp
, syntax
);
2087 /* The subexpression may be a null string. */
2088 if (token
->type
== OP_CLOSE_SUBEXP
)
2092 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2093 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2096 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2098 free_bin_tree (tree
);
2102 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2103 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2104 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2105 tree
= ((tree
== NULL
) ? right_par
2106 : create_tree (tree
, right_par
, CONCAT
, 0));
2107 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2108 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2109 return *err
= REG_ESPACE
, NULL
;
2110 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2115 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2118 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2119 bin_tree_t
*dup_elem
;
2120 re_string_t
*regexp
;
2123 reg_syntax_t syntax
;
2126 re_token_t dup_token
;
2127 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2128 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2129 re_token_t start_token
= *token
;
2130 if (token
->type
== OP_OPEN_DUP_NUM
)
2134 int start
= fetch_number (regexp
, token
, syntax
);
2138 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2139 start
= 0; /* We treat "{,m}" as "{0,m}". */
2142 *err
= REG_BADBR
; /* <re>{} is invalid. */
2146 if (BE (start
!= -2, 1))
2148 /* We treat "{n}" as "{n,n}". */
2149 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2150 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2151 ? fetch_number (regexp
, token
, syntax
) : -2));
2153 if (BE (start
== -2 || end
== -2, 0))
2155 /* Invalid sequence. */
2156 if (token
->type
== OP_CLOSE_DUP_NUM
)
2157 goto parse_dup_op_invalid_interval
;
2159 goto parse_dup_op_ebrace
;
2161 if (BE (start
== 0 && end
== 0, 0))
2163 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2164 *token
= fetch_token (regexp
, syntax
);
2165 free_bin_tree (dup_elem
);
2169 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2171 for (i
= 0; i
< start
; ++i
)
2174 work_tree
= duplicate_tree (elem
, dfa
);
2175 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2176 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2177 goto parse_dup_op_espace
;
2182 /* We treat "<re>{0,}" as "<re>*". */
2183 dup_token
.type
= OP_DUP_ASTERISK
;
2186 elem
= duplicate_tree (elem
, dfa
);
2187 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2188 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2189 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2190 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2191 || tree
== NULL
, 0))
2192 goto parse_dup_op_espace
;
2196 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2197 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2198 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2199 goto parse_dup_op_espace
;
2202 else if (end
- start
> 0)
2204 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2205 dup_token
.type
= OP_DUP_QUESTION
;
2208 elem
= duplicate_tree (elem
, dfa
);
2209 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2210 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2211 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2212 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2213 goto parse_dup_op_espace
;
2217 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2218 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2219 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2220 goto parse_dup_op_espace
;
2222 for (i
= 1; i
< end
- start
; ++i
)
2224 work_tree
= duplicate_tree (elem
, dfa
);
2225 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2226 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2227 return *err
= REG_ESPACE
, NULL
;
2233 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2234 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2235 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2236 return *err
= REG_ESPACE
, NULL
;
2238 *token
= fetch_token (regexp
, syntax
);
2241 parse_dup_op_espace
:
2242 free_bin_tree (tree
);
2246 parse_dup_op_ebrace
:
2247 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2252 goto parse_dup_op_rollback
;
2253 parse_dup_op_invalid_interval
:
2254 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2259 parse_dup_op_rollback
:
2260 re_string_set_index (regexp
, start_idx
);
2261 *token
= start_token
;
2262 token
->type
= CHARACTER
;
2266 /* Size of the names for collating symbol/equivalence_class/character_class.
2267 I'm not sure, but maybe enough. */
2268 #define BRACKET_NAME_BUF_SIZE 32
2271 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2272 Build the range expression which starts from START_ELEM, and ends
2273 at END_ELEM. The result are written to MBCSET and SBCSET.
2274 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2275 mbcset->range_ends, is a pointer argument sinse we may
2278 static reg_errcode_t
2279 # ifdef RE_ENABLE_I18N
2280 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2281 re_charset_t
*mbcset
;
2283 # else /* not RE_ENABLE_I18N */
2284 build_range_exp (sbcset
, start_elem
, end_elem
)
2285 # endif /* not RE_ENABLE_I18N */
2286 re_bitset_ptr_t sbcset
;
2287 bracket_elem_t
*start_elem
, *end_elem
;
2289 unsigned int start_ch
, end_ch
;
2290 /* Equivalence Classes and Character Classes can't be a range start/end. */
2291 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2292 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2296 /* We can handle no multi character collating elements without libc
2298 if (BE ((start_elem
->type
== COLL_SYM
2299 && strlen ((char *) start_elem
->opr
.name
) > 1)
2300 || (end_elem
->type
== COLL_SYM
2301 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2302 return REG_ECOLLATE
;
2304 # ifdef RE_ENABLE_I18N
2306 wchar_t wc
, start_wc
, end_wc
;
2307 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2309 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2310 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2312 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2313 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2315 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2316 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2317 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2318 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2319 cmp_buf
[0] = start_wc
;
2320 cmp_buf
[4] = end_wc
;
2321 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2324 /* Check the space of the arrays. */
2325 if (*range_alloc
== mbcset
->nranges
)
2327 /* There are not enough space, need realloc. */
2328 wchar_t *new_array_start
, *new_array_end
;
2331 /* +1 in case of mbcset->nranges is 0. */
2332 new_nranges
= 2 * mbcset
->nranges
+ 1;
2333 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2334 are NULL if *range_alloc == 0. */
2335 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2337 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2340 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2343 mbcset
->range_starts
= new_array_start
;
2344 mbcset
->range_ends
= new_array_end
;
2345 *range_alloc
= new_nranges
;
2348 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2349 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2351 /* Build the table for single byte characters. */
2352 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2355 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2356 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2357 bitset_set (sbcset
, wc
);
2360 # else /* not RE_ENABLE_I18N */
2363 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2364 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2366 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2367 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2369 if (start_ch
> end_ch
)
2371 /* Build the table for single byte characters. */
2372 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2373 if (start_ch
<= ch
&& ch
<= end_ch
)
2374 bitset_set (sbcset
, ch
);
2376 # endif /* not RE_ENABLE_I18N */
2379 #endif /* not _LIBC */
2382 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2383 Build the collating element which is represented by NAME.
2384 The result are written to MBCSET and SBCSET.
2385 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2386 pointer argument since we may update it. */
2388 static reg_errcode_t
2389 # ifdef RE_ENABLE_I18N
2390 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2391 re_charset_t
*mbcset
;
2392 int *coll_sym_alloc
;
2393 # else /* not RE_ENABLE_I18N */
2394 build_collating_symbol (sbcset
, name
)
2395 # endif /* not RE_ENABLE_I18N */
2396 re_bitset_ptr_t sbcset
;
2397 const unsigned char *name
;
2399 size_t name_len
= strlen ((const char *) name
);
2400 if (BE (name_len
!= 1, 0))
2401 return REG_ECOLLATE
;
2404 bitset_set (sbcset
, name
[0]);
2408 #endif /* not _LIBC */
2410 /* This function parse bracket expression like "[abc]", "[a-c]",
2414 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2415 re_string_t
*regexp
;
2418 reg_syntax_t syntax
;
2422 const unsigned char *collseqmb
;
2423 const char *collseqwc
;
2426 const int32_t *symb_table
;
2427 const unsigned char *extra
;
2429 /* Local function for parse_bracket_exp used in _LIBC environement.
2430 Seek the collating symbol entry correspondings to NAME.
2431 Return the index of the symbol in the SYMB_TABLE. */
2433 static inline int32_t
2434 seek_collating_symbol_entry (name
, name_len
)
2435 const unsigned char *name
;
2438 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2439 int32_t elem
= hash
% table_size
;
2440 int32_t second
= hash
% (table_size
- 2);
2441 while (symb_table
[2 * elem
] != 0)
2443 /* First compare the hashing value. */
2444 if (symb_table
[2 * elem
] == hash
2445 /* Compare the length of the name. */
2446 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2447 /* Compare the name. */
2448 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2451 /* Yep, this is the entry. */
2461 /* Local function for parse_bracket_exp used in _LIBC environement.
2462 Look up the collation sequence value of BR_ELEM.
2463 Return the value if succeeded, UINT_MAX otherwise. */
2465 static inline unsigned int
2466 lookup_collation_sequence_value (br_elem
)
2467 bracket_elem_t
*br_elem
;
2469 if (br_elem
->type
== SB_CHAR
)
2472 if (MB_CUR_MAX == 1)
2475 return collseqmb
[br_elem
->opr
.ch
];
2478 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2479 return collseq_table_lookup (collseqwc
, wc
);
2482 else if (br_elem
->type
== MB_CHAR
)
2484 return collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2486 else if (br_elem
->type
== COLL_SYM
)
2488 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2492 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2494 if (symb_table
[2 * elem
] != 0)
2496 /* We found the entry. */
2497 idx
= symb_table
[2 * elem
+ 1];
2498 /* Skip the name of collating element name. */
2499 idx
+= 1 + extra
[idx
];
2500 /* Skip the byte sequence of the collating element. */
2501 idx
+= 1 + extra
[idx
];
2502 /* Adjust for the alignment. */
2503 idx
= (idx
+ 3) & ~3;
2504 /* Skip the multibyte collation sequence value. */
2505 idx
+= sizeof (unsigned int);
2506 /* Skip the wide char sequence of the collating element. */
2507 idx
+= sizeof (unsigned int) *
2508 (1 + *(unsigned int *) (extra
+ idx
));
2509 /* Return the collation sequence value. */
2510 return *(unsigned int *) (extra
+ idx
);
2512 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2514 /* No valid character. Match it as a single byte
2516 return collseqmb
[br_elem
->opr
.name
[0]];
2519 else if (sym_name_len
== 1)
2520 return collseqmb
[br_elem
->opr
.name
[0]];
2525 /* Local function for parse_bracket_exp used in _LIBC environement.
2526 Build the range expression which starts from START_ELEM, and ends
2527 at END_ELEM. The result are written to MBCSET and SBCSET.
2528 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2529 mbcset->range_ends, is a pointer argument sinse we may
2532 static inline reg_errcode_t
2533 # ifdef RE_ENABLE_I18N
2534 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2535 re_charset_t
*mbcset
;
2537 # else /* not RE_ENABLE_I18N */
2538 build_range_exp (sbcset
, start_elem
, end_elem
)
2539 # endif /* not RE_ENABLE_I18N */
2540 re_bitset_ptr_t sbcset
;
2541 bracket_elem_t
*start_elem
, *end_elem
;
2544 uint32_t start_collseq
;
2545 uint32_t end_collseq
;
2547 # ifdef RE_ENABLE_I18N
2548 /* Check the space of the arrays. */
2549 if (*range_alloc
== mbcset
->nranges
)
2551 /* There are not enough space, need realloc. */
2552 uint32_t *new_array_start
;
2553 uint32_t *new_array_end
;
2556 /* +1 in case of mbcset->nranges is 0. */
2557 new_nranges
= 2 * mbcset
->nranges
+ 1;
2558 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2559 are NULL if *range_alloc == 0. */
2560 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2562 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2565 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2568 mbcset
->range_starts
= new_array_start
;
2569 mbcset
->range_ends
= new_array_end
;
2570 *range_alloc
= new_nranges
;
2572 # endif /* RE_ENABLE_I18N */
2574 /* Equivalence Classes and Character Classes can't be a range
2576 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2577 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2581 start_collseq
= lookup_collation_sequence_value (start_elem
);
2582 end_collseq
= lookup_collation_sequence_value (end_elem
);
2583 /* Check start/end collation sequence values. */
2584 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2585 return REG_ECOLLATE
;
2586 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2589 # ifdef RE_ENABLE_I18N
2590 /* Got valid collation sequence values, add them as a new entry. */
2591 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2592 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2593 # endif /* RE_ENABLE_I18N */
2595 /* Build the table for single byte characters. */
2596 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2598 uint32_t ch_collseq
;
2600 if (MB_CUR_MAX == 1)
2603 ch_collseq
= collseqmb
[ch
];
2605 ch_collseq
= collseq_table_lookup (collseqwc
, __btowc (ch
));
2606 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2607 bitset_set (sbcset
, ch
);
2612 /* Local function for parse_bracket_exp used in _LIBC environement.
2613 Build the collating element which is represented by NAME.
2614 The result are written to MBCSET and SBCSET.
2615 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2616 pointer argument sinse we may update it. */
2618 static inline reg_errcode_t
2619 # ifdef RE_ENABLE_I18N
2620 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2621 re_charset_t
*mbcset
;
2622 int *coll_sym_alloc
;
2623 # else /* not RE_ENABLE_I18N */
2624 build_collating_symbol (sbcset
, name
)
2625 # endif /* not RE_ENABLE_I18N */
2626 re_bitset_ptr_t sbcset
;
2627 const unsigned char *name
;
2630 size_t name_len
= strlen ((const char *) name
);
2633 elem
= seek_collating_symbol_entry (name
, name_len
);
2634 if (symb_table
[2 * elem
] != 0)
2636 /* We found the entry. */
2637 idx
= symb_table
[2 * elem
+ 1];
2638 /* Skip the name of collating element name. */
2639 idx
+= 1 + extra
[idx
];
2641 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2643 /* No valid character, treat it as a normal
2645 bitset_set (sbcset
, name
[0]);
2649 return REG_ECOLLATE
;
2651 # ifdef RE_ENABLE_I18N
2652 /* Got valid collation sequence, add it as a new entry. */
2653 /* Check the space of the arrays. */
2654 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2656 /* Not enough, realloc it. */
2657 /* +1 in case of mbcset->ncoll_syms is 0. */
2658 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2659 /* Use realloc since mbcset->coll_syms is NULL
2661 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2663 if (BE (mbcset
->coll_syms
== NULL
, 0))
2666 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2667 # endif /* RE_ENABLE_I18N */
2672 if (BE (name_len
!= 1, 0))
2673 return REG_ECOLLATE
;
2676 bitset_set (sbcset
, name
[0]);
2683 re_token_t br_token
;
2684 re_bitset_ptr_t sbcset
;
2685 #ifdef RE_ENABLE_I18N
2686 re_charset_t
*mbcset
;
2687 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2688 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2689 #else /* not RE_ENABLE_I18N */
2691 #endif /* not RE_ENABLE_I18N */
2692 bin_tree_t
*work_tree
;
2693 int token_len
, new_idx
;
2695 collseqmb
= (const unsigned char *)
2696 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2697 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2703 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2704 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2705 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2706 _NL_COLLATE_SYMB_TABLEMB
);
2707 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2708 _NL_COLLATE_SYMB_EXTRAMB
);
2711 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2712 #ifdef RE_ENABLE_I18N
2713 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2714 #endif /* RE_ENABLE_I18N */
2715 #ifdef RE_ENABLE_I18N
2716 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2718 if (BE (sbcset
== NULL
, 0))
2719 #endif /* RE_ENABLE_I18N */
2725 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2726 if (BE (token
->type
== END_OF_RE
, 0))
2729 goto parse_bracket_exp_free_return
;
2731 if (token
->type
== OP_NON_MATCH_LIST
)
2733 #ifdef RE_ENABLE_I18N
2735 mbcset
->non_match
= 1;
2736 #else /* not RE_ENABLE_I18N */
2738 #endif /* not RE_ENABLE_I18N */
2739 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2740 bitset_set (sbcset
, '\0');
2741 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2742 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2743 if (BE (token
->type
== END_OF_RE
, 0))
2746 goto parse_bracket_exp_free_return
;
2748 #ifdef RE_ENABLE_I18N
2750 for (i
= 0; i
< SBC_MAX
; ++i
)
2751 if (__btowc (i
) == WEOF
)
2752 bitset_set (sbcset
, i
);
2753 #endif /* RE_ENABLE_I18N */
2756 /* We treat the first ']' as a normal character. */
2757 if (token
->type
== OP_CLOSE_BRACKET
)
2758 token
->type
= CHARACTER
;
2762 bracket_elem_t start_elem
, end_elem
;
2763 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2764 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2766 int token_len2
= 0, is_range_exp
= 0;
2769 start_elem
.opr
.name
= start_name_buf
;
2770 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2772 if (BE (ret
!= REG_NOERROR
, 0))
2775 goto parse_bracket_exp_free_return
;
2778 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2779 if (BE (token
->type
== END_OF_RE
, 0))
2782 goto parse_bracket_exp_free_return
;
2784 if (token
->type
== OP_CHARSET_RANGE
)
2786 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2787 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2788 if (BE (token
->type
== END_OF_RE
, 0))
2791 goto parse_bracket_exp_free_return
;
2793 if (token2
.type
== OP_CLOSE_BRACKET
)
2795 /* We treat the last '-' as a normal character. */
2796 re_string_skip_bytes (regexp
, -token_len
);
2797 token
->type
= CHARACTER
;
2803 if (is_range_exp
== 1)
2805 end_elem
.opr
.name
= end_name_buf
;
2806 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2808 if (BE (ret
!= REG_NOERROR
, 0))
2811 goto parse_bracket_exp_free_return
;
2814 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2815 if (BE (token
->type
== END_OF_RE
, 0))
2818 goto parse_bracket_exp_free_return
;
2820 *err
= build_range_exp (sbcset
,
2821 #ifdef RE_ENABLE_I18N
2822 mbcset
, &range_alloc
,
2823 #endif /* RE_ENABLE_I18N */
2824 &start_elem
, &end_elem
);
2825 if (BE (*err
!= REG_NOERROR
, 0))
2826 goto parse_bracket_exp_free_return
;
2830 switch (start_elem
.type
)
2833 bitset_set (sbcset
, start_elem
.opr
.ch
);
2835 #ifdef RE_ENABLE_I18N
2837 /* Check whether the array has enough space. */
2838 if (mbchar_alloc
== mbcset
->nmbchars
)
2840 /* Not enough, realloc it. */
2841 /* +1 in case of mbcset->nmbchars is 0. */
2842 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2843 /* Use realloc since array is NULL if *alloc == 0. */
2844 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2846 if (BE (mbcset
->mbchars
== NULL
, 0))
2847 goto parse_bracket_exp_espace
;
2849 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2851 #endif /* RE_ENABLE_I18N */
2853 *err
= build_equiv_class (sbcset
,
2854 #ifdef RE_ENABLE_I18N
2855 mbcset
, &equiv_class_alloc
,
2856 #endif /* RE_ENABLE_I18N */
2857 start_elem
.opr
.name
);
2858 if (BE (*err
!= REG_NOERROR
, 0))
2859 goto parse_bracket_exp_free_return
;
2862 *err
= build_collating_symbol (sbcset
,
2863 #ifdef RE_ENABLE_I18N
2864 mbcset
, &coll_sym_alloc
,
2865 #endif /* RE_ENABLE_I18N */
2866 start_elem
.opr
.name
);
2867 if (BE (*err
!= REG_NOERROR
, 0))
2868 goto parse_bracket_exp_free_return
;
2871 ret
= build_charclass (sbcset
,
2872 #ifdef RE_ENABLE_I18N
2873 mbcset
, &char_class_alloc
,
2874 #endif /* RE_ENABLE_I18N */
2875 start_elem
.opr
.name
, syntax
);
2876 if (BE (ret
!= REG_NOERROR
, 0))
2877 goto parse_bracket_exp_espace
;
2884 if (token
->type
== OP_CLOSE_BRACKET
)
2888 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2890 /* If it is non-matching list. */
2891 #ifdef RE_ENABLE_I18N
2892 if (mbcset
->non_match
)
2893 #else /* not RE_ENABLE_I18N */
2895 #endif /* not RE_ENABLE_I18N */
2896 bitset_not (sbcset
);
2898 /* Build a tree for simple bracket. */
2899 br_token
.type
= SIMPLE_BRACKET
;
2900 br_token
.opr
.sbcset
= sbcset
;
2901 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2902 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2903 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
2904 goto parse_bracket_exp_espace
;
2906 #ifdef RE_ENABLE_I18N
2907 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
2908 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
2909 || mbcset
->non_match
)))
2911 re_token_t alt_token
;
2912 bin_tree_t
*mbc_tree
;
2913 /* Build a tree for complex bracket. */
2914 br_token
.type
= COMPLEX_BRACKET
;
2915 br_token
.opr
.mbcset
= mbcset
;
2916 dfa
->has_mb_node
= 1;
2917 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2918 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2919 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
2920 goto parse_bracket_exp_espace
;
2921 /* Then join them by ALT node. */
2922 alt_token
.type
= OP_ALT
;
2923 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
2924 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
2925 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
2930 free_charset (mbcset
);
2933 #else /* not RE_ENABLE_I18N */
2935 #endif /* not RE_ENABLE_I18N */
2937 parse_bracket_exp_espace
:
2939 parse_bracket_exp_free_return
:
2941 #ifdef RE_ENABLE_I18N
2942 free_charset (mbcset
);
2943 #endif /* RE_ENABLE_I18N */
2947 /* Parse an element in the bracket expression. */
2949 static reg_errcode_t
2950 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
2951 bracket_elem_t
*elem
;
2952 re_string_t
*regexp
;
2956 reg_syntax_t syntax
;
2958 #ifdef RE_ENABLE_I18N
2960 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
2961 if (cur_char_size
> 1)
2963 elem
->type
= MB_CHAR
;
2964 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
2965 re_string_skip_bytes (regexp
, cur_char_size
);
2968 #endif /* RE_ENABLE_I18N */
2969 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2970 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
2971 || token
->type
== OP_OPEN_EQUIV_CLASS
)
2972 return parse_bracket_symbol (elem
, regexp
, token
);
2973 elem
->type
= SB_CHAR
;
2974 elem
->opr
.ch
= token
->opr
.c
;
2978 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
2979 such as [:<character_class>:], [.<collating_element>.], and
2980 [=<equivalent_class>=]. */
2982 static reg_errcode_t
2983 parse_bracket_symbol (elem
, regexp
, token
)
2984 bracket_elem_t
*elem
;
2985 re_string_t
*regexp
;
2988 unsigned char ch
, delim
= token
->opr
.c
;
2992 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
2994 if (token
->type
== OP_OPEN_CHAR_CLASS
)
2995 ch
= re_string_fetch_byte_case (regexp
);
2997 ch
= re_string_fetch_byte (regexp
);
2998 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3000 elem
->opr
.name
[i
] = ch
;
3002 re_string_skip_bytes (regexp
, 1);
3003 elem
->opr
.name
[i
] = '\0';
3004 switch (token
->type
)
3006 case OP_OPEN_COLL_ELEM
:
3007 elem
->type
= COLL_SYM
;
3009 case OP_OPEN_EQUIV_CLASS
:
3010 elem
->type
= EQUIV_CLASS
;
3012 case OP_OPEN_CHAR_CLASS
:
3013 elem
->type
= CHAR_CLASS
;
3021 /* Helper function for parse_bracket_exp.
3022 Build the equivalence class which is represented by NAME.
3023 The result are written to MBCSET and SBCSET.
3024 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3025 is a pointer argument sinse we may update it. */
3027 static reg_errcode_t
3028 #ifdef RE_ENABLE_I18N
3029 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3030 re_charset_t
*mbcset
;
3031 int *equiv_class_alloc
;
3032 #else /* not RE_ENABLE_I18N */
3033 build_equiv_class (sbcset
, name
)
3034 #endif /* not RE_ENABLE_I18N */
3035 re_bitset_ptr_t sbcset
;
3036 const unsigned char *name
;
3038 #if defined _LIBC && defined RE_ENABLE_I18N
3039 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3042 const int32_t *table
, *indirect
;
3043 const unsigned char *weights
, *extra
, *cp
;
3044 unsigned char char_buf
[2];
3048 /* This #include defines a local function! */
3049 # include <locale/weight.h>
3050 /* Calculate the index for equivalence class. */
3052 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3053 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3054 _NL_COLLATE_WEIGHTMB
);
3055 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3056 _NL_COLLATE_EXTRAMB
);
3057 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3058 _NL_COLLATE_INDIRECTMB
);
3059 idx1
= findidx (&cp
);
3060 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3061 /* This isn't a valid character. */
3062 return REG_ECOLLATE
;
3064 /* Build single byte matcing table for this equivalence class. */
3065 char_buf
[1] = (unsigned char) '\0';
3066 len
= weights
[idx1
];
3067 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3071 idx2
= findidx (&cp
);
3076 /* This isn't a valid character. */
3078 if (len
== weights
[idx2
])
3081 while (cnt
<= len
&&
3082 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3086 bitset_set (sbcset
, ch
);
3089 /* Check whether the array has enough space. */
3090 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3092 /* Not enough, realloc it. */
3093 /* +1 in case of mbcset->nequiv_classes is 0. */
3094 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3095 /* Use realloc since the array is NULL if *alloc == 0. */
3096 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3097 *equiv_class_alloc
);
3098 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3101 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3104 #endif /* _LIBC && RE_ENABLE_I18N */
3106 if (BE (strlen ((const char *) name
) != 1, 0))
3107 return REG_ECOLLATE
;
3108 bitset_set (sbcset
, *name
);
3113 /* Helper function for parse_bracket_exp.
3114 Build the character class which is represented by NAME.
3115 The result are written to MBCSET and SBCSET.
3116 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3117 is a pointer argument sinse we may update it. */
3119 static reg_errcode_t
3120 #ifdef RE_ENABLE_I18N
3121 build_charclass (sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3122 re_charset_t
*mbcset
;
3123 int *char_class_alloc
;
3124 #else /* not RE_ENABLE_I18N */
3125 build_charclass (sbcset
, class_name
, syntax
)
3126 #endif /* not RE_ENABLE_I18N */
3127 re_bitset_ptr_t sbcset
;
3128 const unsigned char *class_name
;
3129 reg_syntax_t syntax
;
3132 const char *name
= (const char *) class_name
;
3134 /* In case of REG_ICASE "upper" and "lower" match the both of
3135 upper and lower cases. */
3136 if ((syntax
& RE_ICASE
)
3137 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3140 #ifdef RE_ENABLE_I18N
3141 /* Check the space of the arrays. */
3142 if (*char_class_alloc
== mbcset
->nchar_classes
)
3144 /* Not enough, realloc it. */
3145 /* +1 in case of mbcset->nchar_classes is 0. */
3146 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3147 /* Use realloc since array is NULL if *alloc == 0. */
3148 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3150 if (BE (mbcset
->char_classes
== NULL
, 0))
3153 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3154 #endif /* RE_ENABLE_I18N */
3156 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3157 for (i = 0; i < SBC_MAX; ++i) \
3159 if (ctype_func (i)) \
3160 bitset_set (sbcset, i); \
3163 if (strcmp (name
, "alnum") == 0)
3164 BUILD_CHARCLASS_LOOP (isalnum
)
3165 else if (strcmp (name
, "cntrl") == 0)
3166 BUILD_CHARCLASS_LOOP (iscntrl
)
3167 else if (strcmp (name
, "lower") == 0)
3168 BUILD_CHARCLASS_LOOP (islower
)
3169 else if (strcmp (name
, "space") == 0)
3170 BUILD_CHARCLASS_LOOP (isspace
)
3171 else if (strcmp (name
, "alpha") == 0)
3172 BUILD_CHARCLASS_LOOP (isalpha
)
3173 else if (strcmp (name
, "digit") == 0)
3174 BUILD_CHARCLASS_LOOP (isdigit
)
3175 else if (strcmp (name
, "print") == 0)
3176 BUILD_CHARCLASS_LOOP (isprint
)
3177 else if (strcmp (name
, "upper") == 0)
3178 BUILD_CHARCLASS_LOOP (isupper
)
3179 else if (strcmp (name
, "blank") == 0)
3180 BUILD_CHARCLASS_LOOP (isblank
)
3181 else if (strcmp (name
, "graph") == 0)
3182 BUILD_CHARCLASS_LOOP (isgraph
)
3183 else if (strcmp (name
, "punct") == 0)
3184 BUILD_CHARCLASS_LOOP (ispunct
)
3185 else if (strcmp (name
, "xdigit") == 0)
3186 BUILD_CHARCLASS_LOOP (isxdigit
)
3194 build_word_op (dfa
, not, err
)
3199 re_bitset_ptr_t sbcset
;
3200 #ifdef RE_ENABLE_I18N
3201 re_charset_t
*mbcset
;
3203 #else /* not RE_ENABLE_I18N */
3205 #endif /* not RE_ENABLE_I18N */
3207 re_token_t br_token
;
3211 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3212 #ifdef RE_ENABLE_I18N
3213 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3214 #endif /* RE_ENABLE_I18N */
3216 #ifdef RE_ENABLE_I18N
3217 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3218 #else /* not RE_ENABLE_I18N */
3219 if (BE (sbcset
== NULL
, 0))
3220 #endif /* not RE_ENABLE_I18N */
3228 #ifdef RE_ENABLE_I18N
3231 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3232 bitset_set(cset->sbcset, '\0');
3234 mbcset
->non_match
= 1;
3236 for (i
= 0; i
< SBC_MAX
; ++i
)
3237 if (__btowc (i
) == WEOF
)
3238 bitset_set (sbcset
, i
);
3239 #else /* not RE_ENABLE_I18N */
3241 #endif /* not RE_ENABLE_I18N */
3244 /* We don't care the syntax in this case. */
3245 ret
= build_charclass (sbcset
,
3246 #ifdef RE_ENABLE_I18N
3248 #endif /* RE_ENABLE_I18N */
3249 (const unsigned char *) "alpha", 0);
3251 if (BE (ret
!= REG_NOERROR
, 0))
3254 #ifdef RE_ENABLE_I18N
3255 free_charset (mbcset
);
3256 #endif /* RE_ENABLE_I18N */
3260 /* \w match '_' also. */
3261 bitset_set (sbcset
, '_');
3263 /* If it is non-matching list. */
3264 #ifdef RE_ENABLE_I18N
3265 if (mbcset
->non_match
)
3266 #else /* not RE_ENABLE_I18N */
3268 #endif /* not RE_ENABLE_I18N */
3269 bitset_not (sbcset
);
3271 /* Build a tree for simple bracket. */
3272 br_token
.type
= SIMPLE_BRACKET
;
3273 br_token
.opr
.sbcset
= sbcset
;
3274 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3275 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3276 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3277 goto build_word_op_espace
;
3279 #ifdef RE_ENABLE_I18N
3282 re_token_t alt_token
;
3283 bin_tree_t
*mbc_tree
;
3284 /* Build a tree for complex bracket. */
3285 br_token
.type
= COMPLEX_BRACKET
;
3286 br_token
.opr
.mbcset
= mbcset
;
3287 dfa
->has_mb_node
= 1;
3288 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3289 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3290 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3291 goto build_word_op_espace
;
3292 /* Then join them by ALT node. */
3293 alt_token
.type
= OP_ALT
;
3294 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3295 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3296 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3301 free_charset (mbcset
);
3304 #else /* not RE_ENABLE_I18N */
3306 #endif /* not RE_ENABLE_I18N */
3308 build_word_op_espace
:
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset
);
3312 #endif /* RE_ENABLE_I18N */
3317 /* This is intended for the expressions like "a{1,3}".
3318 Fetch a number from `input', and return the number.
3319 Return -1, if the number field is empty like "{,1}".
3320 Return -2, If an error is occured. */
3323 fetch_number (input
, token
, syntax
)
3326 reg_syntax_t syntax
;
3332 *token
= fetch_token (input
, syntax
);
3334 if (BE (token
->type
== END_OF_RE
, 0))
3336 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3338 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3339 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3340 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3345 #ifdef RE_ENABLE_I18N
3347 free_charset (re_charset_t
*cset
)
3349 re_free (cset
->mbchars
);
3351 re_free (cset
->coll_syms
);
3352 re_free (cset
->equiv_classes
);
3353 re_free (cset
->range_starts
);
3354 re_free (cset
->range_ends
);
3356 re_free (cset
->char_classes
);
3359 #endif /* RE_ENABLE_I18N */
3361 /* Functions for binary tree operation. */
3363 /* Create a node of tree.
3364 Note: This function automatically free left and right if malloc fails. */
3367 create_tree (left
, right
, type
, index
)
3370 re_token_type_t type
;
3374 tree
= re_malloc (bin_tree_t
, 1);
3375 if (BE (tree
== NULL
, 0))
3377 free_bin_tree (left
);
3378 free_bin_tree (right
);
3381 tree
->parent
= NULL
;
3383 tree
->right
= right
;
3385 tree
->node_idx
= index
;
3388 re_node_set_init_empty (&tree
->eclosure
);
3391 left
->parent
= tree
;
3393 right
->parent
= tree
;
3397 /* Free the sub tree pointed by TREE. */
3400 free_bin_tree (tree
)
3405 /*re_node_set_free (&tree->eclosure);*/
3406 free_bin_tree (tree
->left
);
3407 free_bin_tree (tree
->right
);
3411 /* Duplicate the node SRC, and return new node. */
3414 duplicate_tree (src
, dfa
)
3415 const bin_tree_t
*src
;
3418 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3420 /* Since node indies must be according to Post-order of the tree,
3421 we must duplicate the left at first. */
3422 if (src
->left
!= NULL
)
3424 left
= duplicate_tree (src
->left
, dfa
);
3429 /* Secondaly, duplicate the right. */
3430 if (src
->right
!= NULL
)
3432 right
= duplicate_tree (src
->right
, dfa
);
3435 free_bin_tree (left
);
3440 /* At last, duplicate itself. */
3441 if (src
->type
== NON_TYPE
)
3443 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3444 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3445 if (BE (new_node_idx
== -1, 0))
3447 free_bin_tree (left
);
3448 free_bin_tree (right
);
3453 new_node_idx
= src
->type
;
3455 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3456 if (BE (new_tree
== NULL
, 0))
3458 free_bin_tree (left
);
3459 free_bin_tree (right
);