1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
45 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
46 const char *string1
, int length1
,
47 const char *string2
, int length2
,
48 int start
, int range
, struct re_registers
*regs
,
49 int stop
, int ret_len
)
51 static int re_search_stub (struct re_pattern_buffer
*bufp
,
52 const char *string
, int length
, int start
,
53 int range
, int stop
, struct re_registers
*regs
,
56 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
57 int nregs
, int regs_allocated
)
59 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
61 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
62 int *p_match_first
) internal_function
;
63 static int check_halt_state_context (const re_match_context_t
*mctx
,
64 const re_dfastate_t
*state
, int idx
)
66 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
67 regmatch_t
*prev_idx_match
, int cur_node
,
68 int cur_idx
, int nmatch
) internal_function
;
69 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
70 int str_idx
, int dest_node
, int nregs
,
72 re_node_set
*eps_via_nodes
)
74 static reg_errcode_t
set_regs (const regex_t
*preg
,
75 const re_match_context_t
*mctx
,
76 size_t nmatch
, regmatch_t
*pmatch
,
77 int fl_backtrack
) internal_function
;
78 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
82 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
83 re_sift_context_t
*sctx
,
84 int node_idx
, int str_idx
, int max_str_idx
)
86 #endif /* RE_ENABLE_I18N */
87 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
88 re_sift_context_t
*sctx
)
90 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
, int str_idx
,
92 re_node_set
*cur_dest
)
94 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
95 re_sift_context_t
*sctx
,
97 re_node_set
*dest_nodes
)
99 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
100 re_node_set
*dest_nodes
,
101 const re_node_set
*candidates
)
103 static int check_dst_limits (const re_match_context_t
*mctx
,
105 int dst_node
, int dst_idx
, int src_node
,
106 int src_idx
) internal_function
;
107 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
108 int boundaries
, int subexp_idx
,
109 int from_node
, int bkref_idx
)
111 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
112 int limit
, int subexp_idx
,
113 int node
, int str_idx
,
114 int bkref_idx
) internal_function
;
115 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
116 re_node_set
*dest_nodes
,
117 const re_node_set
*candidates
,
119 struct re_backref_cache_entry
*bkref_ents
,
120 int str_idx
) internal_function
;
121 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
122 re_sift_context_t
*sctx
,
123 int str_idx
, const re_node_set
*candidates
)
125 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
127 re_dfastate_t
**src
, int num
)
129 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
130 re_match_context_t
*mctx
) internal_function
;
131 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
132 re_match_context_t
*mctx
,
133 re_dfastate_t
*state
) internal_function
;
134 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
135 re_match_context_t
*mctx
,
136 re_dfastate_t
*next_state
)
138 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
139 re_node_set
*cur_nodes
,
140 int str_idx
) internal_function
;
142 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
143 re_match_context_t
*mctx
,
144 re_dfastate_t
*pstate
)
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
149 re_dfastate_t
*pstate
)
151 #endif /* RE_ENABLE_I18N */
152 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
153 const re_node_set
*nodes
)
155 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
156 int bkref_node
, int bkref_str_idx
)
158 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
159 const re_sub_match_top_t
*sub_top
,
160 re_sub_match_last_t
*sub_last
,
161 int bkref_node
, int bkref_str
)
163 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
164 int subexp_idx
, int type
) internal_function
;
165 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
166 state_array_t
*path
, int top_node
,
167 int top_str
, int last_node
, int last_str
,
168 int type
) internal_function
;
169 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
171 re_node_set
*cur_nodes
,
172 re_node_set
*next_nodes
)
174 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
175 re_node_set
*cur_nodes
,
176 int ex_subexp
, int type
)
178 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
179 re_node_set
*dst_nodes
,
180 int target
, int ex_subexp
,
181 int type
) internal_function
;
182 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
183 re_node_set
*cur_nodes
, int cur_str
,
184 int subexp_num
, int type
)
186 static int build_trtable (const re_dfa_t
*dfa
,
187 re_dfastate_t
*state
) internal_function
;
188 #ifdef RE_ENABLE_I18N
189 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
190 const re_string_t
*input
, int idx
)
193 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
197 #endif /* RE_ENABLE_I18N */
198 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
199 const re_dfastate_t
*state
,
200 re_node_set
*states_node
,
201 bitset_t
*states_ch
) internal_function
;
202 static int check_node_accept (const re_match_context_t
*mctx
,
203 const re_token_t
*node
, int idx
)
205 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
208 /* Entry point for POSIX code. */
210 /* regexec searches for a given pattern, specified by PREG, in the
213 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
214 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
215 least NMATCH elements, and we set them to the offsets of the
216 corresponding matched substrings.
218 EFLAGS specifies `execution flags' which affect matching: if
219 REG_NOTBOL is set, then ^ does not match at the beginning of the
220 string; if REG_NOTEOL is set, then $ does not match at the end.
222 We return 0 if we find a match and REG_NOMATCH if not. */
226 const regex_t
*__restrict preg
,
227 const char *__restrict string
,
235 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
238 if (eflags
& REG_STARTEND
)
240 start
= pmatch
[0].rm_so
;
241 length
= pmatch
[0].rm_eo
;
246 length
= strlen (string
);
249 __libc_lock_lock (dfa
->lock
);
251 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
252 length
, 0, NULL
, eflags
);
254 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
255 length
, nmatch
, pmatch
, eflags
);
256 __libc_lock_unlock (dfa
->lock
);
257 return err
!= REG_NOERROR
;
261 # include <shlib-compat.h>
262 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
264 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
265 __typeof__ (__regexec
) __compat_regexec
;
268 attribute_compat_text_section
269 __compat_regexec (const regex_t
*__restrict preg
,
270 const char *__restrict string
, size_t nmatch
,
271 regmatch_t pmatch
[], int eflags
)
273 return regexec (preg
, string
, nmatch
, pmatch
,
274 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
276 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
280 /* Entry points for GNU code. */
282 /* re_match, re_search, re_match_2, re_search_2
284 The former two functions operate on STRING with length LENGTH,
285 while the later two operate on concatenation of STRING1 and STRING2
286 with lengths LENGTH1 and LENGTH2, respectively.
288 re_match() matches the compiled pattern in BUFP against the string,
289 starting at index START.
291 re_search() first tries matching at index START, then it tries to match
292 starting from index START + 1, and so on. The last start position tried
293 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
296 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
297 the first STOP characters of the concatenation of the strings should be
300 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
301 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
302 computed relative to the concatenation, not relative to the individual
305 On success, re_match* functions return the length of the match, re_search*
306 return the position of the start of the match. Return value -1 means no
307 match was found and -2 indicates an internal error. */
310 re_match (struct re_pattern_buffer
*bufp
,
314 struct re_registers
*regs
)
316 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
319 weak_alias (__re_match
, re_match
)
323 re_search (struct re_pattern_buffer
*bufp
,
325 int length
, int start
, int range
,
326 struct re_registers
*regs
)
328 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
331 weak_alias (__re_search
, re_search
)
335 re_match_2 (struct re_pattern_buffer
*bufp
,
336 const char *string1
, int length1
,
337 const char *string2
, int length2
, int start
,
338 struct re_registers
*regs
, int stop
)
340 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
341 start
, 0, regs
, stop
, 1);
344 weak_alias (__re_match_2
, re_match_2
)
348 re_search_2 (struct re_pattern_buffer
*bufp
,
349 const char *string1
, int length1
,
350 const char *string2
, int length2
, int start
,
351 int range
, struct re_registers
*regs
, int stop
)
353 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
354 start
, range
, regs
, stop
, 0);
357 weak_alias (__re_search_2
, re_search_2
)
360 static int internal_function
361 re_search_2_stub (struct re_pattern_buffer
*bufp
,
362 const char *string1
, int length1
,
363 const char *string2
, int length2
, int start
,
364 int range
, struct re_registers
*regs
,
365 int stop
, int ret_len
)
369 int len
= length1
+ length2
;
372 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
375 /* Concatenate the strings. */
379 char *s
= re_malloc (char, len
);
381 if (BE (s
== NULL
, 0))
383 memcpy (s
, string1
, length1
);
384 memcpy (s
+ length1
, string2
, length2
);
393 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
395 re_free ((char *) str
);
399 /* The parameters have the same meaning as those of re_search.
400 Additional parameters:
401 If RET_LEN is nonzero the length of the match is returned (re_match style);
402 otherwise the position of the match is returned. */
404 static int internal_function
405 re_search_stub (struct re_pattern_buffer
*bufp
,
406 const char *string
, int length
, int start
,
408 struct re_registers
*regs
, int ret_len
)
410 reg_errcode_t result
;
415 /* Check for out-of-range. */
416 if (BE (start
< 0 || start
> length
, 0))
418 if (BE (start
+ range
> length
, 0))
419 range
= length
- start
;
420 else if (BE (start
+ range
< 0, 0))
423 __libc_lock_lock (dfa
->lock
);
425 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
426 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
428 /* Compile fastmap if we haven't yet. */
429 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
430 re_compile_fastmap (bufp
);
432 if (BE (bufp
->no_sub
, 0))
435 /* We need at least 1 register. */
438 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
439 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
441 nregs
= regs
->num_regs
;
442 if (BE (nregs
< 1, 0))
444 /* Nothing can be copied to regs. */
450 nregs
= bufp
->re_nsub
+ 1;
451 pmatch
= re_malloc (regmatch_t
, nregs
);
452 if (BE (pmatch
== NULL
, 0))
458 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
459 nregs
, pmatch
, eflags
);
463 /* I hope we needn't fill ther regs with -1's when no match was found. */
464 if (result
!= REG_NOERROR
)
466 else if (regs
!= NULL
)
468 /* If caller wants register contents data back, copy them. */
469 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
470 bufp
->regs_allocated
);
471 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
475 if (BE (rval
== 0, 1))
479 assert (pmatch
[0].rm_so
== start
);
480 rval
= pmatch
[0].rm_eo
- start
;
483 rval
= pmatch
[0].rm_so
;
487 __libc_lock_unlock (dfa
->lock
);
491 static unsigned internal_function
492 re_copy_regs (struct re_registers
*regs
,
494 int nregs
, int regs_allocated
)
496 int rval
= REGS_REALLOCATE
;
498 int need_regs
= nregs
+ 1;
499 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
502 /* Have the register data arrays been allocated? */
503 if (regs_allocated
== REGS_UNALLOCATED
)
504 { /* No. So allocate them with malloc. */
505 regs
->start
= re_malloc (regoff_t
, need_regs
);
506 if (BE (regs
->start
== NULL
, 0))
507 return REGS_UNALLOCATED
;
508 regs
->end
= re_malloc (regoff_t
, need_regs
);
509 if (BE (regs
->end
== NULL
, 0))
511 re_free (regs
->start
);
512 return REGS_UNALLOCATED
;
514 regs
->num_regs
= need_regs
;
516 else if (regs_allocated
== REGS_REALLOCATE
)
517 { /* Yes. If we need more elements than were already
518 allocated, reallocate them. If we need fewer, just
520 if (BE (need_regs
> regs
->num_regs
, 0))
522 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
524 if (BE (new_start
== NULL
, 0))
525 return REGS_UNALLOCATED
;
526 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
527 if (BE (new_end
== NULL
, 0))
530 return REGS_UNALLOCATED
;
532 regs
->start
= new_start
;
534 regs
->num_regs
= need_regs
;
539 assert (regs_allocated
== REGS_FIXED
);
540 /* This function may not be called with REGS_FIXED and nregs too big. */
541 assert (regs
->num_regs
>= nregs
);
546 for (i
= 0; i
< nregs
; ++i
)
548 regs
->start
[i
] = pmatch
[i
].rm_so
;
549 regs
->end
[i
] = pmatch
[i
].rm_eo
;
551 for ( ; i
< regs
->num_regs
; ++i
)
552 regs
->start
[i
] = regs
->end
[i
] = -1;
557 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
558 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
559 this memory for recording register information. STARTS and ENDS
560 must be allocated using the malloc library routine, and must each
561 be at least NUM_REGS * sizeof (regoff_t) bytes long.
563 If NUM_REGS == 0, then subsequent matches should allocate their own
566 Unless this function is called, the first search or match using
567 PATTERN_BUFFER will allocate its own register data, without
568 freeing the old data. */
571 re_set_registers (struct re_pattern_buffer
*bufp
,
572 struct re_registers
*regs
,
579 bufp
->regs_allocated
= REGS_REALLOCATE
;
580 regs
->num_regs
= num_regs
;
581 regs
->start
= starts
;
586 bufp
->regs_allocated
= REGS_UNALLOCATED
;
588 regs
->start
= regs
->end
= (regoff_t
*) 0;
592 weak_alias (__re_set_registers
, re_set_registers
)
595 /* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
598 #if defined _REGEX_RE_COMP || defined _LIBC
606 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
608 #endif /* _REGEX_RE_COMP */
610 /* Internal entry point. */
612 /* Searches for a compiled pattern PREG in the string STRING, whose
613 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
614 mingings with regexec. START, and RANGE have the same meanings
616 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
617 otherwise return the error code.
618 Note: We assume front end functions already check ranges.
619 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
621 static reg_errcode_t internal_function
622 re_search_internal (const regex_t
*preg
,
624 int length
, int start
, int range
, int stop
,
625 size_t nmatch
, regmatch_t pmatch
[],
629 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
630 int left_lim
, right_lim
, incr
;
631 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
635 re_match_context_t mctx
= { .dfa
= dfa
};
637 re_match_context_t mctx
;
639 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
640 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
641 RE_TRANSLATE_TYPE t
= preg
->translate
;
643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
644 memset (&mctx
, '\0', sizeof (re_match_context_t
));
648 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
649 nmatch
-= extra_nmatch
;
651 /* Check if the DFA haven't been compiled. */
652 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
653 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
654 || dfa
->init_state_begbuf
== NULL
, 0))
658 /* We assume front-end functions already check them. */
659 assert (start
+ range
>= 0 && start
+ range
<= length
);
662 /* If initial states with non-begbuf contexts have no elements,
663 the regex must be anchored. If preg->newline_anchor is set,
664 we'll never use init_state_nl, so do not check it. */
665 if (dfa
->init_state
->nodes
.nelem
== 0
666 && dfa
->init_state_word
->nodes
.nelem
== 0
667 && (dfa
->init_state_nl
->nodes
.nelem
== 0
668 || !preg
->newline_anchor
))
670 if (start
!= 0 && start
+ range
!= 0)
675 /* We must check the longest matching, if nmatch > 0. */
676 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
678 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
679 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
680 if (BE (err
!= REG_NOERROR
, 0))
682 mctx
.input
.stop
= stop
;
683 mctx
.input
.raw_stop
= stop
;
684 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
686 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
687 if (BE (err
!= REG_NOERROR
, 0))
690 /* We will log all the DFA states through which the dfa pass,
691 if nmatch > 1, or this dfa has "multibyte node", which is a
692 back-reference or a node which can accept multibyte character or
693 multi character collating element. */
694 if (nmatch
> 1 || dfa
->has_mb_node
)
696 /* Avoid overflow. */
697 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
703 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
704 if (BE (mctx
.state_log
== NULL
, 0))
711 mctx
.state_log
= NULL
;
714 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
715 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
717 /* Check incrementally whether of not the input string match. */
718 incr
= (range
< 0) ? -1 : 1;
719 left_lim
= (range
< 0) ? start
+ range
: start
;
720 right_lim
= (range
< 0) ? start
: start
+ range
;
721 sb
= dfa
->mb_cur_max
== 1;
724 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
725 | (range
>= 0 ? 2 : 0)
726 | (t
!= NULL
? 1 : 0))
729 for (;; match_first
+= incr
)
732 if (match_first
< left_lim
|| right_lim
< match_first
)
735 /* Advance as rapidly as possible through the string, until we
736 find a plausible place to start matching. This may be done
737 with varying efficiency, so there are various possibilities:
738 only the most common of them are specialized, in order to
739 save on code size. We use a switch statement for speed. */
747 /* Fastmap with single-byte translation, match forward. */
748 while (BE (match_first
< right_lim
, 1)
749 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
751 goto forward_match_found_start_or_reached_end
;
754 /* Fastmap without translation, match forward. */
755 while (BE (match_first
< right_lim
, 1)
756 && !fastmap
[(unsigned char) string
[match_first
]])
759 forward_match_found_start_or_reached_end
:
760 if (BE (match_first
== right_lim
, 0))
762 ch
= match_first
>= length
763 ? 0 : (unsigned char) string
[match_first
];
764 if (!fastmap
[t
? t
[ch
] : ch
])
771 /* Fastmap without multi-byte translation, match backwards. */
772 while (match_first
>= left_lim
)
774 ch
= match_first
>= length
775 ? 0 : (unsigned char) string
[match_first
];
776 if (fastmap
[t
? t
[ch
] : ch
])
780 if (match_first
< left_lim
)
785 /* In this case, we can't determine easily the current byte,
786 since it might be a component byte of a multibyte
787 character. Then we use the constructed buffer instead. */
790 /* If MATCH_FIRST is out of the valid range, reconstruct the
792 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
793 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
795 err
= re_string_reconstruct (&mctx
.input
, match_first
,
797 if (BE (err
!= REG_NOERROR
, 0))
800 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
802 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
803 Note that MATCH_FIRST must not be smaller than 0. */
804 ch
= (match_first
>= length
805 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
809 if (match_first
< left_lim
|| match_first
> right_lim
)
818 /* Reconstruct the buffers so that the matcher can assume that
819 the matching starts from the beginning of the buffer. */
820 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
821 if (BE (err
!= REG_NOERROR
, 0))
824 #ifdef RE_ENABLE_I18N
825 /* Don't consider this char as a possible match start if it part,
826 yet isn't the head, of a multibyte character. */
827 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
831 /* It seems to be appropriate one, then use the matcher. */
832 /* We assume that the matching starts from 0. */
833 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
834 match_last
= check_matching (&mctx
, fl_longest_match
,
835 range
>= 0 ? &match_first
: NULL
);
836 if (match_last
!= -1)
838 if (BE (match_last
== -2, 0))
845 mctx
.match_last
= match_last
;
846 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
848 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
849 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
852 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
855 err
= prune_impossible_nodes (&mctx
);
856 if (err
== REG_NOERROR
)
858 if (BE (err
!= REG_NOMATCH
, 0))
863 break; /* We found a match. */
867 match_ctx_clean (&mctx
);
871 assert (match_last
!= -1);
872 assert (err
== REG_NOERROR
);
875 /* Set pmatch[] if we need. */
880 /* Initialize registers. */
881 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
882 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
884 /* Set the points where matching start/end. */
886 pmatch
[0].rm_eo
= mctx
.match_last
;
888 if (!preg
->no_sub
&& nmatch
> 1)
890 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
891 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
892 if (BE (err
!= REG_NOERROR
, 0))
896 /* At last, add the offset to the each registers, since we slided
897 the buffers so that we could assume that the matching starts
899 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
900 if (pmatch
[reg_idx
].rm_so
!= -1)
902 #ifdef RE_ENABLE_I18N
903 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
905 pmatch
[reg_idx
].rm_so
=
906 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
907 ? mctx
.input
.valid_raw_len
908 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
909 pmatch
[reg_idx
].rm_eo
=
910 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
911 ? mctx
.input
.valid_raw_len
912 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
915 assert (mctx
.input
.offsets_needed
== 0);
917 pmatch
[reg_idx
].rm_so
+= match_first
;
918 pmatch
[reg_idx
].rm_eo
+= match_first
;
920 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
922 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
923 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
927 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
928 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
930 pmatch
[reg_idx
+ 1].rm_so
931 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
932 pmatch
[reg_idx
+ 1].rm_eo
933 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
938 re_free (mctx
.state_log
);
940 match_ctx_free (&mctx
);
941 re_string_destruct (&mctx
.input
);
945 static reg_errcode_t internal_function
946 prune_impossible_nodes (re_match_context_t
*mctx
)
948 const re_dfa_t
*const dfa
= mctx
->dfa
;
949 int halt_node
, match_last
;
951 re_dfastate_t
**sifted_states
;
952 re_dfastate_t
**lim_states
= NULL
;
953 re_sift_context_t sctx
;
955 assert (mctx
->state_log
!= NULL
);
957 match_last
= mctx
->match_last
;
958 halt_node
= mctx
->last_node
;
960 /* Avoid overflow. */
961 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
964 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
965 if (BE (sifted_states
== NULL
, 0))
972 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
973 if (BE (lim_states
== NULL
, 0))
980 memset (lim_states
, '\0',
981 sizeof (re_dfastate_t
*) * (match_last
+ 1));
982 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
984 ret
= sift_states_backward (mctx
, &sctx
);
985 re_node_set_free (&sctx
.limits
);
986 if (BE (ret
!= REG_NOERROR
, 0))
988 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
998 } while (mctx
->state_log
[match_last
] == NULL
999 || !mctx
->state_log
[match_last
]->halt
);
1000 halt_node
= check_halt_state_context (mctx
,
1001 mctx
->state_log
[match_last
],
1004 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1006 re_free (lim_states
);
1008 if (BE (ret
!= REG_NOERROR
, 0))
1013 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1014 ret
= sift_states_backward (mctx
, &sctx
);
1015 re_node_set_free (&sctx
.limits
);
1016 if (BE (ret
!= REG_NOERROR
, 0))
1018 if (sifted_states
[0] == NULL
)
1024 re_free (mctx
->state_log
);
1025 mctx
->state_log
= sifted_states
;
1026 sifted_states
= NULL
;
1027 mctx
->last_node
= halt_node
;
1028 mctx
->match_last
= match_last
;
1031 re_free (sifted_states
);
1032 re_free (lim_states
);
1036 /* Acquire an initial state and return it.
1037 We must select appropriate initial state depending on the context,
1038 since initial states may have constraints like "\<", "^", etc.. */
1040 static inline re_dfastate_t
*
1041 __attribute ((always_inline
)) internal_function
1042 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1045 const re_dfa_t
*const dfa
= mctx
->dfa
;
1046 if (dfa
->init_state
->has_constraint
)
1048 unsigned int context
;
1049 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1050 if (IS_WORD_CONTEXT (context
))
1051 return dfa
->init_state_word
;
1052 else if (IS_ORDINARY_CONTEXT (context
))
1053 return dfa
->init_state
;
1054 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1055 return dfa
->init_state_begbuf
;
1056 else if (IS_NEWLINE_CONTEXT (context
))
1057 return dfa
->init_state_nl
;
1058 else if (IS_BEGBUF_CONTEXT (context
))
1060 /* It is relatively rare case, then calculate on demand. */
1061 return re_acquire_state_context (err
, dfa
,
1062 dfa
->init_state
->entrance_nodes
,
1066 /* Must not happen? */
1067 return dfa
->init_state
;
1070 return dfa
->init_state
;
1073 /* Check whether the regular expression match input string INPUT or not,
1074 and return the index where the matching end, return -1 if not match,
1075 or return -2 in case of an error.
1076 FL_LONGEST_MATCH means we want the POSIX longest matching.
1077 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1078 next place where we may want to try matching.
1079 Note that the matcher assume that the maching starts from the current
1080 index of the buffer. */
1084 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1087 const re_dfa_t
*const dfa
= mctx
->dfa
;
1090 int match_last
= -1;
1091 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1092 re_dfastate_t
*cur_state
;
1093 int at_init_state
= p_match_first
!= NULL
;
1094 int next_start_idx
= cur_str_idx
;
1097 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1098 /* An initial state must not be NULL (invalid). */
1099 if (BE (cur_state
== NULL
, 0))
1101 assert (err
== REG_ESPACE
);
1105 if (mctx
->state_log
!= NULL
)
1107 mctx
->state_log
[cur_str_idx
] = cur_state
;
1109 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1110 later. E.g. Processing back references. */
1111 if (BE (dfa
->nbackref
, 0))
1114 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1115 if (BE (err
!= REG_NOERROR
, 0))
1118 if (cur_state
->has_backref
)
1120 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1121 if (BE (err
!= REG_NOERROR
, 0))
1127 /* If the RE accepts NULL string. */
1128 if (BE (cur_state
->halt
, 0))
1130 if (!cur_state
->has_constraint
1131 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1133 if (!fl_longest_match
)
1137 match_last
= cur_str_idx
;
1143 while (!re_string_eoi (&mctx
->input
))
1145 re_dfastate_t
*old_state
= cur_state
;
1146 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1148 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1149 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1150 && mctx
->input
.valid_len
< mctx
->input
.len
))
1152 err
= extend_buffers (mctx
);
1153 if (BE (err
!= REG_NOERROR
, 0))
1155 assert (err
== REG_ESPACE
);
1160 cur_state
= transit_state (&err
, mctx
, cur_state
);
1161 if (mctx
->state_log
!= NULL
)
1162 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1164 if (cur_state
== NULL
)
1166 /* Reached the invalid state or an error. Try to recover a valid
1167 state using the state log, if available and if we have not
1168 already found a valid (even if not the longest) match. */
1169 if (BE (err
!= REG_NOERROR
, 0))
1172 if (mctx
->state_log
== NULL
1173 || (match
&& !fl_longest_match
)
1174 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1178 if (BE (at_init_state
, 0))
1180 if (old_state
== cur_state
)
1181 next_start_idx
= next_char_idx
;
1186 if (cur_state
->halt
)
1188 /* Reached a halt state.
1189 Check the halt state can satisfy the current context. */
1190 if (!cur_state
->has_constraint
1191 || check_halt_state_context (mctx
, cur_state
,
1192 re_string_cur_idx (&mctx
->input
)))
1194 /* We found an appropriate halt state. */
1195 match_last
= re_string_cur_idx (&mctx
->input
);
1198 /* We found a match, do not modify match_first below. */
1199 p_match_first
= NULL
;
1200 if (!fl_longest_match
)
1207 *p_match_first
+= next_start_idx
;
1212 /* Check NODE match the current context. */
1216 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1218 re_token_type_t type
= dfa
->nodes
[node
].type
;
1219 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1220 if (type
!= END_OF_RE
)
1224 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1229 /* Check the halt state STATE match the current context.
1230 Return 0 if not match, if the node, STATE has, is a halt node and
1231 match the context, return the node. */
1235 check_halt_state_context (const re_match_context_t
*mctx
,
1236 const re_dfastate_t
*state
, int idx
)
1239 unsigned int context
;
1241 assert (state
->halt
);
1243 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1244 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1245 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1246 return state
->nodes
.elems
[i
];
1250 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1251 corresponding to the DFA).
1252 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1257 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1258 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1259 struct re_fail_stack_t
*fs
)
1261 const re_dfa_t
*const dfa
= mctx
->dfa
;
1263 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1265 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1266 re_node_set
*edests
= &dfa
->edests
[node
];
1268 err
= re_node_set_insert (eps_via_nodes
, node
);
1269 if (BE (err
< 0, 0))
1271 /* Pick up a valid destination, or return -1 if none is found. */
1272 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1274 int candidate
= edests
->elems
[i
];
1275 if (!re_node_set_contains (cur_nodes
, candidate
))
1277 if (dest_node
== -1)
1278 dest_node
= candidate
;
1282 /* In order to avoid infinite loop like "(a*)*", return the second
1283 epsilon-transition if the first was already considered. */
1284 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1287 /* Otherwise, push the second epsilon-transition on the fail stack. */
1289 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1293 /* We know we are going to exit. */
1302 re_token_type_t type
= dfa
->nodes
[node
].type
;
1304 #ifdef RE_ENABLE_I18N
1305 if (dfa
->nodes
[node
].accept_mb
)
1306 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1308 #endif /* RE_ENABLE_I18N */
1309 if (type
== OP_BACK_REF
)
1311 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1312 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1315 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1319 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1320 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1329 err
= re_node_set_insert (eps_via_nodes
, node
);
1330 if (BE (err
< 0, 0))
1332 dest_node
= dfa
->edests
[node
].elems
[0];
1333 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1340 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1342 int dest_node
= dfa
->nexts
[node
];
1343 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1344 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1345 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1348 re_node_set_empty (eps_via_nodes
);
1355 static reg_errcode_t
1357 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1358 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1361 int num
= fs
->num
++;
1362 if (fs
->num
== fs
->alloc
)
1364 struct re_fail_stack_ent_t
*new_array
;
1365 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1367 if (new_array
== NULL
)
1370 fs
->stack
= new_array
;
1372 fs
->stack
[num
].idx
= str_idx
;
1373 fs
->stack
[num
].node
= dest_node
;
1374 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1375 if (fs
->stack
[num
].regs
== NULL
)
1377 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1378 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1384 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1385 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1387 int num
= --fs
->num
;
1389 *pidx
= fs
->stack
[num
].idx
;
1390 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1391 re_node_set_free (eps_via_nodes
);
1392 re_free (fs
->stack
[num
].regs
);
1393 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1394 return fs
->stack
[num
].node
;
1397 /* Set the positions where the subexpressions are starts/ends to registers
1399 Note: We assume that pmatch[0] is already set, and
1400 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1402 static reg_errcode_t
1404 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1405 regmatch_t
*pmatch
, int fl_backtrack
)
1407 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1409 re_node_set eps_via_nodes
;
1410 struct re_fail_stack_t
*fs
;
1411 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1412 regmatch_t
*prev_idx_match
;
1413 int prev_idx_match_malloced
= 0;
1416 assert (nmatch
> 1);
1417 assert (mctx
->state_log
!= NULL
);
1422 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1423 if (fs
->stack
== NULL
)
1429 cur_node
= dfa
->init_node
;
1430 re_node_set_init_empty (&eps_via_nodes
);
1433 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1434 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1438 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1439 if (prev_idx_match
== NULL
)
1441 free_fail_stack_return (fs
);
1444 prev_idx_match_malloced
= 1;
1446 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1448 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1450 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1452 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1457 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1458 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1460 if (reg_idx
== nmatch
)
1462 re_node_set_free (&eps_via_nodes
);
1463 if (prev_idx_match_malloced
)
1464 re_free (prev_idx_match
);
1465 return free_fail_stack_return (fs
);
1467 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1472 re_node_set_free (&eps_via_nodes
);
1473 if (prev_idx_match_malloced
)
1474 re_free (prev_idx_match
);
1479 /* Proceed to next node. */
1480 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1481 &eps_via_nodes
, fs
);
1483 if (BE (cur_node
< 0, 0))
1485 if (BE (cur_node
== -2, 0))
1487 re_node_set_free (&eps_via_nodes
);
1488 if (prev_idx_match_malloced
)
1489 re_free (prev_idx_match
);
1490 free_fail_stack_return (fs
);
1494 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1498 re_node_set_free (&eps_via_nodes
);
1499 if (prev_idx_match_malloced
)
1500 re_free (prev_idx_match
);
1505 re_node_set_free (&eps_via_nodes
);
1506 if (prev_idx_match_malloced
)
1507 re_free (prev_idx_match
);
1508 return free_fail_stack_return (fs
);
1511 static reg_errcode_t
1513 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1518 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1520 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1521 re_free (fs
->stack
[fs_idx
].regs
);
1523 re_free (fs
->stack
);
1530 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1531 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1533 int type
= dfa
->nodes
[cur_node
].type
;
1534 if (type
== OP_OPEN_SUBEXP
)
1536 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1538 /* We are at the first node of this sub expression. */
1539 if (reg_num
< nmatch
)
1541 pmatch
[reg_num
].rm_so
= cur_idx
;
1542 pmatch
[reg_num
].rm_eo
= -1;
1545 else if (type
== OP_CLOSE_SUBEXP
)
1547 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1548 if (reg_num
< nmatch
)
1550 /* We are at the last node of this sub expression. */
1551 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1553 pmatch
[reg_num
].rm_eo
= cur_idx
;
1554 /* This is a non-empty match or we are not inside an optional
1555 subexpression. Accept this right away. */
1556 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1560 if (dfa
->nodes
[cur_node
].opt_subexp
1561 && prev_idx_match
[reg_num
].rm_so
!= -1)
1562 /* We transited through an empty match for an optional
1563 subexpression, like (a?)*, and this is not the subexp's
1564 first match. Copy back the old content of the registers
1565 so that matches of an inner subexpression are undone as
1566 well, like in ((a?))*. */
1567 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1569 /* We completed a subexpression, but it may be part of
1570 an optional one, so do not update PREV_IDX_MATCH. */
1571 pmatch
[reg_num
].rm_eo
= cur_idx
;
1577 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1578 and sift the nodes in each states according to the following rules.
1579 Updated state_log will be wrote to STATE_LOG.
1581 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1582 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1583 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1584 the LAST_NODE, we throw away the node `a'.
1585 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1586 string `s' and transit to `b':
1587 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1589 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1590 thrown away, we throw away the node `a'.
1591 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1592 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1594 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1595 we throw away the node `a'. */
1597 #define STATE_NODE_CONTAINS(state,node) \
1598 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1600 static reg_errcode_t
1602 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1606 int str_idx
= sctx
->last_str_idx
;
1607 re_node_set cur_dest
;
1610 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1613 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1614 transit to the last_node and the last_node itself. */
1615 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1616 if (BE (err
!= REG_NOERROR
, 0))
1618 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1619 if (BE (err
!= REG_NOERROR
, 0))
1622 /* Then check each states in the state_log. */
1625 /* Update counters. */
1626 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1627 if (null_cnt
> mctx
->max_mb_elem_len
)
1629 memset (sctx
->sifted_states
, '\0',
1630 sizeof (re_dfastate_t
*) * str_idx
);
1631 re_node_set_free (&cur_dest
);
1634 re_node_set_empty (&cur_dest
);
1637 if (mctx
->state_log
[str_idx
])
1639 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1640 if (BE (err
!= REG_NOERROR
, 0))
1644 /* Add all the nodes which satisfy the following conditions:
1645 - It can epsilon transit to a node in CUR_DEST.
1647 And update state_log. */
1648 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1649 if (BE (err
!= REG_NOERROR
, 0))
1654 re_node_set_free (&cur_dest
);
1658 static reg_errcode_t
1660 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1661 int str_idx
, re_node_set
*cur_dest
)
1663 const re_dfa_t
*const dfa
= mctx
->dfa
;
1664 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1667 /* Then build the next sifted state.
1668 We build the next sifted state on `cur_dest', and update
1669 `sifted_states[str_idx]' with `cur_dest'.
1671 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1672 `cur_src' points the node_set of the old `state_log[str_idx]'
1673 (with the epsilon nodes pre-filtered out). */
1674 for (i
= 0; i
< cur_src
->nelem
; i
++)
1676 int prev_node
= cur_src
->elems
[i
];
1681 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1682 assert (!IS_EPSILON_NODE (type
));
1684 #ifdef RE_ENABLE_I18N
1685 /* If the node may accept `multi byte'. */
1686 if (dfa
->nodes
[prev_node
].accept_mb
)
1687 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1688 str_idx
, sctx
->last_str_idx
);
1689 #endif /* RE_ENABLE_I18N */
1691 /* We don't check backreferences here.
1692 See update_cur_sifted_state(). */
1694 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1695 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1696 dfa
->nexts
[prev_node
]))
1702 if (sctx
->limits
.nelem
)
1704 int to_idx
= str_idx
+ naccepted
;
1705 if (check_dst_limits (mctx
, &sctx
->limits
,
1706 dfa
->nexts
[prev_node
], to_idx
,
1707 prev_node
, str_idx
))
1710 ret
= re_node_set_insert (cur_dest
, prev_node
);
1711 if (BE (ret
== -1, 0))
1718 /* Helper functions. */
1720 static reg_errcode_t
1722 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1724 int top
= mctx
->state_log_top
;
1726 if (next_state_log_idx
>= mctx
->input
.bufs_len
1727 || (next_state_log_idx
>= mctx
->input
.valid_len
1728 && mctx
->input
.valid_len
< mctx
->input
.len
))
1731 err
= extend_buffers (mctx
);
1732 if (BE (err
!= REG_NOERROR
, 0))
1736 if (top
< next_state_log_idx
)
1738 memset (mctx
->state_log
+ top
+ 1, '\0',
1739 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1740 mctx
->state_log_top
= next_state_log_idx
;
1745 static reg_errcode_t
1747 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1748 re_dfastate_t
**src
, int num
)
1752 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1754 if (dst
[st_idx
] == NULL
)
1755 dst
[st_idx
] = src
[st_idx
];
1756 else if (src
[st_idx
] != NULL
)
1758 re_node_set merged_set
;
1759 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1760 &src
[st_idx
]->nodes
);
1761 if (BE (err
!= REG_NOERROR
, 0))
1763 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1764 re_node_set_free (&merged_set
);
1765 if (BE (err
!= REG_NOERROR
, 0))
1772 static reg_errcode_t
1774 update_cur_sifted_state (const re_match_context_t
*mctx
,
1775 re_sift_context_t
*sctx
, int str_idx
,
1776 re_node_set
*dest_nodes
)
1778 const re_dfa_t
*const dfa
= mctx
->dfa
;
1779 reg_errcode_t err
= REG_NOERROR
;
1780 const re_node_set
*candidates
;
1781 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1782 : &mctx
->state_log
[str_idx
]->nodes
);
1784 if (dest_nodes
->nelem
== 0)
1785 sctx
->sifted_states
[str_idx
] = NULL
;
1790 /* At first, add the nodes which can epsilon transit to a node in
1792 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1793 if (BE (err
!= REG_NOERROR
, 0))
1796 /* Then, check the limitations in the current sift_context. */
1797 if (sctx
->limits
.nelem
)
1799 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1800 mctx
->bkref_ents
, str_idx
);
1801 if (BE (err
!= REG_NOERROR
, 0))
1806 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1807 if (BE (err
!= REG_NOERROR
, 0))
1811 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1813 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1814 if (BE (err
!= REG_NOERROR
, 0))
1820 static reg_errcode_t
1822 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1823 const re_node_set
*candidates
)
1825 reg_errcode_t err
= REG_NOERROR
;
1828 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1829 if (BE (err
!= REG_NOERROR
, 0))
1832 if (!state
->inveclosure
.alloc
)
1834 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1835 if (BE (err
!= REG_NOERROR
, 0))
1837 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1839 err
= re_node_set_merge (&state
->inveclosure
,
1840 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1841 if (BE (err
!= REG_NOERROR
, 0))
1845 return re_node_set_add_intersect (dest_nodes
, candidates
,
1846 &state
->inveclosure
);
1849 static reg_errcode_t
1851 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1852 const re_node_set
*candidates
)
1856 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1857 re_node_set except_nodes
;
1858 re_node_set_init_empty (&except_nodes
);
1859 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1861 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1862 if (cur_node
== node
)
1864 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1866 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1867 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1868 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1869 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1870 && re_node_set_contains (dest_nodes
, edst1
))
1872 && !re_node_set_contains (inv_eclosure
, edst2
)
1873 && re_node_set_contains (dest_nodes
, edst2
)))
1875 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1876 dfa
->inveclosures
+ cur_node
);
1877 if (BE (err
!= REG_NOERROR
, 0))
1879 re_node_set_free (&except_nodes
);
1885 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1887 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1888 if (!re_node_set_contains (&except_nodes
, cur_node
))
1890 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1891 re_node_set_remove_at (dest_nodes
, idx
);
1894 re_node_set_free (&except_nodes
);
1900 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1901 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1903 const re_dfa_t
*const dfa
= mctx
->dfa
;
1904 int lim_idx
, src_pos
, dst_pos
;
1906 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1907 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1908 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1911 struct re_backref_cache_entry
*ent
;
1912 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1913 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1915 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1916 subexp_idx
, dst_node
, dst_idx
,
1918 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1919 subexp_idx
, src_node
, src_idx
,
1923 <src> <dst> ( <subexp> )
1924 ( <subexp> ) <src> <dst>
1925 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1926 if (src_pos
== dst_pos
)
1927 continue; /* This is unrelated limitation. */
1936 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1937 int subexp_idx
, int from_node
, int bkref_idx
)
1939 const re_dfa_t
*const dfa
= mctx
->dfa
;
1940 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1943 /* Else, we are on the boundary: examine the nodes on the epsilon
1945 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1947 int node
= eclosures
->elems
[node_idx
];
1948 switch (dfa
->nodes
[node
].type
)
1951 if (bkref_idx
!= -1)
1953 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1958 if (ent
->node
!= node
)
1961 if (subexp_idx
< BITSET_WORD_BITS
1962 && !(ent
->eps_reachable_subexps_map
1963 & ((bitset_word_t
) 1 << subexp_idx
)))
1966 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1967 OP_CLOSE_SUBEXP cases below. But, if the
1968 destination node is the same node as the source
1969 node, don't recurse because it would cause an
1970 infinite loop: a regex that exhibits this behavior
1972 dst
= dfa
->edests
[node
].elems
[0];
1973 if (dst
== from_node
)
1977 else /* if (boundaries & 2) */
1982 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1984 if (cpos
== -1 /* && (boundaries & 1) */)
1986 if (cpos
== 0 && (boundaries
& 2))
1989 if (subexp_idx
< BITSET_WORD_BITS
)
1990 ent
->eps_reachable_subexps_map
1991 &= ~((bitset_word_t
) 1 << subexp_idx
);
1993 while (ent
++->more
);
1997 case OP_OPEN_SUBEXP
:
1998 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2002 case OP_CLOSE_SUBEXP
:
2003 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2012 return (boundaries
& 2) ? 1 : 0;
2017 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2018 int subexp_idx
, int from_node
, int str_idx
,
2021 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2024 /* If we are outside the range of the subexpression, return -1 or 1. */
2025 if (str_idx
< lim
->subexp_from
)
2028 if (lim
->subexp_to
< str_idx
)
2031 /* If we are within the subexpression, return 0. */
2032 boundaries
= (str_idx
== lim
->subexp_from
);
2033 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2034 if (boundaries
== 0)
2037 /* Else, examine epsilon closure. */
2038 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2039 from_node
, bkref_idx
);
2042 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2043 which are against limitations from DEST_NODES. */
2045 static reg_errcode_t
2047 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2048 const re_node_set
*candidates
, re_node_set
*limits
,
2049 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2052 int node_idx
, lim_idx
;
2054 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2057 struct re_backref_cache_entry
*ent
;
2058 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2060 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2061 continue; /* This is unrelated limitation. */
2063 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2064 if (ent
->subexp_to
== str_idx
)
2068 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2070 int node
= dest_nodes
->elems
[node_idx
];
2071 re_token_type_t type
= dfa
->nodes
[node
].type
;
2072 if (type
== OP_OPEN_SUBEXP
2073 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2075 else if (type
== OP_CLOSE_SUBEXP
2076 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2080 /* Check the limitation of the open subexpression. */
2081 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2084 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2086 if (BE (err
!= REG_NOERROR
, 0))
2090 /* Check the limitation of the close subexpression. */
2092 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2094 int node
= dest_nodes
->elems
[node_idx
];
2095 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2097 && !re_node_set_contains (dfa
->eclosures
+ node
,
2100 /* It is against this limitation.
2101 Remove it form the current sifted state. */
2102 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2104 if (BE (err
!= REG_NOERROR
, 0))
2110 else /* (ent->subexp_to != str_idx) */
2112 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2114 int node
= dest_nodes
->elems
[node_idx
];
2115 re_token_type_t type
= dfa
->nodes
[node
].type
;
2116 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2118 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2120 /* It is against this limitation.
2121 Remove it form the current sifted state. */
2122 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2124 if (BE (err
!= REG_NOERROR
, 0))
2133 static reg_errcode_t
2135 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2136 int str_idx
, const re_node_set
*candidates
)
2138 const re_dfa_t
*const dfa
= mctx
->dfa
;
2141 re_sift_context_t local_sctx
;
2142 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2144 if (first_idx
== -1)
2147 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2149 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2152 re_token_type_t type
;
2153 struct re_backref_cache_entry
*entry
;
2154 node
= candidates
->elems
[node_idx
];
2155 type
= dfa
->nodes
[node
].type
;
2156 /* Avoid infinite loop for the REs like "()\1+". */
2157 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2159 if (type
!= OP_BACK_REF
)
2162 entry
= mctx
->bkref_ents
+ first_idx
;
2163 enabled_idx
= first_idx
;
2170 re_dfastate_t
*cur_state
;
2172 if (entry
->node
!= node
)
2174 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2175 to_idx
= str_idx
+ subexp_len
;
2176 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2177 : dfa
->edests
[node
].elems
[0]);
2179 if (to_idx
> sctx
->last_str_idx
2180 || sctx
->sifted_states
[to_idx
] == NULL
2181 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2182 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2183 str_idx
, dst_node
, to_idx
))
2186 if (local_sctx
.sifted_states
== NULL
)
2189 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2190 if (BE (err
!= REG_NOERROR
, 0))
2193 local_sctx
.last_node
= node
;
2194 local_sctx
.last_str_idx
= str_idx
;
2195 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2196 if (BE (ret
< 0, 0))
2201 cur_state
= local_sctx
.sifted_states
[str_idx
];
2202 err
= sift_states_backward (mctx
, &local_sctx
);
2203 if (BE (err
!= REG_NOERROR
, 0))
2205 if (sctx
->limited_states
!= NULL
)
2207 err
= merge_state_array (dfa
, sctx
->limited_states
,
2208 local_sctx
.sifted_states
,
2210 if (BE (err
!= REG_NOERROR
, 0))
2213 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2214 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2216 /* mctx->bkref_ents may have changed, reload the pointer. */
2217 entry
= mctx
->bkref_ents
+ enabled_idx
;
2219 while (enabled_idx
++, entry
++->more
);
2223 if (local_sctx
.sifted_states
!= NULL
)
2225 re_node_set_free (&local_sctx
.limits
);
2232 #ifdef RE_ENABLE_I18N
2235 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2236 int node_idx
, int str_idx
, int max_str_idx
)
2238 const re_dfa_t
*const dfa
= mctx
->dfa
;
2240 /* Check the node can accept `multi byte'. */
2241 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2242 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2243 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2244 dfa
->nexts
[node_idx
]))
2245 /* The node can't accept the `multi byte', or the
2246 destination was already thrown away, then the node
2247 could't accept the current input `multi byte'. */
2249 /* Otherwise, it is sure that the node could accept
2250 `naccepted' bytes input. */
2253 #endif /* RE_ENABLE_I18N */
2256 /* Functions for state transition. */
2258 /* Return the next state to which the current state STATE will transit by
2259 accepting the current input byte, and update STATE_LOG if necessary.
2260 If STATE can accept a multibyte char/collating element/back reference
2261 update the destination of STATE_LOG. */
2263 static re_dfastate_t
*
2265 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2266 re_dfastate_t
*state
)
2268 re_dfastate_t
**trtable
;
2271 #ifdef RE_ENABLE_I18N
2272 /* If the current state can accept multibyte. */
2273 if (BE (state
->accept_mb
, 0))
2275 *err
= transit_state_mb (mctx
, state
);
2276 if (BE (*err
!= REG_NOERROR
, 0))
2279 #endif /* RE_ENABLE_I18N */
2281 /* Then decide the next state with the single byte. */
2284 /* don't use transition table */
2285 return transit_state_sb (err
, mctx
, state
);
2288 /* Use transition table */
2289 ch
= re_string_fetch_byte (&mctx
->input
);
2292 trtable
= state
->trtable
;
2293 if (BE (trtable
!= NULL
, 1))
2296 trtable
= state
->word_trtable
;
2297 if (BE (trtable
!= NULL
, 1))
2299 unsigned int context
;
2301 = re_string_context_at (&mctx
->input
,
2302 re_string_cur_idx (&mctx
->input
) - 1,
2304 if (IS_WORD_CONTEXT (context
))
2305 return trtable
[ch
+ SBC_MAX
];
2310 if (!build_trtable (mctx
->dfa
, state
))
2316 /* Retry, we now have a transition table. */
2320 /* Update the state_log if we need */
2323 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2324 re_dfastate_t
*next_state
)
2326 const re_dfa_t
*const dfa
= mctx
->dfa
;
2327 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2329 if (cur_idx
> mctx
->state_log_top
)
2331 mctx
->state_log
[cur_idx
] = next_state
;
2332 mctx
->state_log_top
= cur_idx
;
2334 else if (mctx
->state_log
[cur_idx
] == 0)
2336 mctx
->state_log
[cur_idx
] = next_state
;
2340 re_dfastate_t
*pstate
;
2341 unsigned int context
;
2342 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2343 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2344 the destination of a multibyte char/collating element/
2345 back reference. Then the next state is the union set of
2346 these destinations and the results of the transition table. */
2347 pstate
= mctx
->state_log
[cur_idx
];
2348 log_nodes
= pstate
->entrance_nodes
;
2349 if (next_state
!= NULL
)
2351 table_nodes
= next_state
->entrance_nodes
;
2352 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2354 if (BE (*err
!= REG_NOERROR
, 0))
2358 next_nodes
= *log_nodes
;
2359 /* Note: We already add the nodes of the initial state,
2360 then we don't need to add them here. */
2362 context
= re_string_context_at (&mctx
->input
,
2363 re_string_cur_idx (&mctx
->input
) - 1,
2365 next_state
= mctx
->state_log
[cur_idx
]
2366 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2367 /* We don't need to check errors here, since the return value of
2368 this function is next_state and ERR is already set. */
2370 if (table_nodes
!= NULL
)
2371 re_node_set_free (&next_nodes
);
2374 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2376 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2377 later. We must check them here, since the back references in the
2378 next state might use them. */
2379 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2381 if (BE (*err
!= REG_NOERROR
, 0))
2384 /* If the next state has back references. */
2385 if (next_state
->has_backref
)
2387 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2388 if (BE (*err
!= REG_NOERROR
, 0))
2390 next_state
= mctx
->state_log
[cur_idx
];
2397 /* Skip bytes in the input that correspond to part of a
2398 multi-byte match, then look in the log for a state
2399 from which to restart matching. */
2402 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2404 re_dfastate_t
*cur_state
;
2407 int max
= mctx
->state_log_top
;
2408 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2412 if (++cur_str_idx
> max
)
2414 re_string_skip_bytes (&mctx
->input
, 1);
2416 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2418 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2420 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2424 /* Helper functions for transit_state. */
2426 /* From the node set CUR_NODES, pick up the nodes whose types are
2427 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2428 expression. And register them to use them later for evaluating the
2429 correspoding back references. */
2431 static reg_errcode_t
2433 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2436 const re_dfa_t
*const dfa
= mctx
->dfa
;
2440 /* TODO: This isn't efficient.
2441 Because there might be more than one nodes whose types are
2442 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2445 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2447 int node
= cur_nodes
->elems
[node_idx
];
2448 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2449 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2450 && (dfa
->used_bkref_map
2451 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2453 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2454 if (BE (err
!= REG_NOERROR
, 0))
2462 /* Return the next state to which the current state STATE will transit by
2463 accepting the current input byte. */
2465 static re_dfastate_t
*
2466 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2467 re_dfastate_t
*state
)
2469 const re_dfa_t
*const dfa
= mctx
->dfa
;
2470 re_node_set next_nodes
;
2471 re_dfastate_t
*next_state
;
2472 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2473 unsigned int context
;
2475 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2476 if (BE (*err
!= REG_NOERROR
, 0))
2478 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2480 int cur_node
= state
->nodes
.elems
[node_cnt
];
2481 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2483 *err
= re_node_set_merge (&next_nodes
,
2484 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2485 if (BE (*err
!= REG_NOERROR
, 0))
2487 re_node_set_free (&next_nodes
);
2492 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2493 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2494 /* We don't need to check errors here, since the return value of
2495 this function is next_state and ERR is already set. */
2497 re_node_set_free (&next_nodes
);
2498 re_string_skip_bytes (&mctx
->input
, 1);
2503 #ifdef RE_ENABLE_I18N
2504 static reg_errcode_t
2506 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2508 const re_dfa_t
*const dfa
= mctx
->dfa
;
2512 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2514 re_node_set dest_nodes
, *new_nodes
;
2515 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2516 int naccepted
, dest_idx
;
2517 unsigned int context
;
2518 re_dfastate_t
*dest_state
;
2520 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2523 if (dfa
->nodes
[cur_node_idx
].constraint
)
2525 context
= re_string_context_at (&mctx
->input
,
2526 re_string_cur_idx (&mctx
->input
),
2528 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2533 /* How many bytes the node can accept? */
2534 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2535 re_string_cur_idx (&mctx
->input
));
2539 /* The node can accepts `naccepted' bytes. */
2540 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2541 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2542 : mctx
->max_mb_elem_len
);
2543 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2544 if (BE (err
!= REG_NOERROR
, 0))
2547 assert (dfa
->nexts
[cur_node_idx
] != -1);
2549 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2551 dest_state
= mctx
->state_log
[dest_idx
];
2552 if (dest_state
== NULL
)
2553 dest_nodes
= *new_nodes
;
2556 err
= re_node_set_init_union (&dest_nodes
,
2557 dest_state
->entrance_nodes
, new_nodes
);
2558 if (BE (err
!= REG_NOERROR
, 0))
2561 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2563 mctx
->state_log
[dest_idx
]
2564 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2565 if (dest_state
!= NULL
)
2566 re_node_set_free (&dest_nodes
);
2567 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2572 #endif /* RE_ENABLE_I18N */
2574 static reg_errcode_t
2576 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2578 const re_dfa_t
*const dfa
= mctx
->dfa
;
2581 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2583 for (i
= 0; i
< nodes
->nelem
; ++i
)
2585 int dest_str_idx
, prev_nelem
, bkc_idx
;
2586 int node_idx
= nodes
->elems
[i
];
2587 unsigned int context
;
2588 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2589 re_node_set
*new_dest_nodes
;
2591 /* Check whether `node' is a backreference or not. */
2592 if (node
->type
!= OP_BACK_REF
)
2595 if (node
->constraint
)
2597 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2599 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2603 /* `node' is a backreference.
2604 Check the substring which the substring matched. */
2605 bkc_idx
= mctx
->nbkref_ents
;
2606 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2607 if (BE (err
!= REG_NOERROR
, 0))
2610 /* And add the epsilon closures (which is `new_dest_nodes') of
2611 the backreference to appropriate state_log. */
2613 assert (dfa
->nexts
[node_idx
] != -1);
2615 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2618 re_dfastate_t
*dest_state
;
2619 struct re_backref_cache_entry
*bkref_ent
;
2620 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2621 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2623 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2624 new_dest_nodes
= (subexp_len
== 0
2625 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2626 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2627 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2628 - bkref_ent
->subexp_from
);
2629 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2631 dest_state
= mctx
->state_log
[dest_str_idx
];
2632 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2633 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2634 /* Add `new_dest_node' to state_log. */
2635 if (dest_state
== NULL
)
2637 mctx
->state_log
[dest_str_idx
]
2638 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2640 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2641 && err
!= REG_NOERROR
, 0))
2646 re_node_set dest_nodes
;
2647 err
= re_node_set_init_union (&dest_nodes
,
2648 dest_state
->entrance_nodes
,
2650 if (BE (err
!= REG_NOERROR
, 0))
2652 re_node_set_free (&dest_nodes
);
2655 mctx
->state_log
[dest_str_idx
]
2656 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2657 re_node_set_free (&dest_nodes
);
2658 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2659 && err
!= REG_NOERROR
, 0))
2662 /* We need to check recursively if the backreference can epsilon
2665 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2667 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2669 if (BE (err
!= REG_NOERROR
, 0))
2671 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2672 if (BE (err
!= REG_NOERROR
, 0))
2682 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2683 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2684 Note that we might collect inappropriate candidates here.
2685 However, the cost of checking them strictly here is too high, then we
2686 delay these checking for prune_impossible_nodes(). */
2688 static reg_errcode_t
2690 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2692 const re_dfa_t
*const dfa
= mctx
->dfa
;
2693 int subexp_num
, sub_top_idx
;
2694 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2695 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2696 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2697 if (cache_idx
!= -1)
2699 const struct re_backref_cache_entry
*entry
2700 = mctx
->bkref_ents
+ cache_idx
;
2702 if (entry
->node
== bkref_node
)
2703 return REG_NOERROR
; /* We already checked it. */
2704 while (entry
++->more
);
2707 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2709 /* For each sub expression */
2710 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2713 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2714 re_sub_match_last_t
*sub_last
;
2715 int sub_last_idx
, sl_str
, bkref_str_off
;
2717 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2718 continue; /* It isn't related. */
2720 sl_str
= sub_top
->str_idx
;
2721 bkref_str_off
= bkref_str_idx
;
2722 /* At first, check the last node of sub expressions we already
2724 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2727 sub_last
= sub_top
->lasts
[sub_last_idx
];
2728 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2729 /* The matched string by the sub expression match with the substring
2730 at the back reference? */
2731 if (sl_str_diff
> 0)
2733 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2735 /* Not enough chars for a successful match. */
2736 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2739 err
= clean_state_log_if_needed (mctx
,
2742 if (BE (err
!= REG_NOERROR
, 0))
2744 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2746 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2747 /* We don't need to search this sub expression any more. */
2750 bkref_str_off
+= sl_str_diff
;
2751 sl_str
+= sl_str_diff
;
2752 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2755 /* Reload buf, since the preceding call might have reallocated
2757 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2759 if (err
== REG_NOMATCH
)
2761 if (BE (err
!= REG_NOERROR
, 0))
2765 if (sub_last_idx
< sub_top
->nlasts
)
2767 if (sub_last_idx
> 0)
2769 /* Then, search for the other last nodes of the sub expression. */
2770 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2772 int cls_node
, sl_str_off
;
2773 const re_node_set
*nodes
;
2774 sl_str_off
= sl_str
- sub_top
->str_idx
;
2775 /* The matched string by the sub expression match with the substring
2776 at the back reference? */
2779 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2781 /* If we are at the end of the input, we cannot match. */
2782 if (bkref_str_off
>= mctx
->input
.len
)
2785 err
= extend_buffers (mctx
);
2786 if (BE (err
!= REG_NOERROR
, 0))
2789 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2791 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2792 break; /* We don't need to search this sub expression
2795 if (mctx
->state_log
[sl_str
] == NULL
)
2797 /* Does this state have a ')' of the sub expression? */
2798 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2799 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2803 if (sub_top
->path
== NULL
)
2805 sub_top
->path
= calloc (sizeof (state_array_t
),
2806 sl_str
- sub_top
->str_idx
+ 1);
2807 if (sub_top
->path
== NULL
)
2810 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2811 in the current context? */
2812 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2813 sub_top
->str_idx
, cls_node
, sl_str
,
2815 if (err
== REG_NOMATCH
)
2817 if (BE (err
!= REG_NOERROR
, 0))
2819 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2820 if (BE (sub_last
== NULL
, 0))
2822 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2824 if (err
== REG_NOMATCH
)
2831 /* Helper functions for get_subexp(). */
2833 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2834 If it can arrive, register the sub expression expressed with SUB_TOP
2837 static reg_errcode_t
2839 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2840 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2844 /* Can the subexpression arrive the back reference? */
2845 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2846 sub_last
->str_idx
, bkref_node
, bkref_str
,
2848 if (err
!= REG_NOERROR
)
2850 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2852 if (BE (err
!= REG_NOERROR
, 0))
2854 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2855 return clean_state_log_if_needed (mctx
, to_idx
);
2858 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2859 Search '(' if FL_OPEN, or search ')' otherwise.
2860 TODO: This function isn't efficient...
2861 Because there might be more than one nodes whose types are
2862 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2869 int subexp_idx
, int type
)
2872 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2874 int cls_node
= nodes
->elems
[cls_idx
];
2875 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2876 if (node
->type
== type
2877 && node
->opr
.idx
== subexp_idx
)
2883 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2884 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2886 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2888 static reg_errcode_t
2890 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2891 int top_str
, int last_node
, int last_str
, int type
)
2893 const re_dfa_t
*const dfa
= mctx
->dfa
;
2894 reg_errcode_t err
= REG_NOERROR
;
2895 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2896 re_dfastate_t
*cur_state
= NULL
;
2897 re_node_set
*cur_nodes
, next_nodes
;
2898 re_dfastate_t
**backup_state_log
;
2899 unsigned int context
;
2901 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2902 /* Extend the buffer if we need. */
2903 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2905 re_dfastate_t
**new_array
;
2906 int old_alloc
= path
->alloc
;
2907 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2908 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2909 if (BE (new_array
== NULL
, 0))
2911 path
->alloc
= old_alloc
;
2914 path
->array
= new_array
;
2915 memset (new_array
+ old_alloc
, '\0',
2916 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2919 str_idx
= path
->next_idx
? path
->next_idx
: top_str
;
2921 /* Temporary modify MCTX. */
2922 backup_state_log
= mctx
->state_log
;
2923 backup_cur_idx
= mctx
->input
.cur_idx
;
2924 mctx
->state_log
= path
->array
;
2925 mctx
->input
.cur_idx
= str_idx
;
2927 /* Setup initial node set. */
2928 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2929 if (str_idx
== top_str
)
2931 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2932 if (BE (err
!= REG_NOERROR
, 0))
2934 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2935 if (BE (err
!= REG_NOERROR
, 0))
2937 re_node_set_free (&next_nodes
);
2943 cur_state
= mctx
->state_log
[str_idx
];
2944 if (cur_state
&& cur_state
->has_backref
)
2946 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2947 if (BE (err
!= REG_NOERROR
, 0))
2951 re_node_set_init_empty (&next_nodes
);
2953 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2955 if (next_nodes
.nelem
)
2957 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2959 if (BE (err
!= REG_NOERROR
, 0))
2961 re_node_set_free (&next_nodes
);
2965 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2966 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2968 re_node_set_free (&next_nodes
);
2971 mctx
->state_log
[str_idx
] = cur_state
;
2974 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2976 re_node_set_empty (&next_nodes
);
2977 if (mctx
->state_log
[str_idx
+ 1])
2979 err
= re_node_set_merge (&next_nodes
,
2980 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2981 if (BE (err
!= REG_NOERROR
, 0))
2983 re_node_set_free (&next_nodes
);
2989 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2990 &cur_state
->non_eps_nodes
,
2992 if (BE (err
!= REG_NOERROR
, 0))
2994 re_node_set_free (&next_nodes
);
2999 if (next_nodes
.nelem
)
3001 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
3002 if (BE (err
!= REG_NOERROR
, 0))
3004 re_node_set_free (&next_nodes
);
3007 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3009 if (BE (err
!= REG_NOERROR
, 0))
3011 re_node_set_free (&next_nodes
);
3015 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3016 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3017 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3019 re_node_set_free (&next_nodes
);
3022 mctx
->state_log
[str_idx
] = cur_state
;
3023 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3025 re_node_set_free (&next_nodes
);
3026 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3027 : &mctx
->state_log
[last_str
]->nodes
);
3028 path
->next_idx
= str_idx
;
3031 mctx
->state_log
= backup_state_log
;
3032 mctx
->input
.cur_idx
= backup_cur_idx
;
3034 /* Then check the current node set has the node LAST_NODE. */
3035 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3041 /* Helper functions for check_arrival. */
3043 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3045 TODO: This function is similar to the functions transit_state*(),
3046 however this function has many additional works.
3047 Can't we unify them? */
3049 static reg_errcode_t
3051 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3052 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3054 const re_dfa_t
*const dfa
= mctx
->dfa
;
3057 #ifdef RE_ENABLE_I18N
3058 reg_errcode_t err
= REG_NOERROR
;
3060 re_node_set union_set
;
3061 re_node_set_init_empty (&union_set
);
3062 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3065 int cur_node
= cur_nodes
->elems
[cur_idx
];
3067 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3068 assert (!IS_EPSILON_NODE (type
));
3070 #ifdef RE_ENABLE_I18N
3071 /* If the node may accept `multi byte'. */
3072 if (dfa
->nodes
[cur_node
].accept_mb
)
3074 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3078 re_dfastate_t
*dest_state
;
3079 int next_node
= dfa
->nexts
[cur_node
];
3080 int next_idx
= str_idx
+ naccepted
;
3081 dest_state
= mctx
->state_log
[next_idx
];
3082 re_node_set_empty (&union_set
);
3085 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3086 if (BE (err
!= REG_NOERROR
, 0))
3088 re_node_set_free (&union_set
);
3092 result
= re_node_set_insert (&union_set
, next_node
);
3093 if (BE (result
< 0, 0))
3095 re_node_set_free (&union_set
);
3098 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3100 if (BE (mctx
->state_log
[next_idx
] == NULL
3101 && err
!= REG_NOERROR
, 0))
3103 re_node_set_free (&union_set
);
3108 #endif /* RE_ENABLE_I18N */
3110 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3112 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3113 if (BE (result
< 0, 0))
3115 re_node_set_free (&union_set
);
3120 re_node_set_free (&union_set
);
3124 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3125 CUR_NODES, however exclude the nodes which are:
3126 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3127 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3130 static reg_errcode_t
3132 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3133 int ex_subexp
, int type
)
3136 int idx
, outside_node
;
3137 re_node_set new_nodes
;
3139 assert (cur_nodes
->nelem
);
3141 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3142 if (BE (err
!= REG_NOERROR
, 0))
3144 /* Create a new node set NEW_NODES with the nodes which are epsilon
3145 closures of the node in CUR_NODES. */
3147 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3149 int cur_node
= cur_nodes
->elems
[idx
];
3150 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3151 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3152 if (outside_node
== -1)
3154 /* There are no problematic nodes, just merge them. */
3155 err
= re_node_set_merge (&new_nodes
, eclosure
);
3156 if (BE (err
!= REG_NOERROR
, 0))
3158 re_node_set_free (&new_nodes
);
3164 /* There are problematic nodes, re-calculate incrementally. */
3165 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3167 if (BE (err
!= REG_NOERROR
, 0))
3169 re_node_set_free (&new_nodes
);
3174 re_node_set_free (cur_nodes
);
3175 *cur_nodes
= new_nodes
;
3179 /* Helper function for check_arrival_expand_ecl.
3180 Check incrementally the epsilon closure of TARGET, and if it isn't
3181 problematic append it to DST_NODES. */
3183 static reg_errcode_t
3185 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3186 int target
, int ex_subexp
, int type
)
3189 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3193 if (dfa
->nodes
[cur_node
].type
== type
3194 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3196 if (type
== OP_CLOSE_SUBEXP
)
3198 err
= re_node_set_insert (dst_nodes
, cur_node
);
3199 if (BE (err
== -1, 0))
3204 err
= re_node_set_insert (dst_nodes
, cur_node
);
3205 if (BE (err
== -1, 0))
3207 if (dfa
->edests
[cur_node
].nelem
== 0)
3209 if (dfa
->edests
[cur_node
].nelem
== 2)
3211 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3212 dfa
->edests
[cur_node
].elems
[1],
3214 if (BE (err
!= REG_NOERROR
, 0))
3217 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3223 /* For all the back references in the current state, calculate the
3224 destination of the back references by the appropriate entry
3225 in MCTX->BKREF_ENTS. */
3227 static reg_errcode_t
3229 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3230 int cur_str
, int subexp_num
, int type
)
3232 const re_dfa_t
*const dfa
= mctx
->dfa
;
3234 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3235 struct re_backref_cache_entry
*ent
;
3237 if (cache_idx_start
== -1)
3241 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3244 int to_idx
, next_node
;
3246 /* Is this entry ENT is appropriate? */
3247 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3250 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3251 /* Calculate the destination of the back reference, and append it
3252 to MCTX->STATE_LOG. */
3253 if (to_idx
== cur_str
)
3255 /* The backreference did epsilon transit, we must re-check all the
3256 node in the current state. */
3257 re_node_set new_dests
;
3258 reg_errcode_t err2
, err3
;
3259 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3260 if (re_node_set_contains (cur_nodes
, next_node
))
3262 err
= re_node_set_init_1 (&new_dests
, next_node
);
3263 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3264 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3265 re_node_set_free (&new_dests
);
3266 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3267 || err3
!= REG_NOERROR
, 0))
3269 err
= (err
!= REG_NOERROR
? err
3270 : (err2
!= REG_NOERROR
? err2
: err3
));
3273 /* TODO: It is still inefficient... */
3278 re_node_set union_set
;
3279 next_node
= dfa
->nexts
[ent
->node
];
3280 if (mctx
->state_log
[to_idx
])
3283 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3286 err
= re_node_set_init_copy (&union_set
,
3287 &mctx
->state_log
[to_idx
]->nodes
);
3288 ret
= re_node_set_insert (&union_set
, next_node
);
3289 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3291 re_node_set_free (&union_set
);
3292 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3298 err
= re_node_set_init_1 (&union_set
, next_node
);
3299 if (BE (err
!= REG_NOERROR
, 0))
3302 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3303 re_node_set_free (&union_set
);
3304 if (BE (mctx
->state_log
[to_idx
] == NULL
3305 && err
!= REG_NOERROR
, 0))
3309 while (ent
++->more
);
3313 /* Build transition table for the state.
3314 Return 1 if succeeded, otherwise return NULL. */
3318 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3321 int i
, j
, ch
, need_word_trtable
= 0;
3322 bitset_word_t elem
, mask
;
3323 bool dests_node_malloced
= false;
3324 bool dest_states_malloced
= false;
3325 int ndests
; /* Number of the destination states from `state'. */
3326 re_dfastate_t
**trtable
;
3327 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3328 re_node_set follows
, *dests_node
;
3330 bitset_t acceptable
;
3334 re_node_set dests_node
[SBC_MAX
];
3335 bitset_t dests_ch
[SBC_MAX
];
3338 /* We build DFA states which corresponds to the destination nodes
3339 from `state'. `dests_node[i]' represents the nodes which i-th
3340 destination state contains, and `dests_ch[i]' represents the
3341 characters which i-th destination state accepts. */
3343 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3344 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3348 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3349 if (BE (dests_alloc
== NULL
, 0))
3351 dests_node_malloced
= true;
3353 dests_node
= dests_alloc
->dests_node
;
3354 dests_ch
= dests_alloc
->dests_ch
;
3356 /* Initialize transiton table. */
3357 state
->word_trtable
= state
->trtable
= NULL
;
3359 /* At first, group all nodes belonging to `state' into several
3361 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3362 if (BE (ndests
<= 0, 0))
3364 if (dests_node_malloced
)
3366 /* Return 0 in case of an error, 1 otherwise. */
3369 state
->trtable
= (re_dfastate_t
**)
3370 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3376 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3377 if (BE (err
!= REG_NOERROR
, 0))
3380 /* Avoid arithmetic overflow in size calculation. */
3381 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3382 / (3 * sizeof (re_dfastate_t
*)))
3388 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3389 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3390 dest_states
= (re_dfastate_t
**)
3391 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3395 dest_states
= (re_dfastate_t
**)
3396 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3397 if (BE (dest_states
== NULL
, 0))
3400 if (dest_states_malloced
)
3402 re_node_set_free (&follows
);
3403 for (i
= 0; i
< ndests
; ++i
)
3404 re_node_set_free (dests_node
+ i
);
3405 if (dests_node_malloced
)
3409 dest_states_malloced
= true;
3411 dest_states_word
= dest_states
+ ndests
;
3412 dest_states_nl
= dest_states_word
+ ndests
;
3413 bitset_empty (acceptable
);
3415 /* Then build the states for all destinations. */
3416 for (i
= 0; i
< ndests
; ++i
)
3419 re_node_set_empty (&follows
);
3420 /* Merge the follows of this destination states. */
3421 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3423 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3424 if (next_node
!= -1)
3426 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3427 if (BE (err
!= REG_NOERROR
, 0))
3431 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3432 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3434 /* If the new state has context constraint,
3435 build appropriate states for these contexts. */
3436 if (dest_states
[i
]->has_constraint
)
3438 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3440 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3443 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3444 need_word_trtable
= 1;
3446 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3448 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3453 dest_states_word
[i
] = dest_states
[i
];
3454 dest_states_nl
[i
] = dest_states
[i
];
3456 bitset_merge (acceptable
, dests_ch
[i
]);
3459 if (!BE (need_word_trtable
, 0))
3461 /* We don't care about whether the following character is a word
3462 character, or we are in a single-byte character set so we can
3463 discern by looking at the character code: allocate a
3464 256-entry transition table. */
3465 trtable
= state
->trtable
=
3466 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3467 if (BE (trtable
== NULL
, 0))
3470 /* For all characters ch...: */
3471 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3472 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3474 mask
<<= 1, elem
>>= 1, ++ch
)
3475 if (BE (elem
& 1, 0))
3477 /* There must be exactly one destination which accepts
3478 character ch. See group_nodes_into_DFAstates. */
3479 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3482 /* j-th destination accepts the word character ch. */
3483 if (dfa
->word_char
[i
] & mask
)
3484 trtable
[ch
] = dest_states_word
[j
];
3486 trtable
[ch
] = dest_states
[j
];
3491 /* We care about whether the following character is a word
3492 character, and we are in a multi-byte character set: discern
3493 by looking at the character code: build two 256-entry
3494 transition tables, one starting at trtable[0] and one
3495 starting at trtable[SBC_MAX]. */
3496 trtable
= state
->word_trtable
=
3497 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3498 if (BE (trtable
== NULL
, 0))
3501 /* For all characters ch...: */
3502 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3503 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3505 mask
<<= 1, elem
>>= 1, ++ch
)
3506 if (BE (elem
& 1, 0))
3508 /* There must be exactly one destination which accepts
3509 character ch. See group_nodes_into_DFAstates. */
3510 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3513 /* j-th destination accepts the word character ch. */
3514 trtable
[ch
] = dest_states
[j
];
3515 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3520 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3522 /* The current state accepts newline character. */
3523 for (j
= 0; j
< ndests
; ++j
)
3524 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3526 /* k-th destination accepts newline character. */
3527 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3528 if (need_word_trtable
)
3529 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3530 /* There must be only one destination which accepts
3531 newline. See group_nodes_into_DFAstates. */
3536 if (dest_states_malloced
)
3539 re_node_set_free (&follows
);
3540 for (i
= 0; i
< ndests
; ++i
)
3541 re_node_set_free (dests_node
+ i
);
3543 if (dests_node_malloced
)
3549 /* Group all nodes belonging to STATE into several destinations.
3550 Then for all destinations, set the nodes belonging to the destination
3551 to DESTS_NODE[i] and set the characters accepted by the destination
3552 to DEST_CH[i]. This function return the number of destinations. */
3556 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3557 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3562 int ndests
; /* Number of the destinations from `state'. */
3563 bitset_t accepts
; /* Characters a node can accept. */
3564 const re_node_set
*cur_nodes
= &state
->nodes
;
3565 bitset_empty (accepts
);
3568 /* For all the nodes belonging to `state', */
3569 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3571 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3572 re_token_type_t type
= node
->type
;
3573 unsigned int constraint
= node
->constraint
;
3575 /* Enumerate all single byte character this node can accept. */
3576 if (type
== CHARACTER
)
3577 bitset_set (accepts
, node
->opr
.c
);
3578 else if (type
== SIMPLE_BRACKET
)
3580 bitset_merge (accepts
, node
->opr
.sbcset
);
3582 else if (type
== OP_PERIOD
)
3584 #ifdef RE_ENABLE_I18N
3585 if (dfa
->mb_cur_max
> 1)
3586 bitset_merge (accepts
, dfa
->sb_char
);
3589 bitset_set_all (accepts
);
3590 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3591 bitset_clear (accepts
, '\n');
3592 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3593 bitset_clear (accepts
, '\0');
3595 #ifdef RE_ENABLE_I18N
3596 else if (type
== OP_UTF8_PERIOD
)
3598 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3599 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3600 bitset_clear (accepts
, '\n');
3601 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3602 bitset_clear (accepts
, '\0');
3608 /* Check the `accepts' and sift the characters which are not
3609 match it the context. */
3612 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3614 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3615 bitset_empty (accepts
);
3616 if (accepts_newline
)
3617 bitset_set (accepts
, NEWLINE_CHAR
);
3621 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3623 bitset_empty (accepts
);
3627 if (constraint
& NEXT_WORD_CONSTRAINT
)
3629 bitset_word_t any_set
= 0;
3630 if (type
== CHARACTER
&& !node
->word_char
)
3632 bitset_empty (accepts
);
3635 #ifdef RE_ENABLE_I18N
3636 if (dfa
->mb_cur_max
> 1)
3637 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3638 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3641 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3642 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3646 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3648 bitset_word_t any_set
= 0;
3649 if (type
== CHARACTER
&& node
->word_char
)
3651 bitset_empty (accepts
);
3654 #ifdef RE_ENABLE_I18N
3655 if (dfa
->mb_cur_max
> 1)
3656 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3657 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3660 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3661 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3667 /* Then divide `accepts' into DFA states, or create a new
3668 state. Above, we make sure that accepts is not empty. */
3669 for (j
= 0; j
< ndests
; ++j
)
3671 bitset_t intersec
; /* Intersection sets, see below. */
3673 /* Flags, see below. */
3674 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3676 /* Optimization, skip if this state doesn't accept the character. */
3677 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3680 /* Enumerate the intersection set of this state and `accepts'. */
3682 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3683 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3684 /* And skip if the intersection set is empty. */
3688 /* Then check if this state is a subset of `accepts'. */
3689 not_subset
= not_consumed
= 0;
3690 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3692 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3693 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3696 /* If this state isn't a subset of `accepts', create a
3697 new group state, which has the `remains'. */
3700 bitset_copy (dests_ch
[ndests
], remains
);
3701 bitset_copy (dests_ch
[j
], intersec
);
3702 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3703 if (BE (err
!= REG_NOERROR
, 0))
3708 /* Put the position in the current group. */
3709 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3710 if (BE (result
< 0, 0))
3713 /* If all characters are consumed, go to next node. */
3717 /* Some characters remain, create a new group. */
3720 bitset_copy (dests_ch
[ndests
], accepts
);
3721 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3722 if (BE (err
!= REG_NOERROR
, 0))
3725 bitset_empty (accepts
);
3730 for (j
= 0; j
< ndests
; ++j
)
3731 re_node_set_free (dests_node
+ j
);
3735 #ifdef RE_ENABLE_I18N
3736 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3737 Return the number of the bytes the node accepts.
3738 STR_IDX is the current index of the input string.
3740 This function handles the nodes which can accept one character, or
3741 one collating element like '.', '[a-z]', opposite to the other nodes
3742 can only accept one byte. */
3746 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3747 const re_string_t
*input
, int str_idx
)
3749 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3750 int char_len
, elem_len
;
3754 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3756 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3757 if (BE (c
< 0xc2, 1))
3760 if (str_idx
+ 2 > input
->len
)
3763 d
= re_string_byte_at (input
, str_idx
+ 1);
3765 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3769 if (c
== 0xe0 && d
< 0xa0)
3775 if (c
== 0xf0 && d
< 0x90)
3781 if (c
== 0xf8 && d
< 0x88)
3787 if (c
== 0xfc && d
< 0x84)
3793 if (str_idx
+ char_len
> input
->len
)
3796 for (i
= 1; i
< char_len
; ++i
)
3798 d
= re_string_byte_at (input
, str_idx
+ i
);
3799 if (d
< 0x80 || d
> 0xbf)
3805 char_len
= re_string_char_size_at (input
, str_idx
);
3806 if (node
->type
== OP_PERIOD
)
3810 /* FIXME: I don't think this if is needed, as both '\n'
3811 and '\0' are char_len == 1. */
3812 /* '.' accepts any one character except the following two cases. */
3813 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3814 re_string_byte_at (input
, str_idx
) == '\n') ||
3815 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3816 re_string_byte_at (input
, str_idx
) == '\0'))
3821 elem_len
= re_string_elem_size_at (input
, str_idx
);
3822 wc
= __btowc(*(input
->mbs
+str_idx
));
3823 if (((elem_len
<= 1 && char_len
<= 1) || char_len
== 0) && (wc
!= WEOF
&& wc
< SBC_MAX
))
3826 if (node
->type
== COMPLEX_BRACKET
)
3828 const re_charset_t
*cset
= node
->opr
.mbcset
;
3830 const unsigned char *pin
3831 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3836 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3837 ? re_string_wchar_at (input
, str_idx
) : 0);
3839 /* match with multibyte character? */
3840 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3841 if (wc
== cset
->mbchars
[i
])
3843 match_len
= char_len
;
3844 goto check_node_accept_bytes_match
;
3846 /* match with character_class? */
3847 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3849 wctype_t wt
= cset
->char_classes
[i
];
3850 if (__iswctype (wc
, wt
))
3852 match_len
= char_len
;
3853 goto check_node_accept_bytes_match
;
3858 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3861 unsigned int in_collseq
= 0;
3862 const int32_t *table
, *indirect
;
3863 const unsigned char *weights
, *extra
;
3864 const char *collseqwc
;
3865 /* This #include defines a local function! */
3866 # include <locale/weight.h>
3868 /* match with collating_symbol? */
3869 if (cset
->ncoll_syms
)
3870 extra
= (const unsigned char *)
3871 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3872 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3874 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3875 /* Compare the length of input collating element and
3876 the length of current collating element. */
3877 if (*coll_sym
!= elem_len
)
3879 /* Compare each bytes. */
3880 for (j
= 0; j
< *coll_sym
; j
++)
3881 if (pin
[j
] != coll_sym
[1 + j
])
3885 /* Match if every bytes is equal. */
3887 goto check_node_accept_bytes_match
;
3893 if (elem_len
<= char_len
)
3895 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3896 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3899 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3901 /* match with range expression? */
3902 for (i
= 0; i
< cset
->nranges
; ++i
)
3903 if (cset
->range_starts
[i
] <= in_collseq
3904 && in_collseq
<= cset
->range_ends
[i
])
3906 match_len
= elem_len
;
3907 goto check_node_accept_bytes_match
;
3910 /* match with equivalence_class? */
3911 if (cset
->nequiv_classes
)
3913 const unsigned char *cp
= pin
;
3914 table
= (const int32_t *)
3915 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3916 weights
= (const unsigned char *)
3917 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3918 extra
= (const unsigned char *)
3919 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3920 indirect
= (const int32_t *)
3921 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3922 int32_t idx
= findidx (&cp
);
3924 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3926 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3927 size_t weight_len
= weights
[idx
& 0xffffff];
3928 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3929 && (idx
>> 24) == (equiv_class_idx
>> 24))
3934 equiv_class_idx
&= 0xffffff;
3936 while (cnt
<= weight_len
3937 && (weights
[equiv_class_idx
+ 1 + cnt
]
3938 == weights
[idx
+ 1 + cnt
]))
3940 if (cnt
> weight_len
)
3942 match_len
= elem_len
;
3943 goto check_node_accept_bytes_match
;
3952 /* match with range expression? */
3954 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3956 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3959 for (i
= 0; i
< cset
->nranges
; ++i
)
3961 cmp_buf
[0] = cset
->range_starts
[i
];
3962 cmp_buf
[4] = cset
->range_ends
[i
];
3963 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3964 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3966 match_len
= char_len
;
3967 goto check_node_accept_bytes_match
;
3971 check_node_accept_bytes_match
:
3972 if (!cset
->non_match
)
3979 return (elem_len
> char_len
) ? elem_len
: char_len
;
3988 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3990 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3995 /* No valid character. Match it as a single byte character. */
3996 const unsigned char *collseq
= (const unsigned char *)
3997 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3998 return collseq
[mbs
[0]];
4005 const unsigned char *extra
= (const unsigned char *)
4006 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4007 int32_t extrasize
= (const unsigned char *)
4008 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4010 for (idx
= 0; idx
< extrasize
;)
4012 int mbs_cnt
, found
= 0;
4013 int32_t elem_mbs_len
;
4014 /* Skip the name of collating element name. */
4015 idx
= idx
+ extra
[idx
] + 1;
4016 elem_mbs_len
= extra
[idx
++];
4017 if (mbs_len
== elem_mbs_len
)
4019 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4020 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4022 if (mbs_cnt
== elem_mbs_len
)
4023 /* Found the entry. */
4026 /* Skip the byte sequence of the collating element. */
4027 idx
+= elem_mbs_len
;
4028 /* Adjust for the alignment. */
4029 idx
= (idx
+ 3) & ~3;
4030 /* Skip the collation sequence value. */
4031 idx
+= sizeof (uint32_t);
4032 /* Skip the wide char sequence of the collating element. */
4033 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
4034 /* If we found the entry, return the sequence value. */
4036 return *(uint32_t *) (extra
+ idx
);
4037 /* Skip the collation sequence value. */
4038 idx
+= sizeof (uint32_t);
4044 #endif /* RE_ENABLE_I18N */
4046 /* Check whether the node accepts the byte which is IDX-th
4047 byte of the INPUT. */
4051 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4055 ch
= re_string_byte_at (&mctx
->input
, idx
);
4059 if (node
->opr
.c
!= ch
)
4063 case SIMPLE_BRACKET
:
4064 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4068 #ifdef RE_ENABLE_I18N
4069 case OP_UTF8_PERIOD
:
4075 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4076 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4084 if (node
->constraint
)
4086 /* The node has constraints. Check whether the current context
4087 satisfies the constraints. */
4088 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4090 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4097 /* Extend the buffers, if the buffers have run out. */
4099 static reg_errcode_t
4101 extend_buffers (re_match_context_t
*mctx
)
4104 re_string_t
*pstr
= &mctx
->input
;
4106 /* Avoid overflow. */
4107 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4110 /* Double the lengthes of the buffers. */
4111 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4112 if (BE (ret
!= REG_NOERROR
, 0))
4115 if (mctx
->state_log
!= NULL
)
4117 /* And double the length of state_log. */
4118 /* XXX We have no indication of the size of this buffer. If this
4119 allocation fail we have no indication that the state_log array
4120 does not have the right size. */
4121 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4122 pstr
->bufs_len
+ 1);
4123 if (BE (new_array
== NULL
, 0))
4125 mctx
->state_log
= new_array
;
4128 /* Then reconstruct the buffers. */
4131 #ifdef RE_ENABLE_I18N
4132 if (pstr
->mb_cur_max
> 1)
4134 ret
= build_wcs_upper_buffer (pstr
);
4135 if (BE (ret
!= REG_NOERROR
, 0))
4139 #endif /* RE_ENABLE_I18N */
4140 build_upper_buffer (pstr
);
4144 #ifdef RE_ENABLE_I18N
4145 if (pstr
->mb_cur_max
> 1)
4146 build_wcs_buffer (pstr
);
4148 #endif /* RE_ENABLE_I18N */
4150 if (pstr
->trans
!= NULL
)
4151 re_string_translate_buffer (pstr
);
4158 /* Functions for matching context. */
4160 /* Initialize MCTX. */
4162 static reg_errcode_t
4164 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4166 mctx
->eflags
= eflags
;
4167 mctx
->match_last
= -1;
4170 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4171 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4172 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4175 /* Already zero-ed by the caller.
4177 mctx->bkref_ents = NULL;
4178 mctx->nbkref_ents = 0;
4179 mctx->nsub_tops = 0; */
4180 mctx
->abkref_ents
= n
;
4181 mctx
->max_mb_elem_len
= 1;
4182 mctx
->asub_tops
= n
;
4186 /* Clean the entries which depend on the current input in MCTX.
4187 This function must be invoked when the matcher changes the start index
4188 of the input, or changes the input string. */
4192 match_ctx_clean (re_match_context_t
*mctx
)
4195 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4198 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4199 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4201 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4202 re_free (last
->path
.array
);
4205 re_free (top
->lasts
);
4208 re_free (top
->path
->array
);
4209 re_free (top
->path
);
4214 mctx
->nsub_tops
= 0;
4215 mctx
->nbkref_ents
= 0;
4218 /* Free all the memory associated with MCTX. */
4222 match_ctx_free (re_match_context_t
*mctx
)
4224 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4225 match_ctx_clean (mctx
);
4226 re_free (mctx
->sub_tops
);
4227 re_free (mctx
->bkref_ents
);
4230 /* Add a new backreference entry to MCTX.
4231 Note that we assume that caller never call this function with duplicate
4232 entry, and call with STR_IDX which isn't smaller than any existing entry.
4235 static reg_errcode_t
4237 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4240 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4242 struct re_backref_cache_entry
* new_entry
;
4243 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4244 mctx
->abkref_ents
* 2);
4245 if (BE (new_entry
== NULL
, 0))
4247 re_free (mctx
->bkref_ents
);
4250 mctx
->bkref_ents
= new_entry
;
4251 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4252 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4253 mctx
->abkref_ents
*= 2;
4255 if (mctx
->nbkref_ents
> 0
4256 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4257 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4259 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4260 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4261 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4262 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4264 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4265 If bit N is clear, means that this entry won't epsilon-transition to
4266 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4267 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4270 A backreference does not epsilon-transition unless it is empty, so set
4271 to all zeros if FROM != TO. */
4272 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4273 = (from
== to
? ~0 : 0);
4275 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4276 if (mctx
->max_mb_elem_len
< to
- from
)
4277 mctx
->max_mb_elem_len
= to
- from
;
4281 /* Search for the first entry which has the same str_idx, or -1 if none is
4282 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4286 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4288 int left
, right
, mid
, last
;
4289 last
= right
= mctx
->nbkref_ents
;
4290 for (left
= 0; left
< right
;)
4292 mid
= (left
+ right
) / 2;
4293 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4298 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4304 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4307 static reg_errcode_t
4309 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4312 assert (mctx
->sub_tops
!= NULL
);
4313 assert (mctx
->asub_tops
> 0);
4315 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4317 int new_asub_tops
= mctx
->asub_tops
* 2;
4318 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4319 re_sub_match_top_t
*,
4321 if (BE (new_array
== NULL
, 0))
4323 mctx
->sub_tops
= new_array
;
4324 mctx
->asub_tops
= new_asub_tops
;
4326 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4327 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4329 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4330 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4334 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4335 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4337 static re_sub_match_last_t
*
4339 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4341 re_sub_match_last_t
*new_entry
;
4342 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4344 int new_alasts
= 2 * subtop
->alasts
+ 1;
4345 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4346 re_sub_match_last_t
*,
4348 if (BE (new_array
== NULL
, 0))
4350 subtop
->lasts
= new_array
;
4351 subtop
->alasts
= new_alasts
;
4353 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4354 if (BE (new_entry
!= NULL
, 1))
4356 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4357 new_entry
->node
= node
;
4358 new_entry
->str_idx
= str_idx
;
4366 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4367 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4369 sctx
->sifted_states
= sifted_sts
;
4370 sctx
->limited_states
= limited_sts
;
4371 sctx
->last_node
= last_node
;
4372 sctx
->last_str_idx
= last_str_idx
;
4373 re_node_set_init_empty (&sctx
->limits
);