1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
21 Idx n
) internal_function
;
22 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
23 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
24 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, Idx node
,
25 Idx str_idx
, Idx from
, Idx to
)
27 static Idx
search_cur_bkref_entry (const re_match_context_t
*mctx
, Idx str_idx
)
29 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, Idx node
,
30 Idx str_idx
) internal_function
;
31 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
32 Idx node
, Idx str_idx
)
34 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
35 re_dfastate_t
**limited_sts
, Idx last_node
,
38 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
39 const char *string
, Idx length
,
40 Idx start
, Idx last_start
, Idx stop
,
41 size_t nmatch
, regmatch_t pmatch
[],
42 int eflags
) internal_function
;
43 static regoff_t
re_search_2_stub (struct re_pattern_buffer
*bufp
,
44 const char *string1
, Idx length1
,
45 const char *string2
, Idx length2
,
46 Idx start
, regoff_t range
,
47 struct re_registers
*regs
,
48 Idx stop
, bool ret_len
) internal_function
;
49 static regoff_t
re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, Idx length
, Idx start
,
51 regoff_t range
, Idx stop
,
52 struct re_registers
*regs
,
53 bool ret_len
) internal_function
;
54 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
55 Idx nregs
, int regs_allocated
) internal_function
;
56 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
58 static Idx
check_matching (re_match_context_t
*mctx
, bool fl_longest_match
,
59 Idx
*p_match_first
) internal_function
;
60 static Idx
check_halt_state_context (const re_match_context_t
*mctx
,
61 const re_dfastate_t
*state
, Idx idx
)
63 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
64 regmatch_t
*prev_idx_match
, Idx cur_node
,
65 Idx cur_idx
, Idx nmatch
) internal_function
;
66 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
67 Idx str_idx
, Idx dest_node
, Idx nregs
,
69 re_node_set
*eps_via_nodes
)
71 static reg_errcode_t
set_regs (const regex_t
*preg
,
72 const re_match_context_t
*mctx
,
73 size_t nmatch
, regmatch_t
*pmatch
,
74 bool fl_backtrack
) internal_function
;
75 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
79 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
80 re_sift_context_t
*sctx
,
81 Idx node_idx
, Idx str_idx
, Idx max_str_idx
)
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
85 re_sift_context_t
*sctx
)
87 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
88 re_sift_context_t
*sctx
, Idx str_idx
,
89 re_node_set
*cur_dest
)
91 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
92 re_sift_context_t
*sctx
,
94 re_node_set
*dest_nodes
)
96 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
97 re_node_set
*dest_nodes
,
98 const re_node_set
*candidates
)
100 static bool check_dst_limits (const re_match_context_t
*mctx
,
101 const re_node_set
*limits
,
102 Idx dst_node
, Idx dst_idx
, Idx src_node
,
103 Idx src_idx
) internal_function
;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
105 int boundaries
, Idx subexp_idx
,
106 Idx from_node
, Idx bkref_idx
)
108 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
109 Idx limit
, Idx subexp_idx
,
110 Idx node
, Idx str_idx
,
111 Idx bkref_idx
) internal_function
;
112 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
113 re_node_set
*dest_nodes
,
114 const re_node_set
*candidates
,
116 struct re_backref_cache_entry
*bkref_ents
,
117 Idx str_idx
) internal_function
;
118 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
119 re_sift_context_t
*sctx
,
120 Idx str_idx
, const re_node_set
*candidates
)
122 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
124 re_dfastate_t
**src
, Idx num
)
126 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
127 re_match_context_t
*mctx
) internal_function
;
128 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
129 re_match_context_t
*mctx
,
130 re_dfastate_t
*state
) internal_function
;
131 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
132 re_match_context_t
*mctx
,
133 re_dfastate_t
*next_state
)
135 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
136 re_node_set
*cur_nodes
,
137 Idx str_idx
) internal_function
;
139 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
140 re_match_context_t
*mctx
,
141 re_dfastate_t
*pstate
)
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
146 re_dfastate_t
*pstate
)
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
150 const re_node_set
*nodes
)
152 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
153 Idx bkref_node
, Idx bkref_str_idx
)
155 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
156 const re_sub_match_top_t
*sub_top
,
157 re_sub_match_last_t
*sub_last
,
158 Idx bkref_node
, Idx bkref_str
)
160 static Idx
find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
161 Idx subexp_idx
, int type
) internal_function
;
162 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
163 state_array_t
*path
, Idx top_node
,
164 Idx top_str
, Idx last_node
, Idx last_str
,
165 int type
) internal_function
;
166 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
168 re_node_set
*cur_nodes
,
169 re_node_set
*next_nodes
)
171 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
172 re_node_set
*cur_nodes
,
173 Idx ex_subexp
, int type
)
175 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
176 re_node_set
*dst_nodes
,
177 Idx target
, Idx ex_subexp
,
178 int type
) internal_function
;
179 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
180 re_node_set
*cur_nodes
, Idx cur_str
,
181 Idx subexp_num
, int type
)
183 static bool build_trtable (const re_dfa_t
*dfa
,
184 re_dfastate_t
*state
) internal_function
;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t
*dfa
, Idx node_idx
,
187 const re_string_t
*input
, Idx idx
)
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
194 #endif /* RE_ENABLE_I18N */
195 static Idx
group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
196 const re_dfastate_t
*state
,
197 re_node_set
*states_node
,
198 bitset_t
*states_ch
) internal_function
;
199 static bool check_node_accept (const re_match_context_t
*mctx
,
200 const re_token_t
*node
, Idx idx
)
202 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
, int min_len
)
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies "execution flags" which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
223 const regex_t
*_Restrict_ preg
;
224 const char *_Restrict_ string
;
226 regmatch_t pmatch
[_Restrict_arr_
];
231 re_dfa_t
*dfa
= preg
->buffer
;
233 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
236 if (eflags
& REG_STARTEND
)
238 start
= pmatch
[0].rm_so
;
239 length
= pmatch
[0].rm_eo
;
244 length
= strlen (string
);
247 lock_lock (dfa
->lock
);
249 err
= re_search_internal (preg
, string
, length
, start
, length
,
250 length
, 0, NULL
, eflags
);
252 err
= re_search_internal (preg
, string
, length
, start
, length
,
253 length
, nmatch
, pmatch
, eflags
);
254 lock_unlock (dfa
->lock
);
255 return err
!= REG_NOERROR
;
259 # include <shlib-compat.h>
260 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
262 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
263 __typeof__ (__regexec
) __compat_regexec
;
266 attribute_compat_text_section
267 __compat_regexec (const regex_t
*_Restrict_ preg
,
268 const char *_Restrict_ string
, size_t nmatch
,
269 regmatch_t pmatch
[], int eflags
)
271 return regexec (preg
, string
, nmatch
, pmatch
,
272 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
274 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
278 /* Entry points for GNU code. */
280 /* re_match, re_search, re_match_2, re_search_2
282 The former two functions operate on STRING with length LENGTH,
283 while the later two operate on concatenation of STRING1 and STRING2
284 with lengths LENGTH1 and LENGTH2, respectively.
286 re_match() matches the compiled pattern in BUFP against the string,
287 starting at index START.
289 re_search() first tries matching at index START, then it tries to match
290 starting from index START + 1, and so on. The last start position tried
291 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
294 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
295 the first STOP characters of the concatenation of the strings should be
298 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
299 and all groups is stored in REGS. (For the "_2" variants, the offsets are
300 computed relative to the concatenation, not relative to the individual
303 On success, re_match* functions return the length of the match, re_search*
304 return the position of the start of the match. Return value -1 means no
305 match was found and -2 indicates an internal error. */
308 re_match (bufp
, string
, length
, start
, regs
)
309 struct re_pattern_buffer
*bufp
;
312 struct re_registers
*regs
;
314 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, true);
317 weak_alias (__re_match
, re_match
)
321 re_search (bufp
, string
, length
, start
, range
, regs
)
322 struct re_pattern_buffer
*bufp
;
326 struct re_registers
*regs
;
328 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
,
332 weak_alias (__re_search
, re_search
)
336 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
337 struct re_pattern_buffer
*bufp
;
338 const char *string1
, *string2
;
339 Idx length1
, length2
, start
, stop
;
340 struct re_registers
*regs
;
342 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
343 start
, 0, regs
, stop
, true);
346 weak_alias (__re_match_2
, re_match_2
)
350 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
351 struct re_pattern_buffer
*bufp
;
352 const char *string1
, *string2
;
353 Idx length1
, length2
, start
, stop
;
355 struct re_registers
*regs
;
357 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
358 start
, range
, regs
, stop
, false);
361 weak_alias (__re_search_2
, re_search_2
)
365 re_search_2_stub (struct re_pattern_buffer
*bufp
,
366 const char *string1
, Idx length1
,
367 const char *string2
, Idx length2
,
368 Idx start
, regoff_t range
, struct re_registers
*regs
,
369 Idx stop
, bool ret_len
)
373 Idx len
= length1
+ length2
;
376 if (BE (length1
< 0 || length2
< 0 || stop
< 0 || len
< length1
, 0))
379 /* Concatenate the strings. */
383 s
= re_malloc (char, len
);
385 if (BE (s
== NULL
, 0))
388 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
390 memcpy (s
, string1
, length1
);
391 memcpy (s
+ length1
, string2
, length2
);
400 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
406 /* The parameters have the same meaning as those of re_search.
407 Additional parameters:
408 If RET_LEN is true the length of the match is returned (re_match style);
409 otherwise the position of the match is returned. */
412 re_search_stub (struct re_pattern_buffer
*bufp
,
413 const char *string
, Idx length
,
414 Idx start
, regoff_t range
, Idx stop
, struct re_registers
*regs
,
417 reg_errcode_t result
;
422 re_dfa_t
*dfa
= bufp
->buffer
;
423 Idx last_start
= start
+ range
;
425 /* Check for out-of-range. */
426 if (BE (start
< 0 || start
> length
, 0))
428 if (BE (length
< last_start
|| (0 <= range
&& last_start
< start
), 0))
430 else if (BE (last_start
< 0 || (range
< 0 && start
<= last_start
), 0))
433 lock_lock (dfa
->lock
);
435 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
436 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
438 /* Compile fastmap if we haven't yet. */
439 if (start
< last_start
&& bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
440 re_compile_fastmap (bufp
);
442 if (BE (bufp
->no_sub
, 0))
445 /* We need at least 1 register. */
448 else if (BE (bufp
->regs_allocated
== REGS_FIXED
449 && regs
->num_regs
<= bufp
->re_nsub
, 0))
451 nregs
= regs
->num_regs
;
452 if (BE (nregs
< 1, 0))
454 /* Nothing can be copied to regs. */
460 nregs
= bufp
->re_nsub
+ 1;
461 pmatch
= re_malloc (regmatch_t
, nregs
);
462 if (BE (pmatch
== NULL
, 0))
468 result
= re_search_internal (bufp
, string
, length
, start
, last_start
, stop
,
469 nregs
, pmatch
, eflags
);
473 /* I hope we needn't fill their regs with -1's when no match was found. */
474 if (result
!= REG_NOERROR
)
475 rval
= result
== REG_NOMATCH
? -1 : -2;
476 else if (regs
!= NULL
)
478 /* If caller wants register contents data back, copy them. */
479 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
480 bufp
->regs_allocated
);
481 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
485 if (BE (rval
== 0, 1))
489 assert (pmatch
[0].rm_so
== start
);
490 rval
= pmatch
[0].rm_eo
- start
;
493 rval
= pmatch
[0].rm_so
;
497 lock_unlock (dfa
->lock
);
502 re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
, Idx nregs
,
505 int rval
= REGS_REALLOCATE
;
507 Idx need_regs
= nregs
+ 1;
508 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
511 /* Have the register data arrays been allocated? */
512 if (regs_allocated
== REGS_UNALLOCATED
)
513 { /* No. So allocate them with malloc. */
514 regs
->start
= re_malloc (regoff_t
, need_regs
);
515 if (BE (regs
->start
== NULL
, 0))
516 return REGS_UNALLOCATED
;
517 regs
->end
= re_malloc (regoff_t
, need_regs
);
518 if (BE (regs
->end
== NULL
, 0))
520 re_free (regs
->start
);
521 return REGS_UNALLOCATED
;
523 regs
->num_regs
= need_regs
;
525 else if (regs_allocated
== REGS_REALLOCATE
)
526 { /* Yes. If we need more elements than were already
527 allocated, reallocate them. If we need fewer, just
529 if (BE (need_regs
> regs
->num_regs
, 0))
531 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
533 if (BE (new_start
== NULL
, 0))
534 return REGS_UNALLOCATED
;
535 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
536 if (BE (new_end
== NULL
, 0))
539 return REGS_UNALLOCATED
;
541 regs
->start
= new_start
;
543 regs
->num_regs
= need_regs
;
548 assert (regs_allocated
== REGS_FIXED
);
549 /* This function may not be called with REGS_FIXED and nregs too big. */
550 assert (regs
->num_regs
>= nregs
);
555 for (i
= 0; i
< nregs
; ++i
)
557 regs
->start
[i
] = pmatch
[i
].rm_so
;
558 regs
->end
[i
] = pmatch
[i
].rm_eo
;
560 for ( ; i
< regs
->num_regs
; ++i
)
561 regs
->start
[i
] = regs
->end
[i
] = -1;
566 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
567 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
568 this memory for recording register information. STARTS and ENDS
569 must be allocated using the malloc library routine, and must each
570 be at least NUM_REGS * sizeof (regoff_t) bytes long.
572 If NUM_REGS == 0, then subsequent matches should allocate their own
575 Unless this function is called, the first search or match using
576 PATTERN_BUFFER will allocate its own register data, without
577 freeing the old data. */
580 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
581 struct re_pattern_buffer
*bufp
;
582 struct re_registers
*regs
;
583 __re_size_t num_regs
;
584 regoff_t
*starts
, *ends
;
588 bufp
->regs_allocated
= REGS_REALLOCATE
;
589 regs
->num_regs
= num_regs
;
590 regs
->start
= starts
;
595 bufp
->regs_allocated
= REGS_UNALLOCATED
;
597 regs
->start
= regs
->end
= NULL
;
601 weak_alias (__re_set_registers
, re_set_registers
)
604 /* Entry points compatible with 4.2 BSD regex library. We don't define
605 them unless specifically requested. */
607 #if defined _REGEX_RE_COMP || defined _LIBC
615 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
617 #endif /* _REGEX_RE_COMP */
619 /* Internal entry point. */
621 /* Searches for a compiled pattern PREG in the string STRING, whose
622 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
623 meaning as with regexec. LAST_START is START + RANGE, where
624 START and RANGE have the same meaning as with re_search.
625 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
626 otherwise return the error code.
627 Note: We assume front end functions already check ranges.
628 (0 <= LAST_START && LAST_START <= LENGTH) */
631 __attribute_warn_unused_result__
632 re_search_internal (const regex_t
*preg
,
633 const char *string
, Idx length
,
634 Idx start
, Idx last_start
, Idx stop
,
635 size_t nmatch
, regmatch_t pmatch
[],
639 const re_dfa_t
*dfa
= preg
->buffer
;
640 Idx left_lim
, right_lim
;
642 bool fl_longest_match
;
645 Idx match_last
= REG_MISSING
;
649 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
650 re_match_context_t mctx
= { .dfa
= dfa
};
652 re_match_context_t mctx
;
654 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
655 && start
!= last_start
&& !preg
->can_be_null
)
656 ? preg
->fastmap
: NULL
);
657 RE_TRANSLATE_TYPE t
= preg
->translate
;
659 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
660 memset (&mctx
, '\0', sizeof (re_match_context_t
));
664 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
665 nmatch
-= extra_nmatch
;
667 /* Check if the DFA haven't been compiled. */
668 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
669 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
670 || dfa
->init_state_begbuf
== NULL
, 0))
674 /* We assume front-end functions already check them. */
675 assert (0 <= last_start
&& last_start
<= length
);
678 /* If initial states with non-begbuf contexts have no elements,
679 the regex must be anchored. If preg->newline_anchor is set,
680 we'll never use init_state_nl, so do not check it. */
681 if (dfa
->init_state
->nodes
.nelem
== 0
682 && dfa
->init_state_word
->nodes
.nelem
== 0
683 && (dfa
->init_state_nl
->nodes
.nelem
== 0
684 || !preg
->newline_anchor
))
686 if (start
!= 0 && last_start
!= 0)
688 start
= last_start
= 0;
691 /* We must check the longest matching, if nmatch > 0. */
692 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
694 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
695 preg
->translate
, (preg
->syntax
& RE_ICASE
) != 0,
697 if (BE (err
!= REG_NOERROR
, 0))
699 mctx
.input
.stop
= stop
;
700 mctx
.input
.raw_stop
= stop
;
701 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
703 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
704 if (BE (err
!= REG_NOERROR
, 0))
707 /* We will log all the DFA states through which the dfa pass,
708 if nmatch > 1, or this dfa has "multibyte node", which is a
709 back-reference or a node which can accept multibyte character or
710 multi character collating element. */
711 if (nmatch
> 1 || dfa
->has_mb_node
)
713 /* Avoid overflow. */
714 if (BE ((MIN (IDX_MAX
, SIZE_MAX
/ sizeof (re_dfastate_t
*))
715 <= mctx
.input
.bufs_len
), 0))
721 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
722 if (BE (mctx
.state_log
== NULL
, 0))
729 mctx
.state_log
= NULL
;
732 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
733 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
735 /* Check incrementally whether the input string matches. */
736 incr
= (last_start
< start
) ? -1 : 1;
737 left_lim
= (last_start
< start
) ? last_start
: start
;
738 right_lim
= (last_start
< start
) ? start
: last_start
;
739 sb
= dfa
->mb_cur_max
== 1;
742 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
743 | (start
<= last_start
? 2 : 0)
744 | (t
!= NULL
? 1 : 0))
747 for (;; match_first
+= incr
)
750 if (match_first
< left_lim
|| right_lim
< match_first
)
753 /* Advance as rapidly as possible through the string, until we
754 find a plausible place to start matching. This may be done
755 with varying efficiency, so there are various possibilities:
756 only the most common of them are specialized, in order to
757 save on code size. We use a switch statement for speed. */
765 /* Fastmap with single-byte translation, match forward. */
766 while (BE (match_first
< right_lim
, 1)
767 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
769 goto forward_match_found_start_or_reached_end
;
772 /* Fastmap without translation, match forward. */
773 while (BE (match_first
< right_lim
, 1)
774 && !fastmap
[(unsigned char) string
[match_first
]])
777 forward_match_found_start_or_reached_end
:
778 if (BE (match_first
== right_lim
, 0))
780 ch
= match_first
>= length
781 ? 0 : (unsigned char) string
[match_first
];
782 if (!fastmap
[t
? t
[ch
] : ch
])
789 /* Fastmap without multi-byte translation, match backwards. */
790 while (match_first
>= left_lim
)
792 ch
= match_first
>= length
793 ? 0 : (unsigned char) string
[match_first
];
794 if (fastmap
[t
? t
[ch
] : ch
])
798 if (match_first
< left_lim
)
803 /* In this case, we can't determine easily the current byte,
804 since it might be a component byte of a multibyte
805 character. Then we use the constructed buffer instead. */
808 /* If MATCH_FIRST is out of the valid range, reconstruct the
810 __re_size_t offset
= match_first
- mctx
.input
.raw_mbs_idx
;
811 if (BE (offset
>= (__re_size_t
) mctx
.input
.valid_raw_len
, 0))
813 err
= re_string_reconstruct (&mctx
.input
, match_first
,
815 if (BE (err
!= REG_NOERROR
, 0))
818 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
820 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
821 Note that MATCH_FIRST must not be smaller than 0. */
822 ch
= (match_first
>= length
823 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
827 if (match_first
< left_lim
|| match_first
> right_lim
)
836 /* Reconstruct the buffers so that the matcher can assume that
837 the matching starts from the beginning of the buffer. */
838 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
839 if (BE (err
!= REG_NOERROR
, 0))
842 #ifdef RE_ENABLE_I18N
843 /* Don't consider this char as a possible match start if it part,
844 yet isn't the head, of a multibyte character. */
845 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
849 /* It seems to be appropriate one, then use the matcher. */
850 /* We assume that the matching starts from 0. */
851 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
852 match_last
= check_matching (&mctx
, fl_longest_match
,
853 start
<= last_start
? &match_first
: NULL
);
854 if (match_last
!= REG_MISSING
)
856 if (BE (match_last
== REG_ERROR
, 0))
863 mctx
.match_last
= match_last
;
864 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
866 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
867 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
870 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
873 err
= prune_impossible_nodes (&mctx
);
874 if (err
== REG_NOERROR
)
876 if (BE (err
!= REG_NOMATCH
, 0))
878 match_last
= REG_MISSING
;
881 break; /* We found a match. */
885 match_ctx_clean (&mctx
);
889 assert (match_last
!= REG_MISSING
);
890 assert (err
== REG_NOERROR
);
893 /* Set pmatch[] if we need. */
898 /* Initialize registers. */
899 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
900 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
902 /* Set the points where matching start/end. */
904 pmatch
[0].rm_eo
= mctx
.match_last
;
905 /* FIXME: This function should fail if mctx.match_last exceeds
906 the maximum possible regoff_t value. We need a new error
907 code REG_OVERFLOW. */
909 if (!preg
->no_sub
&& nmatch
> 1)
911 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
912 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
913 if (BE (err
!= REG_NOERROR
, 0))
917 /* At last, add the offset to each register, since we slid
918 the buffers so that we could assume that the matching starts
920 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
921 if (pmatch
[reg_idx
].rm_so
!= -1)
923 #ifdef RE_ENABLE_I18N
924 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
926 pmatch
[reg_idx
].rm_so
=
927 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
928 ? mctx
.input
.valid_raw_len
929 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
930 pmatch
[reg_idx
].rm_eo
=
931 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
932 ? mctx
.input
.valid_raw_len
933 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
936 assert (mctx
.input
.offsets_needed
== 0);
938 pmatch
[reg_idx
].rm_so
+= match_first
;
939 pmatch
[reg_idx
].rm_eo
+= match_first
;
941 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
943 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
944 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
948 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
949 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
951 pmatch
[reg_idx
+ 1].rm_so
952 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
953 pmatch
[reg_idx
+ 1].rm_eo
954 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
959 re_free (mctx
.state_log
);
961 match_ctx_free (&mctx
);
962 re_string_destruct (&mctx
.input
);
967 __attribute_warn_unused_result__
968 prune_impossible_nodes (re_match_context_t
*mctx
)
970 const re_dfa_t
*const dfa
= mctx
->dfa
;
971 Idx halt_node
, match_last
;
973 re_dfastate_t
**sifted_states
;
974 re_dfastate_t
**lim_states
= NULL
;
975 re_sift_context_t sctx
;
977 assert (mctx
->state_log
!= NULL
);
979 match_last
= mctx
->match_last
;
980 halt_node
= mctx
->last_node
;
982 /* Avoid overflow. */
983 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ sizeof (re_dfastate_t
*)) <= match_last
, 0))
986 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
987 if (BE (sifted_states
== NULL
, 0))
994 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
995 if (BE (lim_states
== NULL
, 0))
1002 memset (lim_states
, '\0',
1003 sizeof (re_dfastate_t
*) * (match_last
+ 1));
1004 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
1006 ret
= sift_states_backward (mctx
, &sctx
);
1007 re_node_set_free (&sctx
.limits
);
1008 if (BE (ret
!= REG_NOERROR
, 0))
1010 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
1015 if (! REG_VALID_INDEX (match_last
))
1020 } while (mctx
->state_log
[match_last
] == NULL
1021 || !mctx
->state_log
[match_last
]->halt
);
1022 halt_node
= check_halt_state_context (mctx
,
1023 mctx
->state_log
[match_last
],
1026 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1028 re_free (lim_states
);
1030 if (BE (ret
!= REG_NOERROR
, 0))
1035 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1036 ret
= sift_states_backward (mctx
, &sctx
);
1037 re_node_set_free (&sctx
.limits
);
1038 if (BE (ret
!= REG_NOERROR
, 0))
1040 if (sifted_states
[0] == NULL
)
1046 re_free (mctx
->state_log
);
1047 mctx
->state_log
= sifted_states
;
1048 sifted_states
= NULL
;
1049 mctx
->last_node
= halt_node
;
1050 mctx
->match_last
= match_last
;
1053 re_free (sifted_states
);
1054 re_free (lim_states
);
1058 /* Acquire an initial state and return it.
1059 We must select appropriate initial state depending on the context,
1060 since initial states may have constraints like "\<", "^", etc.. */
1062 static inline re_dfastate_t
*
1063 __attribute__ ((always_inline
)) internal_function
1064 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1067 const re_dfa_t
*const dfa
= mctx
->dfa
;
1068 if (dfa
->init_state
->has_constraint
)
1070 unsigned int context
;
1071 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1072 if (IS_WORD_CONTEXT (context
))
1073 return dfa
->init_state_word
;
1074 else if (IS_ORDINARY_CONTEXT (context
))
1075 return dfa
->init_state
;
1076 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1077 return dfa
->init_state_begbuf
;
1078 else if (IS_NEWLINE_CONTEXT (context
))
1079 return dfa
->init_state_nl
;
1080 else if (IS_BEGBUF_CONTEXT (context
))
1082 /* It is relatively rare case, then calculate on demand. */
1083 return re_acquire_state_context (err
, dfa
,
1084 dfa
->init_state
->entrance_nodes
,
1088 /* Must not happen? */
1089 return dfa
->init_state
;
1092 return dfa
->init_state
;
1095 /* Check whether the regular expression match input string INPUT or not,
1096 and return the index where the matching end. Return REG_MISSING if
1097 there is no match, and return REG_ERROR in case of an error.
1098 FL_LONGEST_MATCH means we want the POSIX longest matching.
1099 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1100 next place where we may want to try matching.
1101 Note that the matcher assumes that the matching starts from the current
1102 index of the buffer. */
1105 internal_function __attribute_warn_unused_result__
1106 check_matching (re_match_context_t
*mctx
, bool fl_longest_match
,
1109 const re_dfa_t
*const dfa
= mctx
->dfa
;
1112 Idx match_last
= REG_MISSING
;
1113 Idx cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1114 re_dfastate_t
*cur_state
;
1115 bool at_init_state
= p_match_first
!= NULL
;
1116 Idx next_start_idx
= cur_str_idx
;
1119 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1120 /* An initial state must not be NULL (invalid). */
1121 if (BE (cur_state
== NULL
, 0))
1123 assert (err
== REG_ESPACE
);
1127 if (mctx
->state_log
!= NULL
)
1129 mctx
->state_log
[cur_str_idx
] = cur_state
;
1131 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1132 later. E.g. Processing back references. */
1133 if (BE (dfa
->nbackref
, 0))
1135 at_init_state
= false;
1136 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1137 if (BE (err
!= REG_NOERROR
, 0))
1140 if (cur_state
->has_backref
)
1142 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1143 if (BE (err
!= REG_NOERROR
, 0))
1149 /* If the RE accepts NULL string. */
1150 if (BE (cur_state
->halt
, 0))
1152 if (!cur_state
->has_constraint
1153 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1155 if (!fl_longest_match
)
1159 match_last
= cur_str_idx
;
1165 while (!re_string_eoi (&mctx
->input
))
1167 re_dfastate_t
*old_state
= cur_state
;
1168 Idx next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1170 if ((BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1171 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1172 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1173 && mctx
->input
.valid_len
< mctx
->input
.len
))
1175 err
= extend_buffers (mctx
, next_char_idx
+ 1);
1176 if (BE (err
!= REG_NOERROR
, 0))
1178 assert (err
== REG_ESPACE
);
1183 cur_state
= transit_state (&err
, mctx
, cur_state
);
1184 if (mctx
->state_log
!= NULL
)
1185 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1187 if (cur_state
== NULL
)
1189 /* Reached the invalid state or an error. Try to recover a valid
1190 state using the state log, if available and if we have not
1191 already found a valid (even if not the longest) match. */
1192 if (BE (err
!= REG_NOERROR
, 0))
1195 if (mctx
->state_log
== NULL
1196 || (match
&& !fl_longest_match
)
1197 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1201 if (BE (at_init_state
, 0))
1203 if (old_state
== cur_state
)
1204 next_start_idx
= next_char_idx
;
1206 at_init_state
= false;
1209 if (cur_state
->halt
)
1211 /* Reached a halt state.
1212 Check the halt state can satisfy the current context. */
1213 if (!cur_state
->has_constraint
1214 || check_halt_state_context (mctx
, cur_state
,
1215 re_string_cur_idx (&mctx
->input
)))
1217 /* We found an appropriate halt state. */
1218 match_last
= re_string_cur_idx (&mctx
->input
);
1221 /* We found a match, do not modify match_first below. */
1222 p_match_first
= NULL
;
1223 if (!fl_longest_match
)
1230 *p_match_first
+= next_start_idx
;
1235 /* Check NODE match the current context. */
1239 check_halt_node_context (const re_dfa_t
*dfa
, Idx node
, unsigned int context
)
1241 re_token_type_t type
= dfa
->nodes
[node
].type
;
1242 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1243 if (type
!= END_OF_RE
)
1247 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1252 /* Check the halt state STATE match the current context.
1253 Return 0 if not match, if the node, STATE has, is a halt node and
1254 match the context, return the node. */
1258 check_halt_state_context (const re_match_context_t
*mctx
,
1259 const re_dfastate_t
*state
, Idx idx
)
1262 unsigned int context
;
1264 assert (state
->halt
);
1266 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1267 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1268 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1269 return state
->nodes
.elems
[i
];
1273 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1274 corresponding to the DFA).
1275 Return the destination node, and update EPS_VIA_NODES;
1276 return REG_MISSING in case of errors. */
1280 proceed_next_node (const re_match_context_t
*mctx
, Idx nregs
, regmatch_t
*regs
,
1281 Idx
*pidx
, Idx node
, re_node_set
*eps_via_nodes
,
1282 struct re_fail_stack_t
*fs
)
1284 const re_dfa_t
*const dfa
= mctx
->dfa
;
1287 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1289 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1290 re_node_set
*edests
= &dfa
->edests
[node
];
1292 ok
= re_node_set_insert (eps_via_nodes
, node
);
1295 /* Pick up a valid destination, or return REG_MISSING if none
1297 for (dest_node
= REG_MISSING
, i
= 0; i
< edests
->nelem
; ++i
)
1299 Idx candidate
= edests
->elems
[i
];
1300 if (!re_node_set_contains (cur_nodes
, candidate
))
1302 if (dest_node
== REG_MISSING
)
1303 dest_node
= candidate
;
1307 /* In order to avoid infinite loop like "(a*)*", return the second
1308 epsilon-transition if the first was already considered. */
1309 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1312 /* Otherwise, push the second epsilon-transition on the fail stack. */
1314 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1318 /* We know we are going to exit. */
1327 re_token_type_t type
= dfa
->nodes
[node
].type
;
1329 #ifdef RE_ENABLE_I18N
1330 if (dfa
->nodes
[node
].accept_mb
)
1331 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1333 #endif /* RE_ENABLE_I18N */
1334 if (type
== OP_BACK_REF
)
1336 Idx subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1337 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1340 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1344 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1345 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1354 ok
= re_node_set_insert (eps_via_nodes
, node
);
1357 dest_node
= dfa
->edests
[node
].elems
[0];
1358 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1365 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1367 Idx dest_node
= dfa
->nexts
[node
];
1368 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1369 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1370 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1373 re_node_set_empty (eps_via_nodes
);
1380 static reg_errcode_t
1381 internal_function __attribute_warn_unused_result__
1382 push_fail_stack (struct re_fail_stack_t
*fs
, Idx str_idx
, Idx dest_node
,
1383 Idx nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1386 Idx num
= fs
->num
++;
1387 if (fs
->num
== fs
->alloc
)
1389 struct re_fail_stack_ent_t
*new_array
;
1390 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1392 if (new_array
== NULL
)
1395 fs
->stack
= new_array
;
1397 fs
->stack
[num
].idx
= str_idx
;
1398 fs
->stack
[num
].node
= dest_node
;
1399 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1400 if (fs
->stack
[num
].regs
== NULL
)
1402 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1403 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1409 pop_fail_stack (struct re_fail_stack_t
*fs
, Idx
*pidx
, Idx nregs
,
1410 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1412 Idx num
= --fs
->num
;
1413 assert (REG_VALID_INDEX (num
));
1414 *pidx
= fs
->stack
[num
].idx
;
1415 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1416 re_node_set_free (eps_via_nodes
);
1417 re_free (fs
->stack
[num
].regs
);
1418 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1419 return fs
->stack
[num
].node
;
1422 /* Set the positions where the subexpressions are starts/ends to registers
1424 Note: We assume that pmatch[0] is already set, and
1425 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1427 static reg_errcode_t
1428 internal_function __attribute_warn_unused_result__
1429 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1430 regmatch_t
*pmatch
, bool fl_backtrack
)
1432 const re_dfa_t
*dfa
= preg
->buffer
;
1434 re_node_set eps_via_nodes
;
1435 struct re_fail_stack_t
*fs
;
1436 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1437 regmatch_t
*prev_idx_match
;
1438 bool prev_idx_match_malloced
= false;
1441 assert (nmatch
> 1);
1442 assert (mctx
->state_log
!= NULL
);
1447 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1448 if (fs
->stack
== NULL
)
1454 cur_node
= dfa
->init_node
;
1455 re_node_set_init_empty (&eps_via_nodes
);
1457 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1458 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1461 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1462 if (prev_idx_match
== NULL
)
1464 free_fail_stack_return (fs
);
1467 prev_idx_match_malloced
= true;
1469 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1471 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1473 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1475 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1480 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1481 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1483 if (reg_idx
== nmatch
)
1485 re_node_set_free (&eps_via_nodes
);
1486 if (prev_idx_match_malloced
)
1487 re_free (prev_idx_match
);
1488 return free_fail_stack_return (fs
);
1490 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1495 re_node_set_free (&eps_via_nodes
);
1496 if (prev_idx_match_malloced
)
1497 re_free (prev_idx_match
);
1502 /* Proceed to next node. */
1503 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1504 &eps_via_nodes
, fs
);
1506 if (BE (! REG_VALID_INDEX (cur_node
), 0))
1508 if (BE (cur_node
== REG_ERROR
, 0))
1510 re_node_set_free (&eps_via_nodes
);
1511 if (prev_idx_match_malloced
)
1512 re_free (prev_idx_match
);
1513 free_fail_stack_return (fs
);
1517 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1521 re_node_set_free (&eps_via_nodes
);
1522 if (prev_idx_match_malloced
)
1523 re_free (prev_idx_match
);
1528 re_node_set_free (&eps_via_nodes
);
1529 if (prev_idx_match_malloced
)
1530 re_free (prev_idx_match
);
1531 return free_fail_stack_return (fs
);
1534 static reg_errcode_t
1536 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1541 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1543 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1544 re_free (fs
->stack
[fs_idx
].regs
);
1546 re_free (fs
->stack
);
1553 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1554 regmatch_t
*prev_idx_match
, Idx cur_node
, Idx cur_idx
, Idx nmatch
)
1556 int type
= dfa
->nodes
[cur_node
].type
;
1557 if (type
== OP_OPEN_SUBEXP
)
1559 Idx reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1561 /* We are at the first node of this sub expression. */
1562 if (reg_num
< nmatch
)
1564 pmatch
[reg_num
].rm_so
= cur_idx
;
1565 pmatch
[reg_num
].rm_eo
= -1;
1568 else if (type
== OP_CLOSE_SUBEXP
)
1570 Idx reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1571 if (reg_num
< nmatch
)
1573 /* We are at the last node of this sub expression. */
1574 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1576 pmatch
[reg_num
].rm_eo
= cur_idx
;
1577 /* This is a non-empty match or we are not inside an optional
1578 subexpression. Accept this right away. */
1579 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1583 if (dfa
->nodes
[cur_node
].opt_subexp
1584 && prev_idx_match
[reg_num
].rm_so
!= -1)
1585 /* We transited through an empty match for an optional
1586 subexpression, like (a?)*, and this is not the subexp's
1587 first match. Copy back the old content of the registers
1588 so that matches of an inner subexpression are undone as
1589 well, like in ((a?))*. */
1590 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1592 /* We completed a subexpression, but it may be part of
1593 an optional one, so do not update PREV_IDX_MATCH. */
1594 pmatch
[reg_num
].rm_eo
= cur_idx
;
1600 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1601 and sift the nodes in each states according to the following rules.
1602 Updated state_log will be wrote to STATE_LOG.
1604 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1605 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1606 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1607 the LAST_NODE, we throw away the node 'a'.
1608 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1609 string 's' and transit to 'b':
1610 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1612 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1613 thrown away, we throw away the node 'a'.
1614 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1615 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1617 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1618 we throw away the node 'a'. */
1620 #define STATE_NODE_CONTAINS(state,node) \
1621 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1623 static reg_errcode_t
1625 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1629 Idx str_idx
= sctx
->last_str_idx
;
1630 re_node_set cur_dest
;
1633 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1636 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1637 transit to the last_node and the last_node itself. */
1638 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1639 if (BE (err
!= REG_NOERROR
, 0))
1641 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1642 if (BE (err
!= REG_NOERROR
, 0))
1645 /* Then check each states in the state_log. */
1648 /* Update counters. */
1649 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1650 if (null_cnt
> mctx
->max_mb_elem_len
)
1652 memset (sctx
->sifted_states
, '\0',
1653 sizeof (re_dfastate_t
*) * str_idx
);
1654 re_node_set_free (&cur_dest
);
1657 re_node_set_empty (&cur_dest
);
1660 if (mctx
->state_log
[str_idx
])
1662 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1663 if (BE (err
!= REG_NOERROR
, 0))
1667 /* Add all the nodes which satisfy the following conditions:
1668 - It can epsilon transit to a node in CUR_DEST.
1670 And update state_log. */
1671 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1672 if (BE (err
!= REG_NOERROR
, 0))
1677 re_node_set_free (&cur_dest
);
1681 static reg_errcode_t
1682 internal_function __attribute_warn_unused_result__
1683 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1684 Idx str_idx
, re_node_set
*cur_dest
)
1686 const re_dfa_t
*const dfa
= mctx
->dfa
;
1687 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1690 /* Then build the next sifted state.
1691 We build the next sifted state on 'cur_dest', and update
1692 'sifted_states[str_idx]' with 'cur_dest'.
1694 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1695 'cur_src' points the node_set of the old 'state_log[str_idx]'
1696 (with the epsilon nodes pre-filtered out). */
1697 for (i
= 0; i
< cur_src
->nelem
; i
++)
1699 Idx prev_node
= cur_src
->elems
[i
];
1704 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1705 assert (!IS_EPSILON_NODE (type
));
1707 #ifdef RE_ENABLE_I18N
1708 /* If the node may accept "multi byte". */
1709 if (dfa
->nodes
[prev_node
].accept_mb
)
1710 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1711 str_idx
, sctx
->last_str_idx
);
1712 #endif /* RE_ENABLE_I18N */
1714 /* We don't check backreferences here.
1715 See update_cur_sifted_state(). */
1717 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1718 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1719 dfa
->nexts
[prev_node
]))
1725 if (sctx
->limits
.nelem
)
1727 Idx to_idx
= str_idx
+ naccepted
;
1728 if (check_dst_limits (mctx
, &sctx
->limits
,
1729 dfa
->nexts
[prev_node
], to_idx
,
1730 prev_node
, str_idx
))
1733 ok
= re_node_set_insert (cur_dest
, prev_node
);
1741 /* Helper functions. */
1743 static reg_errcode_t
1745 clean_state_log_if_needed (re_match_context_t
*mctx
, Idx next_state_log_idx
)
1747 Idx top
= mctx
->state_log_top
;
1749 if ((next_state_log_idx
>= mctx
->input
.bufs_len
1750 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1751 || (next_state_log_idx
>= mctx
->input
.valid_len
1752 && mctx
->input
.valid_len
< mctx
->input
.len
))
1755 err
= extend_buffers (mctx
, next_state_log_idx
+ 1);
1756 if (BE (err
!= REG_NOERROR
, 0))
1760 if (top
< next_state_log_idx
)
1762 memset (mctx
->state_log
+ top
+ 1, '\0',
1763 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1764 mctx
->state_log_top
= next_state_log_idx
;
1769 static reg_errcode_t
1771 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1772 re_dfastate_t
**src
, Idx num
)
1776 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1778 if (dst
[st_idx
] == NULL
)
1779 dst
[st_idx
] = src
[st_idx
];
1780 else if (src
[st_idx
] != NULL
)
1782 re_node_set merged_set
;
1783 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1784 &src
[st_idx
]->nodes
);
1785 if (BE (err
!= REG_NOERROR
, 0))
1787 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1788 re_node_set_free (&merged_set
);
1789 if (BE (err
!= REG_NOERROR
, 0))
1796 static reg_errcode_t
1798 update_cur_sifted_state (const re_match_context_t
*mctx
,
1799 re_sift_context_t
*sctx
, Idx str_idx
,
1800 re_node_set
*dest_nodes
)
1802 const re_dfa_t
*const dfa
= mctx
->dfa
;
1803 reg_errcode_t err
= REG_NOERROR
;
1804 const re_node_set
*candidates
;
1805 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1806 : &mctx
->state_log
[str_idx
]->nodes
);
1808 if (dest_nodes
->nelem
== 0)
1809 sctx
->sifted_states
[str_idx
] = NULL
;
1814 /* At first, add the nodes which can epsilon transit to a node in
1816 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1817 if (BE (err
!= REG_NOERROR
, 0))
1820 /* Then, check the limitations in the current sift_context. */
1821 if (sctx
->limits
.nelem
)
1823 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1824 mctx
->bkref_ents
, str_idx
);
1825 if (BE (err
!= REG_NOERROR
, 0))
1830 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1831 if (BE (err
!= REG_NOERROR
, 0))
1835 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1837 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1838 if (BE (err
!= REG_NOERROR
, 0))
1844 static reg_errcode_t
1845 internal_function __attribute_warn_unused_result__
1846 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1847 const re_node_set
*candidates
)
1849 reg_errcode_t err
= REG_NOERROR
;
1852 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1853 if (BE (err
!= REG_NOERROR
, 0))
1856 if (!state
->inveclosure
.alloc
)
1858 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1859 if (BE (err
!= REG_NOERROR
, 0))
1861 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1863 err
= re_node_set_merge (&state
->inveclosure
,
1864 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1865 if (BE (err
!= REG_NOERROR
, 0))
1869 return re_node_set_add_intersect (dest_nodes
, candidates
,
1870 &state
->inveclosure
);
1873 static reg_errcode_t
1875 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, Idx node
, re_node_set
*dest_nodes
,
1876 const re_node_set
*candidates
)
1880 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1881 re_node_set except_nodes
;
1882 re_node_set_init_empty (&except_nodes
);
1883 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1885 Idx cur_node
= inv_eclosure
->elems
[ecl_idx
];
1886 if (cur_node
== node
)
1888 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1890 Idx edst1
= dfa
->edests
[cur_node
].elems
[0];
1891 Idx edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1892 ? dfa
->edests
[cur_node
].elems
[1] : REG_MISSING
);
1893 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1894 && re_node_set_contains (dest_nodes
, edst1
))
1895 || (REG_VALID_NONZERO_INDEX (edst2
)
1896 && !re_node_set_contains (inv_eclosure
, edst2
)
1897 && re_node_set_contains (dest_nodes
, edst2
)))
1899 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1900 dfa
->inveclosures
+ cur_node
);
1901 if (BE (err
!= REG_NOERROR
, 0))
1903 re_node_set_free (&except_nodes
);
1909 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1911 Idx cur_node
= inv_eclosure
->elems
[ecl_idx
];
1912 if (!re_node_set_contains (&except_nodes
, cur_node
))
1914 Idx idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1915 re_node_set_remove_at (dest_nodes
, idx
);
1918 re_node_set_free (&except_nodes
);
1924 check_dst_limits (const re_match_context_t
*mctx
, const re_node_set
*limits
,
1925 Idx dst_node
, Idx dst_idx
, Idx src_node
, Idx src_idx
)
1927 const re_dfa_t
*const dfa
= mctx
->dfa
;
1928 Idx lim_idx
, src_pos
, dst_pos
;
1930 Idx dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1931 Idx src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1932 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1935 struct re_backref_cache_entry
*ent
;
1936 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1937 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1939 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1940 subexp_idx
, dst_node
, dst_idx
,
1942 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1943 subexp_idx
, src_node
, src_idx
,
1947 <src> <dst> ( <subexp> )
1948 ( <subexp> ) <src> <dst>
1949 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1950 if (src_pos
== dst_pos
)
1951 continue; /* This is unrelated limitation. */
1960 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1961 Idx subexp_idx
, Idx from_node
, Idx bkref_idx
)
1963 const re_dfa_t
*const dfa
= mctx
->dfa
;
1964 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1967 /* Else, we are on the boundary: examine the nodes on the epsilon
1969 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1971 Idx node
= eclosures
->elems
[node_idx
];
1972 switch (dfa
->nodes
[node
].type
)
1975 if (bkref_idx
!= REG_MISSING
)
1977 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1983 if (ent
->node
!= node
)
1986 if (subexp_idx
< BITSET_WORD_BITS
1987 && !(ent
->eps_reachable_subexps_map
1988 & ((bitset_word_t
) 1 << subexp_idx
)))
1991 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1992 OP_CLOSE_SUBEXP cases below. But, if the
1993 destination node is the same node as the source
1994 node, don't recurse because it would cause an
1995 infinite loop: a regex that exhibits this behavior
1997 dst
= dfa
->edests
[node
].elems
[0];
1998 if (dst
== from_node
)
2002 else /* if (boundaries & 2) */
2007 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2009 if (cpos
== -1 /* && (boundaries & 1) */)
2011 if (cpos
== 0 && (boundaries
& 2))
2014 if (subexp_idx
< BITSET_WORD_BITS
)
2015 ent
->eps_reachable_subexps_map
2016 &= ~((bitset_word_t
) 1 << subexp_idx
);
2018 while (ent
++->more
);
2022 case OP_OPEN_SUBEXP
:
2023 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2027 case OP_CLOSE_SUBEXP
:
2028 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2037 return (boundaries
& 2) ? 1 : 0;
2042 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, Idx limit
,
2043 Idx subexp_idx
, Idx from_node
, Idx str_idx
,
2046 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2049 /* If we are outside the range of the subexpression, return -1 or 1. */
2050 if (str_idx
< lim
->subexp_from
)
2053 if (lim
->subexp_to
< str_idx
)
2056 /* If we are within the subexpression, return 0. */
2057 boundaries
= (str_idx
== lim
->subexp_from
);
2058 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2059 if (boundaries
== 0)
2062 /* Else, examine epsilon closure. */
2063 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2064 from_node
, bkref_idx
);
2067 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2068 which are against limitations from DEST_NODES. */
2070 static reg_errcode_t
2072 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2073 const re_node_set
*candidates
, re_node_set
*limits
,
2074 struct re_backref_cache_entry
*bkref_ents
, Idx str_idx
)
2077 Idx node_idx
, lim_idx
;
2079 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2082 struct re_backref_cache_entry
*ent
;
2083 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2085 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2086 continue; /* This is unrelated limitation. */
2088 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2089 if (ent
->subexp_to
== str_idx
)
2091 Idx ops_node
= REG_MISSING
;
2092 Idx cls_node
= REG_MISSING
;
2093 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2095 Idx node
= dest_nodes
->elems
[node_idx
];
2096 re_token_type_t type
= dfa
->nodes
[node
].type
;
2097 if (type
== OP_OPEN_SUBEXP
2098 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2100 else if (type
== OP_CLOSE_SUBEXP
2101 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2105 /* Check the limitation of the open subexpression. */
2106 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2107 if (REG_VALID_INDEX (ops_node
))
2109 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2111 if (BE (err
!= REG_NOERROR
, 0))
2115 /* Check the limitation of the close subexpression. */
2116 if (REG_VALID_INDEX (cls_node
))
2117 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2119 Idx node
= dest_nodes
->elems
[node_idx
];
2120 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2122 && !re_node_set_contains (dfa
->eclosures
+ node
,
2125 /* It is against this limitation.
2126 Remove it form the current sifted state. */
2127 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2129 if (BE (err
!= REG_NOERROR
, 0))
2135 else /* (ent->subexp_to != str_idx) */
2137 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2139 Idx node
= dest_nodes
->elems
[node_idx
];
2140 re_token_type_t type
= dfa
->nodes
[node
].type
;
2141 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2143 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2145 /* It is against this limitation.
2146 Remove it form the current sifted state. */
2147 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2149 if (BE (err
!= REG_NOERROR
, 0))
2158 static reg_errcode_t
2159 internal_function __attribute_warn_unused_result__
2160 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2161 Idx str_idx
, const re_node_set
*candidates
)
2163 const re_dfa_t
*const dfa
= mctx
->dfa
;
2166 re_sift_context_t local_sctx
;
2167 Idx first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2169 if (first_idx
== REG_MISSING
)
2172 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2174 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2177 re_token_type_t type
;
2178 struct re_backref_cache_entry
*entry
;
2179 node
= candidates
->elems
[node_idx
];
2180 type
= dfa
->nodes
[node
].type
;
2181 /* Avoid infinite loop for the REs like "()\1+". */
2182 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2184 if (type
!= OP_BACK_REF
)
2187 entry
= mctx
->bkref_ents
+ first_idx
;
2188 enabled_idx
= first_idx
;
2195 re_dfastate_t
*cur_state
;
2197 if (entry
->node
!= node
)
2199 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2200 to_idx
= str_idx
+ subexp_len
;
2201 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2202 : dfa
->edests
[node
].elems
[0]);
2204 if (to_idx
> sctx
->last_str_idx
2205 || sctx
->sifted_states
[to_idx
] == NULL
2206 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2207 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2208 str_idx
, dst_node
, to_idx
))
2211 if (local_sctx
.sifted_states
== NULL
)
2214 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2215 if (BE (err
!= REG_NOERROR
, 0))
2218 local_sctx
.last_node
= node
;
2219 local_sctx
.last_str_idx
= str_idx
;
2220 ok
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2226 cur_state
= local_sctx
.sifted_states
[str_idx
];
2227 err
= sift_states_backward (mctx
, &local_sctx
);
2228 if (BE (err
!= REG_NOERROR
, 0))
2230 if (sctx
->limited_states
!= NULL
)
2232 err
= merge_state_array (dfa
, sctx
->limited_states
,
2233 local_sctx
.sifted_states
,
2235 if (BE (err
!= REG_NOERROR
, 0))
2238 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2239 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2241 /* mctx->bkref_ents may have changed, reload the pointer. */
2242 entry
= mctx
->bkref_ents
+ enabled_idx
;
2244 while (enabled_idx
++, entry
++->more
);
2248 if (local_sctx
.sifted_states
!= NULL
)
2250 re_node_set_free (&local_sctx
.limits
);
2257 #ifdef RE_ENABLE_I18N
2260 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2261 Idx node_idx
, Idx str_idx
, Idx max_str_idx
)
2263 const re_dfa_t
*const dfa
= mctx
->dfa
;
2265 /* Check the node can accept "multi byte". */
2266 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2267 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2268 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2269 dfa
->nexts
[node_idx
]))
2270 /* The node can't accept the "multi byte", or the
2271 destination was already thrown away, then the node
2272 could't accept the current input "multi byte". */
2274 /* Otherwise, it is sure that the node could accept
2275 'naccepted' bytes input. */
2278 #endif /* RE_ENABLE_I18N */
2281 /* Functions for state transition. */
2283 /* Return the next state to which the current state STATE will transit by
2284 accepting the current input byte, and update STATE_LOG if necessary.
2285 If STATE can accept a multibyte char/collating element/back reference
2286 update the destination of STATE_LOG. */
2288 static re_dfastate_t
*
2289 internal_function __attribute_warn_unused_result__
2290 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2291 re_dfastate_t
*state
)
2293 re_dfastate_t
**trtable
;
2296 #ifdef RE_ENABLE_I18N
2297 /* If the current state can accept multibyte. */
2298 if (BE (state
->accept_mb
, 0))
2300 *err
= transit_state_mb (mctx
, state
);
2301 if (BE (*err
!= REG_NOERROR
, 0))
2304 #endif /* RE_ENABLE_I18N */
2306 /* Then decide the next state with the single byte. */
2309 /* don't use transition table */
2310 return transit_state_sb (err
, mctx
, state
);
2313 /* Use transition table */
2314 ch
= re_string_fetch_byte (&mctx
->input
);
2317 trtable
= state
->trtable
;
2318 if (BE (trtable
!= NULL
, 1))
2321 trtable
= state
->word_trtable
;
2322 if (BE (trtable
!= NULL
, 1))
2324 unsigned int context
;
2326 = re_string_context_at (&mctx
->input
,
2327 re_string_cur_idx (&mctx
->input
) - 1,
2329 if (IS_WORD_CONTEXT (context
))
2330 return trtable
[ch
+ SBC_MAX
];
2335 if (!build_trtable (mctx
->dfa
, state
))
2341 /* Retry, we now have a transition table. */
2345 /* Update the state_log if we need */
2346 static re_dfastate_t
*
2348 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2349 re_dfastate_t
*next_state
)
2351 const re_dfa_t
*const dfa
= mctx
->dfa
;
2352 Idx cur_idx
= re_string_cur_idx (&mctx
->input
);
2354 if (cur_idx
> mctx
->state_log_top
)
2356 mctx
->state_log
[cur_idx
] = next_state
;
2357 mctx
->state_log_top
= cur_idx
;
2359 else if (mctx
->state_log
[cur_idx
] == 0)
2361 mctx
->state_log
[cur_idx
] = next_state
;
2365 re_dfastate_t
*pstate
;
2366 unsigned int context
;
2367 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2368 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2369 the destination of a multibyte char/collating element/
2370 back reference. Then the next state is the union set of
2371 these destinations and the results of the transition table. */
2372 pstate
= mctx
->state_log
[cur_idx
];
2373 log_nodes
= pstate
->entrance_nodes
;
2374 if (next_state
!= NULL
)
2376 table_nodes
= next_state
->entrance_nodes
;
2377 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2379 if (BE (*err
!= REG_NOERROR
, 0))
2383 next_nodes
= *log_nodes
;
2384 /* Note: We already add the nodes of the initial state,
2385 then we don't need to add them here. */
2387 context
= re_string_context_at (&mctx
->input
,
2388 re_string_cur_idx (&mctx
->input
) - 1,
2390 next_state
= mctx
->state_log
[cur_idx
]
2391 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2392 /* We don't need to check errors here, since the return value of
2393 this function is next_state and ERR is already set. */
2395 if (table_nodes
!= NULL
)
2396 re_node_set_free (&next_nodes
);
2399 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2401 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2402 later. We must check them here, since the back references in the
2403 next state might use them. */
2404 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2406 if (BE (*err
!= REG_NOERROR
, 0))
2409 /* If the next state has back references. */
2410 if (next_state
->has_backref
)
2412 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2413 if (BE (*err
!= REG_NOERROR
, 0))
2415 next_state
= mctx
->state_log
[cur_idx
];
2422 /* Skip bytes in the input that correspond to part of a
2423 multi-byte match, then look in the log for a state
2424 from which to restart matching. */
2425 static re_dfastate_t
*
2427 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2429 re_dfastate_t
*cur_state
;
2432 Idx max
= mctx
->state_log_top
;
2433 Idx cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2437 if (++cur_str_idx
> max
)
2439 re_string_skip_bytes (&mctx
->input
, 1);
2441 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2443 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2445 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2449 /* Helper functions for transit_state. */
2451 /* From the node set CUR_NODES, pick up the nodes whose types are
2452 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2453 expression. And register them to use them later for evaluating the
2454 corresponding back references. */
2456 static reg_errcode_t
2458 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2461 const re_dfa_t
*const dfa
= mctx
->dfa
;
2465 /* TODO: This isn't efficient.
2466 Because there might be more than one nodes whose types are
2467 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2470 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2472 Idx node
= cur_nodes
->elems
[node_idx
];
2473 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2474 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2475 && (dfa
->used_bkref_map
2476 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2478 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2479 if (BE (err
!= REG_NOERROR
, 0))
2487 /* Return the next state to which the current state STATE will transit by
2488 accepting the current input byte. */
2490 static re_dfastate_t
*
2491 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2492 re_dfastate_t
*state
)
2494 const re_dfa_t
*const dfa
= mctx
->dfa
;
2495 re_node_set next_nodes
;
2496 re_dfastate_t
*next_state
;
2497 Idx node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2498 unsigned int context
;
2500 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2501 if (BE (*err
!= REG_NOERROR
, 0))
2503 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2505 Idx cur_node
= state
->nodes
.elems
[node_cnt
];
2506 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2508 *err
= re_node_set_merge (&next_nodes
,
2509 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2510 if (BE (*err
!= REG_NOERROR
, 0))
2512 re_node_set_free (&next_nodes
);
2517 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2518 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2519 /* We don't need to check errors here, since the return value of
2520 this function is next_state and ERR is already set. */
2522 re_node_set_free (&next_nodes
);
2523 re_string_skip_bytes (&mctx
->input
, 1);
2528 #ifdef RE_ENABLE_I18N
2529 static reg_errcode_t
2531 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2533 const re_dfa_t
*const dfa
= mctx
->dfa
;
2537 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2539 re_node_set dest_nodes
, *new_nodes
;
2540 Idx cur_node_idx
= pstate
->nodes
.elems
[i
];
2543 unsigned int context
;
2544 re_dfastate_t
*dest_state
;
2546 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2549 if (dfa
->nodes
[cur_node_idx
].constraint
)
2551 context
= re_string_context_at (&mctx
->input
,
2552 re_string_cur_idx (&mctx
->input
),
2554 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2559 /* How many bytes the node can accept? */
2560 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2561 re_string_cur_idx (&mctx
->input
));
2565 /* The node can accepts 'naccepted' bytes. */
2566 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2567 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2568 : mctx
->max_mb_elem_len
);
2569 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2570 if (BE (err
!= REG_NOERROR
, 0))
2573 assert (dfa
->nexts
[cur_node_idx
] != REG_MISSING
);
2575 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2577 dest_state
= mctx
->state_log
[dest_idx
];
2578 if (dest_state
== NULL
)
2579 dest_nodes
= *new_nodes
;
2582 err
= re_node_set_init_union (&dest_nodes
,
2583 dest_state
->entrance_nodes
, new_nodes
);
2584 if (BE (err
!= REG_NOERROR
, 0))
2587 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2589 mctx
->state_log
[dest_idx
]
2590 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2591 if (dest_state
!= NULL
)
2592 re_node_set_free (&dest_nodes
);
2593 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2598 #endif /* RE_ENABLE_I18N */
2600 static reg_errcode_t
2602 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2604 const re_dfa_t
*const dfa
= mctx
->dfa
;
2607 Idx cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2609 for (i
= 0; i
< nodes
->nelem
; ++i
)
2611 Idx dest_str_idx
, prev_nelem
, bkc_idx
;
2612 Idx node_idx
= nodes
->elems
[i
];
2613 unsigned int context
;
2614 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2615 re_node_set
*new_dest_nodes
;
2617 /* Check whether 'node' is a backreference or not. */
2618 if (node
->type
!= OP_BACK_REF
)
2621 if (node
->constraint
)
2623 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2625 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2629 /* 'node' is a backreference.
2630 Check the substring which the substring matched. */
2631 bkc_idx
= mctx
->nbkref_ents
;
2632 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2633 if (BE (err
!= REG_NOERROR
, 0))
2636 /* And add the epsilon closures (which is 'new_dest_nodes') of
2637 the backreference to appropriate state_log. */
2639 assert (dfa
->nexts
[node_idx
] != REG_MISSING
);
2641 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2644 re_dfastate_t
*dest_state
;
2645 struct re_backref_cache_entry
*bkref_ent
;
2646 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2647 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2649 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2650 new_dest_nodes
= (subexp_len
== 0
2651 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2652 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2653 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2654 - bkref_ent
->subexp_from
);
2655 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2657 dest_state
= mctx
->state_log
[dest_str_idx
];
2658 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2659 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2660 /* Add 'new_dest_node' to state_log. */
2661 if (dest_state
== NULL
)
2663 mctx
->state_log
[dest_str_idx
]
2664 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2666 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2667 && err
!= REG_NOERROR
, 0))
2672 re_node_set dest_nodes
;
2673 err
= re_node_set_init_union (&dest_nodes
,
2674 dest_state
->entrance_nodes
,
2676 if (BE (err
!= REG_NOERROR
, 0))
2678 re_node_set_free (&dest_nodes
);
2681 mctx
->state_log
[dest_str_idx
]
2682 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2683 re_node_set_free (&dest_nodes
);
2684 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2685 && err
!= REG_NOERROR
, 0))
2688 /* We need to check recursively if the backreference can epsilon
2691 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2693 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2695 if (BE (err
!= REG_NOERROR
, 0))
2697 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2698 if (BE (err
!= REG_NOERROR
, 0))
2708 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2709 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2710 Note that we might collect inappropriate candidates here.
2711 However, the cost of checking them strictly here is too high, then we
2712 delay these checking for prune_impossible_nodes(). */
2714 static reg_errcode_t
2715 internal_function __attribute_warn_unused_result__
2716 get_subexp (re_match_context_t
*mctx
, Idx bkref_node
, Idx bkref_str_idx
)
2718 const re_dfa_t
*const dfa
= mctx
->dfa
;
2719 Idx subexp_num
, sub_top_idx
;
2720 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2721 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2722 Idx cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2723 if (cache_idx
!= REG_MISSING
)
2725 const struct re_backref_cache_entry
*entry
2726 = mctx
->bkref_ents
+ cache_idx
;
2728 if (entry
->node
== bkref_node
)
2729 return REG_NOERROR
; /* We already checked it. */
2730 while (entry
++->more
);
2733 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2735 /* For each sub expression */
2736 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2739 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2740 re_sub_match_last_t
*sub_last
;
2741 Idx sub_last_idx
, sl_str
, bkref_str_off
;
2743 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2744 continue; /* It isn't related. */
2746 sl_str
= sub_top
->str_idx
;
2747 bkref_str_off
= bkref_str_idx
;
2748 /* At first, check the last node of sub expressions we already
2750 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2752 regoff_t sl_str_diff
;
2753 sub_last
= sub_top
->lasts
[sub_last_idx
];
2754 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2755 /* The matched string by the sub expression match with the substring
2756 at the back reference? */
2757 if (sl_str_diff
> 0)
2759 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2761 /* Not enough chars for a successful match. */
2762 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2765 err
= clean_state_log_if_needed (mctx
,
2768 if (BE (err
!= REG_NOERROR
, 0))
2770 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2772 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2773 /* We don't need to search this sub expression any more. */
2776 bkref_str_off
+= sl_str_diff
;
2777 sl_str
+= sl_str_diff
;
2778 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2781 /* Reload buf, since the preceding call might have reallocated
2783 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2785 if (err
== REG_NOMATCH
)
2787 if (BE (err
!= REG_NOERROR
, 0))
2791 if (sub_last_idx
< sub_top
->nlasts
)
2793 if (sub_last_idx
> 0)
2795 /* Then, search for the other last nodes of the sub expression. */
2796 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2799 regoff_t sl_str_off
;
2800 const re_node_set
*nodes
;
2801 sl_str_off
= sl_str
- sub_top
->str_idx
;
2802 /* The matched string by the sub expression match with the substring
2803 at the back reference? */
2806 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2808 /* If we are at the end of the input, we cannot match. */
2809 if (bkref_str_off
>= mctx
->input
.len
)
2812 err
= extend_buffers (mctx
, bkref_str_off
+ 1);
2813 if (BE (err
!= REG_NOERROR
, 0))
2816 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2818 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2819 break; /* We don't need to search this sub expression
2822 if (mctx
->state_log
[sl_str
] == NULL
)
2824 /* Does this state have a ')' of the sub expression? */
2825 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2826 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2828 if (cls_node
== REG_MISSING
)
2830 if (sub_top
->path
== NULL
)
2832 sub_top
->path
= calloc (sizeof (state_array_t
),
2833 sl_str
- sub_top
->str_idx
+ 1);
2834 if (sub_top
->path
== NULL
)
2837 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2838 in the current context? */
2839 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2840 sub_top
->str_idx
, cls_node
, sl_str
,
2842 if (err
== REG_NOMATCH
)
2844 if (BE (err
!= REG_NOERROR
, 0))
2846 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2847 if (BE (sub_last
== NULL
, 0))
2849 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2851 if (err
== REG_NOMATCH
)
2858 /* Helper functions for get_subexp(). */
2860 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2861 If it can arrive, register the sub expression expressed with SUB_TOP
2864 static reg_errcode_t
2866 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2867 re_sub_match_last_t
*sub_last
, Idx bkref_node
, Idx bkref_str
)
2871 /* Can the subexpression arrive the back reference? */
2872 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2873 sub_last
->str_idx
, bkref_node
, bkref_str
,
2875 if (err
!= REG_NOERROR
)
2877 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2879 if (BE (err
!= REG_NOERROR
, 0))
2881 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2882 return clean_state_log_if_needed (mctx
, to_idx
);
2885 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2886 Search '(' if FL_OPEN, or search ')' otherwise.
2887 TODO: This function isn't efficient...
2888 Because there might be more than one nodes whose types are
2889 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2895 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2896 Idx subexp_idx
, int type
)
2899 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2901 Idx cls_node
= nodes
->elems
[cls_idx
];
2902 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2903 if (node
->type
== type
2904 && node
->opr
.idx
== subexp_idx
)
2910 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2911 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2913 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2915 static reg_errcode_t
2916 internal_function __attribute_warn_unused_result__
2917 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, Idx top_node
,
2918 Idx top_str
, Idx last_node
, Idx last_str
, int type
)
2920 const re_dfa_t
*const dfa
= mctx
->dfa
;
2921 reg_errcode_t err
= REG_NOERROR
;
2922 Idx subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2923 re_dfastate_t
*cur_state
= NULL
;
2924 re_node_set
*cur_nodes
, next_nodes
;
2925 re_dfastate_t
**backup_state_log
;
2926 unsigned int context
;
2928 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2929 /* Extend the buffer if we need. */
2930 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2932 re_dfastate_t
**new_array
;
2933 Idx old_alloc
= path
->alloc
;
2934 Idx incr_alloc
= last_str
+ mctx
->max_mb_elem_len
+ 1;
2936 if (BE (IDX_MAX
- old_alloc
< incr_alloc
, 0))
2938 new_alloc
= old_alloc
+ incr_alloc
;
2939 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) < new_alloc
, 0))
2941 new_array
= re_realloc (path
->array
, re_dfastate_t
*, new_alloc
);
2942 if (BE (new_array
== NULL
, 0))
2944 path
->array
= new_array
;
2945 path
->alloc
= new_alloc
;
2946 memset (new_array
+ old_alloc
, '\0',
2947 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2950 str_idx
= path
->next_idx
? path
->next_idx
: top_str
;
2952 /* Temporary modify MCTX. */
2953 backup_state_log
= mctx
->state_log
;
2954 backup_cur_idx
= mctx
->input
.cur_idx
;
2955 mctx
->state_log
= path
->array
;
2956 mctx
->input
.cur_idx
= str_idx
;
2958 /* Setup initial node set. */
2959 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2960 if (str_idx
== top_str
)
2962 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2963 if (BE (err
!= REG_NOERROR
, 0))
2965 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2966 if (BE (err
!= REG_NOERROR
, 0))
2968 re_node_set_free (&next_nodes
);
2974 cur_state
= mctx
->state_log
[str_idx
];
2975 if (cur_state
&& cur_state
->has_backref
)
2977 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2978 if (BE (err
!= REG_NOERROR
, 0))
2982 re_node_set_init_empty (&next_nodes
);
2984 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2986 if (next_nodes
.nelem
)
2988 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2990 if (BE (err
!= REG_NOERROR
, 0))
2992 re_node_set_free (&next_nodes
);
2996 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2997 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2999 re_node_set_free (&next_nodes
);
3002 mctx
->state_log
[str_idx
] = cur_state
;
3005 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
3007 re_node_set_empty (&next_nodes
);
3008 if (mctx
->state_log
[str_idx
+ 1])
3010 err
= re_node_set_merge (&next_nodes
,
3011 &mctx
->state_log
[str_idx
+ 1]->nodes
);
3012 if (BE (err
!= REG_NOERROR
, 0))
3014 re_node_set_free (&next_nodes
);
3020 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
3021 &cur_state
->non_eps_nodes
,
3023 if (BE (err
!= REG_NOERROR
, 0))
3025 re_node_set_free (&next_nodes
);
3030 if (next_nodes
.nelem
)
3032 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
3033 if (BE (err
!= REG_NOERROR
, 0))
3035 re_node_set_free (&next_nodes
);
3038 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3040 if (BE (err
!= REG_NOERROR
, 0))
3042 re_node_set_free (&next_nodes
);
3046 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3047 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3048 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3050 re_node_set_free (&next_nodes
);
3053 mctx
->state_log
[str_idx
] = cur_state
;
3054 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3056 re_node_set_free (&next_nodes
);
3057 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3058 : &mctx
->state_log
[last_str
]->nodes
);
3059 path
->next_idx
= str_idx
;
3062 mctx
->state_log
= backup_state_log
;
3063 mctx
->input
.cur_idx
= backup_cur_idx
;
3065 /* Then check the current node set has the node LAST_NODE. */
3066 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3072 /* Helper functions for check_arrival. */
3074 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3076 TODO: This function is similar to the functions transit_state*(),
3077 however this function has many additional works.
3078 Can't we unify them? */
3080 static reg_errcode_t
3081 internal_function __attribute_warn_unused_result__
3082 check_arrival_add_next_nodes (re_match_context_t
*mctx
, Idx str_idx
,
3083 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3085 const re_dfa_t
*const dfa
= mctx
->dfa
;
3088 #ifdef RE_ENABLE_I18N
3089 reg_errcode_t err
= REG_NOERROR
;
3091 re_node_set union_set
;
3092 re_node_set_init_empty (&union_set
);
3093 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3096 Idx cur_node
= cur_nodes
->elems
[cur_idx
];
3098 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3099 assert (!IS_EPSILON_NODE (type
));
3101 #ifdef RE_ENABLE_I18N
3102 /* If the node may accept "multi byte". */
3103 if (dfa
->nodes
[cur_node
].accept_mb
)
3105 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3109 re_dfastate_t
*dest_state
;
3110 Idx next_node
= dfa
->nexts
[cur_node
];
3111 Idx next_idx
= str_idx
+ naccepted
;
3112 dest_state
= mctx
->state_log
[next_idx
];
3113 re_node_set_empty (&union_set
);
3116 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3117 if (BE (err
!= REG_NOERROR
, 0))
3119 re_node_set_free (&union_set
);
3123 ok
= re_node_set_insert (&union_set
, next_node
);
3126 re_node_set_free (&union_set
);
3129 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3131 if (BE (mctx
->state_log
[next_idx
] == NULL
3132 && err
!= REG_NOERROR
, 0))
3134 re_node_set_free (&union_set
);
3139 #endif /* RE_ENABLE_I18N */
3141 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3143 ok
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3146 re_node_set_free (&union_set
);
3151 re_node_set_free (&union_set
);
3155 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3156 CUR_NODES, however exclude the nodes which are:
3157 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3158 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3161 static reg_errcode_t
3163 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3164 Idx ex_subexp
, int type
)
3167 Idx idx
, outside_node
;
3168 re_node_set new_nodes
;
3170 assert (cur_nodes
->nelem
);
3172 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3173 if (BE (err
!= REG_NOERROR
, 0))
3175 /* Create a new node set NEW_NODES with the nodes which are epsilon
3176 closures of the node in CUR_NODES. */
3178 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3180 Idx cur_node
= cur_nodes
->elems
[idx
];
3181 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3182 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3183 if (outside_node
== REG_MISSING
)
3185 /* There are no problematic nodes, just merge them. */
3186 err
= re_node_set_merge (&new_nodes
, eclosure
);
3187 if (BE (err
!= REG_NOERROR
, 0))
3189 re_node_set_free (&new_nodes
);
3195 /* There are problematic nodes, re-calculate incrementally. */
3196 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3198 if (BE (err
!= REG_NOERROR
, 0))
3200 re_node_set_free (&new_nodes
);
3205 re_node_set_free (cur_nodes
);
3206 *cur_nodes
= new_nodes
;
3210 /* Helper function for check_arrival_expand_ecl.
3211 Check incrementally the epsilon closure of TARGET, and if it isn't
3212 problematic append it to DST_NODES. */
3214 static reg_errcode_t
3215 internal_function __attribute_warn_unused_result__
3216 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3217 Idx target
, Idx ex_subexp
, int type
)
3220 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3224 if (dfa
->nodes
[cur_node
].type
== type
3225 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3227 if (type
== OP_CLOSE_SUBEXP
)
3229 ok
= re_node_set_insert (dst_nodes
, cur_node
);
3235 ok
= re_node_set_insert (dst_nodes
, cur_node
);
3238 if (dfa
->edests
[cur_node
].nelem
== 0)
3240 if (dfa
->edests
[cur_node
].nelem
== 2)
3243 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3244 dfa
->edests
[cur_node
].elems
[1],
3246 if (BE (err
!= REG_NOERROR
, 0))
3249 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3255 /* For all the back references in the current state, calculate the
3256 destination of the back references by the appropriate entry
3257 in MCTX->BKREF_ENTS. */
3259 static reg_errcode_t
3260 internal_function __attribute_warn_unused_result__
3261 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3262 Idx cur_str
, Idx subexp_num
, int type
)
3264 const re_dfa_t
*const dfa
= mctx
->dfa
;
3266 Idx cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3267 struct re_backref_cache_entry
*ent
;
3269 if (cache_idx_start
== REG_MISSING
)
3273 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3276 Idx to_idx
, next_node
;
3278 /* Is this entry ENT is appropriate? */
3279 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3282 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3283 /* Calculate the destination of the back reference, and append it
3284 to MCTX->STATE_LOG. */
3285 if (to_idx
== cur_str
)
3287 /* The backreference did epsilon transit, we must re-check all the
3288 node in the current state. */
3289 re_node_set new_dests
;
3290 reg_errcode_t err2
, err3
;
3291 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3292 if (re_node_set_contains (cur_nodes
, next_node
))
3294 err
= re_node_set_init_1 (&new_dests
, next_node
);
3295 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3296 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3297 re_node_set_free (&new_dests
);
3298 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3299 || err3
!= REG_NOERROR
, 0))
3301 err
= (err
!= REG_NOERROR
? err
3302 : (err2
!= REG_NOERROR
? err2
: err3
));
3305 /* TODO: It is still inefficient... */
3310 re_node_set union_set
;
3311 next_node
= dfa
->nexts
[ent
->node
];
3312 if (mctx
->state_log
[to_idx
])
3315 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3318 err
= re_node_set_init_copy (&union_set
,
3319 &mctx
->state_log
[to_idx
]->nodes
);
3320 ok
= re_node_set_insert (&union_set
, next_node
);
3321 if (BE (err
!= REG_NOERROR
|| ! ok
, 0))
3323 re_node_set_free (&union_set
);
3324 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3330 err
= re_node_set_init_1 (&union_set
, next_node
);
3331 if (BE (err
!= REG_NOERROR
, 0))
3334 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3335 re_node_set_free (&union_set
);
3336 if (BE (mctx
->state_log
[to_idx
] == NULL
3337 && err
!= REG_NOERROR
, 0))
3341 while (ent
++->more
);
3345 /* Build transition table for the state.
3346 Return true if successful. */
3350 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3355 bool need_word_trtable
= false;
3356 bitset_word_t elem
, mask
;
3357 bool dests_node_malloced
= false;
3358 bool dest_states_malloced
= false;
3359 Idx ndests
; /* Number of the destination states from 'state'. */
3360 re_dfastate_t
**trtable
;
3361 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3362 re_node_set follows
, *dests_node
;
3364 bitset_t acceptable
;
3368 re_node_set dests_node
[SBC_MAX
];
3369 bitset_t dests_ch
[SBC_MAX
];
3372 /* We build DFA states which corresponds to the destination nodes
3373 from 'state'. 'dests_node[i]' represents the nodes which i-th
3374 destination state contains, and 'dests_ch[i]' represents the
3375 characters which i-th destination state accepts. */
3376 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3377 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3380 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3381 if (BE (dests_alloc
== NULL
, 0))
3383 dests_node_malloced
= true;
3385 dests_node
= dests_alloc
->dests_node
;
3386 dests_ch
= dests_alloc
->dests_ch
;
3388 /* Initialize transition table. */
3389 state
->word_trtable
= state
->trtable
= NULL
;
3391 /* At first, group all nodes belonging to 'state' into several
3393 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3394 if (BE (! REG_VALID_NONZERO_INDEX (ndests
), 0))
3396 if (dests_node_malloced
)
3398 /* Return false in case of an error, true otherwise. */
3401 state
->trtable
= (re_dfastate_t
**)
3402 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3403 if (BE (state
->trtable
== NULL
, 0))
3410 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3411 if (BE (err
!= REG_NOERROR
, 0))
3414 /* Avoid arithmetic overflow in size calculation. */
3415 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3416 / (3 * sizeof (re_dfastate_t
*)))
3421 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3422 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3423 dest_states
= (re_dfastate_t
**)
3424 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3427 dest_states
= (re_dfastate_t
**)
3428 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3429 if (BE (dest_states
== NULL
, 0))
3432 if (dest_states_malloced
)
3434 re_node_set_free (&follows
);
3435 for (i
= 0; i
< ndests
; ++i
)
3436 re_node_set_free (dests_node
+ i
);
3437 if (dests_node_malloced
)
3441 dest_states_malloced
= true;
3443 dest_states_word
= dest_states
+ ndests
;
3444 dest_states_nl
= dest_states_word
+ ndests
;
3445 bitset_empty (acceptable
);
3447 /* Then build the states for all destinations. */
3448 for (i
= 0; i
< ndests
; ++i
)
3451 re_node_set_empty (&follows
);
3452 /* Merge the follows of this destination states. */
3453 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3455 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3456 if (next_node
!= REG_MISSING
)
3458 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3459 if (BE (err
!= REG_NOERROR
, 0))
3463 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3464 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3466 /* If the new state has context constraint,
3467 build appropriate states for these contexts. */
3468 if (dest_states
[i
]->has_constraint
)
3470 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3472 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3475 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3476 need_word_trtable
= true;
3478 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3480 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3485 dest_states_word
[i
] = dest_states
[i
];
3486 dest_states_nl
[i
] = dest_states
[i
];
3488 bitset_merge (acceptable
, dests_ch
[i
]);
3491 if (!BE (need_word_trtable
, 0))
3493 /* We don't care about whether the following character is a word
3494 character, or we are in a single-byte character set so we can
3495 discern by looking at the character code: allocate a
3496 256-entry transition table. */
3497 trtable
= state
->trtable
=
3498 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3499 if (BE (trtable
== NULL
, 0))
3502 /* For all characters ch...: */
3503 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3504 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3506 mask
<<= 1, elem
>>= 1, ++ch
)
3507 if (BE (elem
& 1, 0))
3509 /* There must be exactly one destination which accepts
3510 character ch. See group_nodes_into_DFAstates. */
3511 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3514 /* j-th destination accepts the word character ch. */
3515 if (dfa
->word_char
[i
] & mask
)
3516 trtable
[ch
] = dest_states_word
[j
];
3518 trtable
[ch
] = dest_states
[j
];
3523 /* We care about whether the following character is a word
3524 character, and we are in a multi-byte character set: discern
3525 by looking at the character code: build two 256-entry
3526 transition tables, one starting at trtable[0] and one
3527 starting at trtable[SBC_MAX]. */
3528 trtable
= state
->word_trtable
=
3529 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3530 if (BE (trtable
== NULL
, 0))
3533 /* For all characters ch...: */
3534 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3535 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3537 mask
<<= 1, elem
>>= 1, ++ch
)
3538 if (BE (elem
& 1, 0))
3540 /* There must be exactly one destination which accepts
3541 character ch. See group_nodes_into_DFAstates. */
3542 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3545 /* j-th destination accepts the word character ch. */
3546 trtable
[ch
] = dest_states
[j
];
3547 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3552 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3554 /* The current state accepts newline character. */
3555 for (j
= 0; j
< ndests
; ++j
)
3556 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3558 /* k-th destination accepts newline character. */
3559 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3560 if (need_word_trtable
)
3561 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3562 /* There must be only one destination which accepts
3563 newline. See group_nodes_into_DFAstates. */
3568 if (dest_states_malloced
)
3571 re_node_set_free (&follows
);
3572 for (i
= 0; i
< ndests
; ++i
)
3573 re_node_set_free (dests_node
+ i
);
3575 if (dests_node_malloced
)
3581 /* Group all nodes belonging to STATE into several destinations.
3582 Then for all destinations, set the nodes belonging to the destination
3583 to DESTS_NODE[i] and set the characters accepted by the destination
3584 to DEST_CH[i]. This function return the number of destinations. */
3588 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3589 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3594 Idx ndests
; /* Number of the destinations from 'state'. */
3595 bitset_t accepts
; /* Characters a node can accept. */
3596 const re_node_set
*cur_nodes
= &state
->nodes
;
3597 bitset_empty (accepts
);
3600 /* For all the nodes belonging to 'state', */
3601 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3603 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3604 re_token_type_t type
= node
->type
;
3605 unsigned int constraint
= node
->constraint
;
3607 /* Enumerate all single byte character this node can accept. */
3608 if (type
== CHARACTER
)
3609 bitset_set (accepts
, node
->opr
.c
);
3610 else if (type
== SIMPLE_BRACKET
)
3612 bitset_merge (accepts
, node
->opr
.sbcset
);
3614 else if (type
== OP_PERIOD
)
3616 #ifdef RE_ENABLE_I18N
3617 if (dfa
->mb_cur_max
> 1)
3618 bitset_merge (accepts
, dfa
->sb_char
);
3621 bitset_set_all (accepts
);
3622 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3623 bitset_clear (accepts
, '\n');
3624 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3625 bitset_clear (accepts
, '\0');
3627 #ifdef RE_ENABLE_I18N
3628 else if (type
== OP_UTF8_PERIOD
)
3630 if (ASCII_CHARS
% BITSET_WORD_BITS
== 0)
3631 memset (accepts
, -1, ASCII_CHARS
/ CHAR_BIT
);
3633 bitset_merge (accepts
, utf8_sb_map
);
3634 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3635 bitset_clear (accepts
, '\n');
3636 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3637 bitset_clear (accepts
, '\0');
3643 /* Check the 'accepts' and sift the characters which are not
3644 match it the context. */
3647 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3649 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3650 bitset_empty (accepts
);
3651 if (accepts_newline
)
3652 bitset_set (accepts
, NEWLINE_CHAR
);
3656 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3658 bitset_empty (accepts
);
3662 if (constraint
& NEXT_WORD_CONSTRAINT
)
3664 bitset_word_t any_set
= 0;
3665 if (type
== CHARACTER
&& !node
->word_char
)
3667 bitset_empty (accepts
);
3670 #ifdef RE_ENABLE_I18N
3671 if (dfa
->mb_cur_max
> 1)
3672 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3673 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3676 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3677 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3681 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3683 bitset_word_t any_set
= 0;
3684 if (type
== CHARACTER
&& node
->word_char
)
3686 bitset_empty (accepts
);
3689 #ifdef RE_ENABLE_I18N
3690 if (dfa
->mb_cur_max
> 1)
3691 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3692 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3695 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3696 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3702 /* Then divide 'accepts' into DFA states, or create a new
3703 state. Above, we make sure that accepts is not empty. */
3704 for (j
= 0; j
< ndests
; ++j
)
3706 bitset_t intersec
; /* Intersection sets, see below. */
3708 /* Flags, see below. */
3709 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3711 /* Optimization, skip if this state doesn't accept the character. */
3712 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3715 /* Enumerate the intersection set of this state and 'accepts'. */
3717 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3718 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3719 /* And skip if the intersection set is empty. */
3723 /* Then check if this state is a subset of 'accepts'. */
3724 not_subset
= not_consumed
= 0;
3725 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3727 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3728 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3731 /* If this state isn't a subset of 'accepts', create a
3732 new group state, which has the 'remains'. */
3735 bitset_copy (dests_ch
[ndests
], remains
);
3736 bitset_copy (dests_ch
[j
], intersec
);
3737 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3738 if (BE (err
!= REG_NOERROR
, 0))
3743 /* Put the position in the current group. */
3744 ok
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3748 /* If all characters are consumed, go to next node. */
3752 /* Some characters remain, create a new group. */
3755 bitset_copy (dests_ch
[ndests
], accepts
);
3756 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3757 if (BE (err
!= REG_NOERROR
, 0))
3760 bitset_empty (accepts
);
3765 for (j
= 0; j
< ndests
; ++j
)
3766 re_node_set_free (dests_node
+ j
);
3770 #ifdef RE_ENABLE_I18N
3771 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3772 Return the number of the bytes the node accepts.
3773 STR_IDX is the current index of the input string.
3775 This function handles the nodes which can accept one character, or
3776 one collating element like '.', '[a-z]', opposite to the other nodes
3777 can only accept one byte. */
3780 # include <locale/weight.h>
3785 check_node_accept_bytes (const re_dfa_t
*dfa
, Idx node_idx
,
3786 const re_string_t
*input
, Idx str_idx
)
3788 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3789 int char_len
, elem_len
;
3792 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3794 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3795 if (BE (c
< 0xc2, 1))
3798 if (str_idx
+ 2 > input
->len
)
3801 d
= re_string_byte_at (input
, str_idx
+ 1);
3803 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3807 if (c
== 0xe0 && d
< 0xa0)
3813 if (c
== 0xf0 && d
< 0x90)
3819 if (c
== 0xf8 && d
< 0x88)
3825 if (c
== 0xfc && d
< 0x84)
3831 if (str_idx
+ char_len
> input
->len
)
3834 for (i
= 1; i
< char_len
; ++i
)
3836 d
= re_string_byte_at (input
, str_idx
+ i
);
3837 if (d
< 0x80 || d
> 0xbf)
3843 char_len
= re_string_char_size_at (input
, str_idx
);
3844 if (node
->type
== OP_PERIOD
)
3848 /* FIXME: I don't think this if is needed, as both '\n'
3849 and '\0' are char_len == 1. */
3850 /* '.' accepts any one character except the following two cases. */
3851 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3852 re_string_byte_at (input
, str_idx
) == '\n') ||
3853 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3854 re_string_byte_at (input
, str_idx
) == '\0'))
3859 elem_len
= re_string_elem_size_at (input
, str_idx
);
3860 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3863 if (node
->type
== COMPLEX_BRACKET
)
3865 const re_charset_t
*cset
= node
->opr
.mbcset
;
3867 const unsigned char *pin
3868 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3873 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3874 ? re_string_wchar_at (input
, str_idx
) : 0);
3876 /* match with multibyte character? */
3877 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3878 if (wc
== cset
->mbchars
[i
])
3880 match_len
= char_len
;
3881 goto check_node_accept_bytes_match
;
3883 /* match with character_class? */
3884 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3886 wctype_t wt
= cset
->char_classes
[i
];
3887 if (__iswctype (wc
, wt
))
3889 match_len
= char_len
;
3890 goto check_node_accept_bytes_match
;
3895 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3898 unsigned int in_collseq
= 0;
3899 const int32_t *table
, *indirect
;
3900 const unsigned char *weights
, *extra
;
3901 const char *collseqwc
;
3903 /* match with collating_symbol? */
3904 if (cset
->ncoll_syms
)
3905 extra
= (const unsigned char *)
3906 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3907 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3909 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3910 /* Compare the length of input collating element and
3911 the length of current collating element. */
3912 if (*coll_sym
!= elem_len
)
3914 /* Compare each bytes. */
3915 for (j
= 0; j
< *coll_sym
; j
++)
3916 if (pin
[j
] != coll_sym
[1 + j
])
3920 /* Match if every bytes is equal. */
3922 goto check_node_accept_bytes_match
;
3928 if (elem_len
<= char_len
)
3930 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3931 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3934 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3936 /* match with range expression? */
3937 /* FIXME: Implement rational ranges here, too. */
3938 for (i
= 0; i
< cset
->nranges
; ++i
)
3939 if (cset
->range_starts
[i
] <= in_collseq
3940 && in_collseq
<= cset
->range_ends
[i
])
3942 match_len
= elem_len
;
3943 goto check_node_accept_bytes_match
;
3946 /* match with equivalence_class? */
3947 if (cset
->nequiv_classes
)
3949 const unsigned char *cp
= pin
;
3950 table
= (const int32_t *)
3951 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3952 weights
= (const unsigned char *)
3953 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3954 extra
= (const unsigned char *)
3955 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3956 indirect
= (const int32_t *)
3957 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3958 int32_t idx
= findidx (table
, indirect
, extra
, &cp
, elem_len
);
3960 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3962 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3963 size_t weight_len
= weights
[idx
& 0xffffff];
3964 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3965 && (idx
>> 24) == (equiv_class_idx
>> 24))
3970 equiv_class_idx
&= 0xffffff;
3972 while (cnt
<= weight_len
3973 && (weights
[equiv_class_idx
+ 1 + cnt
]
3974 == weights
[idx
+ 1 + cnt
]))
3976 if (cnt
> weight_len
)
3978 match_len
= elem_len
;
3979 goto check_node_accept_bytes_match
;
3988 /* match with range expression? */
3989 for (i
= 0; i
< cset
->nranges
; ++i
)
3991 if (cset
->range_starts
[i
] <= wc
&& wc
<= cset
->range_ends
[i
])
3993 match_len
= char_len
;
3994 goto check_node_accept_bytes_match
;
3998 check_node_accept_bytes_match
:
3999 if (!cset
->non_match
)
4006 return (elem_len
> char_len
) ? elem_len
: char_len
;
4015 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
4017 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
4022 /* No valid character. Match it as a single byte character. */
4023 const unsigned char *collseq
= (const unsigned char *)
4024 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
4025 return collseq
[mbs
[0]];
4032 const unsigned char *extra
= (const unsigned char *)
4033 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4034 int32_t extrasize
= (const unsigned char *)
4035 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4037 for (idx
= 0; idx
< extrasize
;)
4041 int32_t elem_mbs_len
;
4042 /* Skip the name of collating element name. */
4043 idx
= idx
+ extra
[idx
] + 1;
4044 elem_mbs_len
= extra
[idx
++];
4045 if (mbs_len
== elem_mbs_len
)
4047 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4048 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4050 if (mbs_cnt
== elem_mbs_len
)
4051 /* Found the entry. */
4054 /* Skip the byte sequence of the collating element. */
4055 idx
+= elem_mbs_len
;
4056 /* Adjust for the alignment. */
4057 idx
= (idx
+ 3) & ~3;
4058 /* Skip the collation sequence value. */
4059 idx
+= sizeof (uint32_t);
4060 /* Skip the wide char sequence of the collating element. */
4061 idx
= idx
+ sizeof (uint32_t) * (*(int32_t *) (extra
+ idx
) + 1);
4062 /* If we found the entry, return the sequence value. */
4064 return *(uint32_t *) (extra
+ idx
);
4065 /* Skip the collation sequence value. */
4066 idx
+= sizeof (uint32_t);
4072 #endif /* RE_ENABLE_I18N */
4074 /* Check whether the node accepts the byte which is IDX-th
4075 byte of the INPUT. */
4079 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4083 ch
= re_string_byte_at (&mctx
->input
, idx
);
4087 if (node
->opr
.c
!= ch
)
4091 case SIMPLE_BRACKET
:
4092 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4096 #ifdef RE_ENABLE_I18N
4097 case OP_UTF8_PERIOD
:
4098 if (ch
>= ASCII_CHARS
)
4103 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4104 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4112 if (node
->constraint
)
4114 /* The node has constraints. Check whether the current context
4115 satisfies the constraints. */
4116 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4118 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4125 /* Extend the buffers, if the buffers have run out. */
4127 static reg_errcode_t
4128 internal_function __attribute_warn_unused_result__
4129 extend_buffers (re_match_context_t
*mctx
, int min_len
)
4132 re_string_t
*pstr
= &mctx
->input
;
4134 /* Avoid overflow. */
4135 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ sizeof (re_dfastate_t
*)) / 2
4136 <= pstr
->bufs_len
, 0))
4139 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4140 ret
= re_string_realloc_buffers (pstr
,
4142 MIN (pstr
->len
, pstr
->bufs_len
* 2)));
4143 if (BE (ret
!= REG_NOERROR
, 0))
4146 if (mctx
->state_log
!= NULL
)
4148 /* And double the length of state_log. */
4149 /* XXX We have no indication of the size of this buffer. If this
4150 allocation fail we have no indication that the state_log array
4151 does not have the right size. */
4152 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4153 pstr
->bufs_len
+ 1);
4154 if (BE (new_array
== NULL
, 0))
4156 mctx
->state_log
= new_array
;
4159 /* Then reconstruct the buffers. */
4162 #ifdef RE_ENABLE_I18N
4163 if (pstr
->mb_cur_max
> 1)
4165 ret
= build_wcs_upper_buffer (pstr
);
4166 if (BE (ret
!= REG_NOERROR
, 0))
4170 #endif /* RE_ENABLE_I18N */
4171 build_upper_buffer (pstr
);
4175 #ifdef RE_ENABLE_I18N
4176 if (pstr
->mb_cur_max
> 1)
4177 build_wcs_buffer (pstr
);
4179 #endif /* RE_ENABLE_I18N */
4181 if (pstr
->trans
!= NULL
)
4182 re_string_translate_buffer (pstr
);
4189 /* Functions for matching context. */
4191 /* Initialize MCTX. */
4193 static reg_errcode_t
4194 internal_function __attribute_warn_unused_result__
4195 match_ctx_init (re_match_context_t
*mctx
, int eflags
, Idx n
)
4197 mctx
->eflags
= eflags
;
4198 mctx
->match_last
= REG_MISSING
;
4201 /* Avoid overflow. */
4202 size_t max_object_size
=
4203 MAX (sizeof (struct re_backref_cache_entry
),
4204 sizeof (re_sub_match_top_t
*));
4205 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) < n
, 0))
4208 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4209 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4210 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4213 /* Already zero-ed by the caller.
4215 mctx->bkref_ents = NULL;
4216 mctx->nbkref_ents = 0;
4217 mctx->nsub_tops = 0; */
4218 mctx
->abkref_ents
= n
;
4219 mctx
->max_mb_elem_len
= 1;
4220 mctx
->asub_tops
= n
;
4224 /* Clean the entries which depend on the current input in MCTX.
4225 This function must be invoked when the matcher changes the start index
4226 of the input, or changes the input string. */
4230 match_ctx_clean (re_match_context_t
*mctx
)
4233 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4236 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4237 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4239 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4240 re_free (last
->path
.array
);
4243 re_free (top
->lasts
);
4246 re_free (top
->path
->array
);
4247 re_free (top
->path
);
4252 mctx
->nsub_tops
= 0;
4253 mctx
->nbkref_ents
= 0;
4256 /* Free all the memory associated with MCTX. */
4260 match_ctx_free (re_match_context_t
*mctx
)
4262 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4263 match_ctx_clean (mctx
);
4264 re_free (mctx
->sub_tops
);
4265 re_free (mctx
->bkref_ents
);
4268 /* Add a new backreference entry to MCTX.
4269 Note that we assume that caller never call this function with duplicate
4270 entry, and call with STR_IDX which isn't smaller than any existing entry.
4273 static reg_errcode_t
4274 internal_function __attribute_warn_unused_result__
4275 match_ctx_add_entry (re_match_context_t
*mctx
, Idx node
, Idx str_idx
, Idx from
,
4278 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4280 struct re_backref_cache_entry
* new_entry
;
4281 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4282 mctx
->abkref_ents
* 2);
4283 if (BE (new_entry
== NULL
, 0))
4285 re_free (mctx
->bkref_ents
);
4288 mctx
->bkref_ents
= new_entry
;
4289 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4290 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4291 mctx
->abkref_ents
*= 2;
4293 if (mctx
->nbkref_ents
> 0
4294 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4295 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4297 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4298 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4299 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4300 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4302 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4303 If bit N is clear, means that this entry won't epsilon-transition to
4304 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4305 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4308 A backreference does not epsilon-transition unless it is empty, so set
4309 to all zeros if FROM != TO. */
4310 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4311 = (from
== to
? -1 : 0);
4313 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4314 if (mctx
->max_mb_elem_len
< to
- from
)
4315 mctx
->max_mb_elem_len
= to
- from
;
4319 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4320 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4324 search_cur_bkref_entry (const re_match_context_t
*mctx
, Idx str_idx
)
4326 Idx left
, right
, mid
, last
;
4327 last
= right
= mctx
->nbkref_ents
;
4328 for (left
= 0; left
< right
;)
4330 mid
= (left
+ right
) / 2;
4331 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4336 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4342 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4345 static reg_errcode_t
4346 internal_function __attribute_warn_unused_result__
4347 match_ctx_add_subtop (re_match_context_t
*mctx
, Idx node
, Idx str_idx
)
4350 assert (mctx
->sub_tops
!= NULL
);
4351 assert (mctx
->asub_tops
> 0);
4353 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4355 Idx new_asub_tops
= mctx
->asub_tops
* 2;
4356 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4357 re_sub_match_top_t
*,
4359 if (BE (new_array
== NULL
, 0))
4361 mctx
->sub_tops
= new_array
;
4362 mctx
->asub_tops
= new_asub_tops
;
4364 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4365 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4367 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4368 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4372 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4373 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4375 static re_sub_match_last_t
*
4377 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, Idx node
, Idx str_idx
)
4379 re_sub_match_last_t
*new_entry
;
4380 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4382 Idx new_alasts
= 2 * subtop
->alasts
+ 1;
4383 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4384 re_sub_match_last_t
*,
4386 if (BE (new_array
== NULL
, 0))
4388 subtop
->lasts
= new_array
;
4389 subtop
->alasts
= new_alasts
;
4391 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4392 if (BE (new_entry
!= NULL
, 1))
4394 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4395 new_entry
->node
= node
;
4396 new_entry
->str_idx
= str_idx
;
4404 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4405 re_dfastate_t
**limited_sts
, Idx last_node
, Idx last_str_idx
)
4407 sctx
->sifted_states
= sifted_sts
;
4408 sctx
->limited_states
= limited_sts
;
4409 sctx
->last_node
= last_node
;
4410 sctx
->last_str_idx
= last_str_idx
;
4411 re_node_set_init_empty (&sctx
->limits
);