1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
22 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
23 int n
) internal_function
;
24 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
25 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
26 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
27 int str_idx
, int from
, int to
)
29 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
31 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
32 int str_idx
) internal_function
;
33 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
34 int node
, int str_idx
)
36 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
37 re_dfastate_t
**limited_sts
, int last_node
,
40 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
41 const char *string
, int length
,
42 int start
, int range
, int stop
,
43 size_t nmatch
, regmatch_t pmatch
[],
44 int eflags
) internal_function
;
45 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
46 const char *string1
, int length1
,
47 const char *string2
, int length2
,
48 int start
, int range
, struct re_registers
*regs
,
49 int stop
, int ret_len
) internal_function
;
50 static int re_search_stub (struct re_pattern_buffer
*bufp
,
51 const char *string
, int length
, int start
,
52 int range
, int stop
, struct re_registers
*regs
,
53 int ret_len
) internal_function
;
54 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
55 int nregs
, int regs_allocated
) internal_function
;
56 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
58 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
59 int *p_match_first
) internal_function
;
60 static int check_halt_state_context (const re_match_context_t
*mctx
,
61 const re_dfastate_t
*state
, int idx
)
63 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
64 regmatch_t
*prev_idx_match
, int cur_node
,
65 int cur_idx
, int nmatch
) internal_function
;
66 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
67 int str_idx
, int dest_node
, int nregs
,
69 re_node_set
*eps_via_nodes
)
71 static reg_errcode_t
set_regs (const regex_t
*preg
,
72 const re_match_context_t
*mctx
,
73 size_t nmatch
, regmatch_t
*pmatch
,
74 int fl_backtrack
) internal_function
;
75 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
79 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
80 re_sift_context_t
*sctx
,
81 int node_idx
, int str_idx
, int max_str_idx
)
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
85 re_sift_context_t
*sctx
)
87 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
88 re_sift_context_t
*sctx
, int str_idx
,
89 re_node_set
*cur_dest
)
91 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
92 re_sift_context_t
*sctx
,
94 re_node_set
*dest_nodes
)
96 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
97 re_node_set
*dest_nodes
,
98 const re_node_set
*candidates
)
100 static int check_dst_limits (const re_match_context_t
*mctx
,
102 int dst_node
, int dst_idx
, int src_node
,
103 int src_idx
) internal_function
;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
105 int boundaries
, int subexp_idx
,
106 int from_node
, int bkref_idx
)
108 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
109 int limit
, int subexp_idx
,
110 int node
, int str_idx
,
111 int bkref_idx
) internal_function
;
112 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
113 re_node_set
*dest_nodes
,
114 const re_node_set
*candidates
,
116 struct re_backref_cache_entry
*bkref_ents
,
117 int str_idx
) internal_function
;
118 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
119 re_sift_context_t
*sctx
,
120 int str_idx
, const re_node_set
*candidates
)
122 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
124 re_dfastate_t
**src
, int num
)
126 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
127 re_match_context_t
*mctx
) internal_function
;
128 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
129 re_match_context_t
*mctx
,
130 re_dfastate_t
*state
) internal_function
;
131 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
132 re_match_context_t
*mctx
,
133 re_dfastate_t
*next_state
)
135 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
136 re_node_set
*cur_nodes
,
137 int str_idx
) internal_function
;
139 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
140 re_match_context_t
*mctx
,
141 re_dfastate_t
*pstate
)
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
146 re_dfastate_t
*pstate
)
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
150 const re_node_set
*nodes
)
152 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
153 int bkref_node
, int bkref_str_idx
)
155 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
156 const re_sub_match_top_t
*sub_top
,
157 re_sub_match_last_t
*sub_last
,
158 int bkref_node
, int bkref_str
)
160 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
161 int subexp_idx
, int type
) internal_function
;
162 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
163 state_array_t
*path
, int top_node
,
164 int top_str
, int last_node
, int last_str
,
165 int type
) internal_function
;
166 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
168 re_node_set
*cur_nodes
,
169 re_node_set
*next_nodes
)
171 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
172 re_node_set
*cur_nodes
,
173 int ex_subexp
, int type
)
175 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
176 re_node_set
*dst_nodes
,
177 int target
, int ex_subexp
,
178 int type
) internal_function
;
179 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
180 re_node_set
*cur_nodes
, int cur_str
,
181 int subexp_num
, int type
)
183 static int build_trtable (const re_dfa_t
*dfa
,
184 re_dfastate_t
*state
) internal_function
;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
187 const re_string_t
*input
, int idx
)
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
194 #endif /* RE_ENABLE_I18N */
195 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
196 const re_dfastate_t
*state
,
197 re_node_set
*states_node
,
198 bitset_t
*states_ch
) internal_function
;
199 static int check_node_accept (const re_match_context_t
*mctx
,
200 const re_token_t
*node
, int idx
)
202 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
, int min_len
)
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (const regex_t
*__restrict preg
, const char *__restrict string
,
223 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
227 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
229 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
232 if (eflags
& REG_STARTEND
)
234 start
= pmatch
[0].rm_so
;
235 length
= pmatch
[0].rm_eo
;
240 length
= strlen (string
);
243 __libc_lock_lock (dfa
->lock
);
245 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
246 length
, 0, NULL
, eflags
);
248 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
249 length
, nmatch
, pmatch
, eflags
);
250 __libc_lock_unlock (dfa
->lock
);
251 return err
!= REG_NOERROR
;
255 # include <shlib-compat.h>
256 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec
) __compat_regexec
;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t
*__restrict preg
,
264 const char *__restrict string
, size_t nmatch
,
265 regmatch_t pmatch
[], int eflags
)
267 return regexec (preg
, string
, nmatch
, pmatch
,
268 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
270 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
304 re_match (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
305 int start
, struct re_registers
*regs
)
307 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
310 weak_alias (__re_match
, re_match
)
314 re_search (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
315 int start
, int range
, struct re_registers
*regs
)
317 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
320 weak_alias (__re_search
, re_search
)
324 re_match_2 (struct re_pattern_buffer
*bufp
, const char *string1
, int length1
,
325 const char *string2
, int length2
, int start
,
326 struct re_registers
*regs
, int stop
)
328 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
329 start
, 0, regs
, stop
, 1);
332 weak_alias (__re_match_2
, re_match_2
)
336 re_search_2 (struct re_pattern_buffer
*bufp
, const char *string1
, int length1
,
337 const char *string2
, int length2
, int start
, int range
,
338 struct re_registers
*regs
, int stop
)
340 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
341 start
, range
, regs
, stop
, 0);
344 weak_alias (__re_search_2
, re_search_2
)
349 re_search_2_stub (struct re_pattern_buffer
*bufp
, const char *string1
,
350 int length1
, const char *string2
, int length2
, int start
,
351 int range
, struct re_registers
*regs
,
352 int stop
, int ret_len
)
356 int len
= length1
+ length2
;
359 if (BE (length1
< 0 || length2
< 0 || stop
< 0 || len
< length1
, 0))
362 /* Concatenate the strings. */
366 s
= re_malloc (char, len
);
368 if (BE (s
== NULL
, 0))
371 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
373 memcpy (s
, string1
, length1
);
374 memcpy (s
+ length1
, string2
, length2
);
383 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
388 /* The parameters have the same meaning as those of re_search.
389 Additional parameters:
390 If RET_LEN is nonzero the length of the match is returned (re_match style);
391 otherwise the position of the match is returned. */
395 re_search_stub (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
396 int start
, int range
, int stop
, struct re_registers
*regs
,
399 reg_errcode_t result
;
403 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
405 /* Check for out-of-range. */
406 if (BE (start
< 0 || start
> length
, 0))
408 if (BE (start
+ range
> length
, 0))
409 range
= length
- start
;
410 else if (BE (start
+ range
< 0, 0))
413 __libc_lock_lock (dfa
->lock
);
415 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
416 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
418 /* Compile fastmap if we haven't yet. */
419 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
420 re_compile_fastmap (bufp
);
422 if (BE (bufp
->no_sub
, 0))
425 /* We need at least 1 register. */
428 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
429 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
431 nregs
= regs
->num_regs
;
432 if (BE (nregs
< 1, 0))
434 /* Nothing can be copied to regs. */
440 nregs
= bufp
->re_nsub
+ 1;
441 pmatch
= re_malloc (regmatch_t
, nregs
);
442 if (BE (pmatch
== NULL
, 0))
448 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
449 nregs
, pmatch
, eflags
);
453 /* I hope we needn't fill ther regs with -1's when no match was found. */
454 if (result
!= REG_NOERROR
)
456 else if (regs
!= NULL
)
458 /* If caller wants register contents data back, copy them. */
459 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
460 bufp
->regs_allocated
);
461 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
465 if (BE (rval
== 0, 1))
469 assert (pmatch
[0].rm_so
== start
);
470 rval
= pmatch
[0].rm_eo
- start
;
473 rval
= pmatch
[0].rm_so
;
477 __libc_lock_unlock (dfa
->lock
);
483 re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
, int nregs
,
486 int rval
= REGS_REALLOCATE
;
488 int need_regs
= nregs
+ 1;
489 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
492 /* Have the register data arrays been allocated? */
493 if (regs_allocated
== REGS_UNALLOCATED
)
494 { /* No. So allocate them with malloc. */
495 regs
->start
= re_malloc (regoff_t
, need_regs
);
496 if (BE (regs
->start
== NULL
, 0))
497 return REGS_UNALLOCATED
;
498 regs
->end
= re_malloc (regoff_t
, need_regs
);
499 if (BE (regs
->end
== NULL
, 0))
501 re_free (regs
->start
);
502 return REGS_UNALLOCATED
;
504 regs
->num_regs
= need_regs
;
506 else if (regs_allocated
== REGS_REALLOCATE
)
507 { /* Yes. If we need more elements than were already
508 allocated, reallocate them. If we need fewer, just
510 if (BE (need_regs
> regs
->num_regs
, 0))
512 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
514 if (BE (new_start
== NULL
, 0))
515 return REGS_UNALLOCATED
;
516 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
517 if (BE (new_end
== NULL
, 0))
520 return REGS_UNALLOCATED
;
522 regs
->start
= new_start
;
524 regs
->num_regs
= need_regs
;
529 assert (regs_allocated
== REGS_FIXED
);
530 /* This function may not be called with REGS_FIXED and nregs too big. */
531 assert (regs
->num_regs
>= nregs
);
536 for (i
= 0; i
< nregs
; ++i
)
538 regs
->start
[i
] = pmatch
[i
].rm_so
;
539 regs
->end
[i
] = pmatch
[i
].rm_eo
;
541 for ( ; i
< regs
->num_regs
; ++i
)
542 regs
->start
[i
] = regs
->end
[i
] = -1;
547 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
548 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
549 this memory for recording register information. STARTS and ENDS
550 must be allocated using the malloc library routine, and must each
551 be at least NUM_REGS * sizeof (regoff_t) bytes long.
553 If NUM_REGS == 0, then subsequent matches should allocate their own
556 Unless this function is called, the first search or match using
557 PATTERN_BUFFER will allocate its own register data, without
558 freeing the old data. */
561 re_set_registers (struct re_pattern_buffer
*bufp
, struct re_registers
*regs
,
562 unsigned num_regs
, regoff_t
*starts
, regoff_t
*ends
)
566 bufp
->regs_allocated
= REGS_REALLOCATE
;
567 regs
->num_regs
= num_regs
;
568 regs
->start
= starts
;
573 bufp
->regs_allocated
= REGS_UNALLOCATED
;
575 regs
->start
= regs
->end
= (regoff_t
*) 0;
579 weak_alias (__re_set_registers
, re_set_registers
)
582 /* Entry points compatible with 4.2 BSD regex library. We don't define
583 them unless specifically requested. */
585 #if defined _REGEX_RE_COMP || defined _LIBC
590 re_exec (const char *s
)
592 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
594 #endif /* _REGEX_RE_COMP */
596 /* Internal entry point. */
598 /* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
608 __attribute_warn_unused_result__ internal_function
609 re_search_internal (const regex_t
*preg
, const char *string
, int length
,
610 int start
, int range
, int stop
, size_t nmatch
,
611 regmatch_t pmatch
[], int eflags
)
614 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
615 int left_lim
, right_lim
, incr
;
616 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
619 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
620 re_match_context_t mctx
= { .dfa
= dfa
};
622 re_match_context_t mctx
;
624 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
625 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
626 RE_TRANSLATE_TYPE t
= preg
->translate
;
628 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
629 memset (&mctx
, '\0', sizeof (re_match_context_t
));
633 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
634 nmatch
-= extra_nmatch
;
636 /* Check if the DFA haven't been compiled. */
637 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
638 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
639 || dfa
->init_state_begbuf
== NULL
, 0))
643 /* We assume front-end functions already check them. */
644 assert (start
+ range
>= 0 && start
+ range
<= length
);
647 /* If initial states with non-begbuf contexts have no elements,
648 the regex must be anchored. If preg->newline_anchor is set,
649 we'll never use init_state_nl, so do not check it. */
650 if (dfa
->init_state
->nodes
.nelem
== 0
651 && dfa
->init_state_word
->nodes
.nelem
== 0
652 && (dfa
->init_state_nl
->nodes
.nelem
== 0
653 || !preg
->newline_anchor
))
655 if (start
!= 0 && start
+ range
!= 0)
660 /* We must check the longest matching, if nmatch > 0. */
661 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
663 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
664 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
665 if (BE (err
!= REG_NOERROR
, 0))
667 mctx
.input
.stop
= stop
;
668 mctx
.input
.raw_stop
= stop
;
669 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
671 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
672 if (BE (err
!= REG_NOERROR
, 0))
675 /* We will log all the DFA states through which the dfa pass,
676 if nmatch > 1, or this dfa has "multibyte node", which is a
677 back-reference or a node which can accept multibyte character or
678 multi character collating element. */
679 if (nmatch
> 1 || dfa
->has_mb_node
)
681 /* Avoid overflow. */
682 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
688 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
689 if (BE (mctx
.state_log
== NULL
, 0))
696 mctx
.state_log
= NULL
;
699 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
702 /* Check incrementally whether of not the input string match. */
703 incr
= (range
< 0) ? -1 : 1;
704 left_lim
= (range
< 0) ? start
+ range
: start
;
705 right_lim
= (range
< 0) ? start
: start
+ range
;
706 sb
= dfa
->mb_cur_max
== 1;
709 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
710 | (range
>= 0 ? 2 : 0)
711 | (t
!= NULL
? 1 : 0))
714 for (;; match_first
+= incr
)
717 if (match_first
< left_lim
|| right_lim
< match_first
)
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first
< right_lim
, 1)
734 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
736 goto forward_match_found_start_or_reached_end
;
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first
< right_lim
, 1)
741 && !fastmap
[(unsigned char) string
[match_first
]])
744 forward_match_found_start_or_reached_end
:
745 if (BE (match_first
== right_lim
, 0))
747 ch
= match_first
>= length
748 ? 0 : (unsigned char) string
[match_first
];
749 if (!fastmap
[t
? t
[ch
] : ch
])
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first
>= left_lim
)
759 ch
= match_first
>= length
760 ? 0 : (unsigned char) string
[match_first
];
761 if (fastmap
[t
? t
[ch
] : ch
])
765 if (match_first
< left_lim
)
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
777 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
778 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
780 err
= re_string_reconstruct (&mctx
.input
, match_first
,
782 if (BE (err
!= REG_NOERROR
, 0))
785 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch
= (match_first
>= length
790 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
794 if (match_first
< left_lim
|| match_first
> right_lim
)
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
806 if (BE (err
!= REG_NOERROR
, 0))
809 #ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
819 match_last
= check_matching (&mctx
, fl_longest_match
,
820 range
>= 0 ? &match_first
: NULL
);
821 if (match_last
!= -1)
823 if (BE (match_last
== -2, 0))
830 mctx
.match_last
= match_last
;
831 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
833 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
834 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
837 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
840 err
= prune_impossible_nodes (&mctx
);
841 if (err
== REG_NOERROR
)
843 if (BE (err
!= REG_NOMATCH
, 0))
848 break; /* We found a match. */
852 match_ctx_clean (&mctx
);
856 assert (match_last
!= -1);
857 assert (err
== REG_NOERROR
);
860 /* Set pmatch[] if we need. */
865 /* Initialize registers. */
866 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
867 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
869 /* Set the points where matching start/end. */
871 pmatch
[0].rm_eo
= mctx
.match_last
;
873 if (!preg
->no_sub
&& nmatch
> 1)
875 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
876 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
877 if (BE (err
!= REG_NOERROR
, 0))
881 /* At last, add the offset to the each registers, since we slided
882 the buffers so that we could assume that the matching starts
884 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
885 if (pmatch
[reg_idx
].rm_so
!= -1)
887 #ifdef RE_ENABLE_I18N
888 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
890 pmatch
[reg_idx
].rm_so
=
891 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
892 ? mctx
.input
.valid_raw_len
893 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
894 pmatch
[reg_idx
].rm_eo
=
895 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
896 ? mctx
.input
.valid_raw_len
897 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
900 assert (mctx
.input
.offsets_needed
== 0);
902 pmatch
[reg_idx
].rm_so
+= match_first
;
903 pmatch
[reg_idx
].rm_eo
+= match_first
;
905 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
907 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
908 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
912 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
913 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
915 pmatch
[reg_idx
+ 1].rm_so
916 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
917 pmatch
[reg_idx
+ 1].rm_eo
918 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
923 re_free (mctx
.state_log
);
925 match_ctx_free (&mctx
);
926 re_string_destruct (&mctx
.input
);
931 internal_function __attribute_warn_unused_result__
932 prune_impossible_nodes (re_match_context_t
*mctx
)
934 const re_dfa_t
*const dfa
= mctx
->dfa
;
935 int halt_node
, match_last
;
937 re_dfastate_t
**sifted_states
;
938 re_dfastate_t
**lim_states
= NULL
;
939 re_sift_context_t sctx
;
941 assert (mctx
->state_log
!= NULL
);
943 match_last
= mctx
->match_last
;
944 halt_node
= mctx
->last_node
;
946 /* Avoid overflow. */
947 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
950 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
951 if (BE (sifted_states
== NULL
, 0))
958 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
959 if (BE (lim_states
== NULL
, 0))
966 memset (lim_states
, '\0',
967 sizeof (re_dfastate_t
*) * (match_last
+ 1));
968 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
970 ret
= sift_states_backward (mctx
, &sctx
);
971 re_node_set_free (&sctx
.limits
);
972 if (BE (ret
!= REG_NOERROR
, 0))
974 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
984 } while (mctx
->state_log
[match_last
] == NULL
985 || !mctx
->state_log
[match_last
]->halt
);
986 halt_node
= check_halt_state_context (mctx
,
987 mctx
->state_log
[match_last
],
990 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
992 re_free (lim_states
);
994 if (BE (ret
!= REG_NOERROR
, 0))
999 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1000 ret
= sift_states_backward (mctx
, &sctx
);
1001 re_node_set_free (&sctx
.limits
);
1002 if (BE (ret
!= REG_NOERROR
, 0))
1004 if (sifted_states
[0] == NULL
)
1010 re_free (mctx
->state_log
);
1011 mctx
->state_log
= sifted_states
;
1012 sifted_states
= NULL
;
1013 mctx
->last_node
= halt_node
;
1014 mctx
->match_last
= match_last
;
1017 re_free (sifted_states
);
1018 re_free (lim_states
);
1022 /* Acquire an initial state and return it.
1023 We must select appropriate initial state depending on the context,
1024 since initial states may have constraints like "\<", "^", etc.. */
1026 static inline re_dfastate_t
*
1027 __attribute ((always_inline
)) internal_function
1028 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1031 const re_dfa_t
*const dfa
= mctx
->dfa
;
1032 if (dfa
->init_state
->has_constraint
)
1034 unsigned int context
;
1035 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1036 if (IS_WORD_CONTEXT (context
))
1037 return dfa
->init_state_word
;
1038 else if (IS_ORDINARY_CONTEXT (context
))
1039 return dfa
->init_state
;
1040 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1041 return dfa
->init_state_begbuf
;
1042 else if (IS_NEWLINE_CONTEXT (context
))
1043 return dfa
->init_state_nl
;
1044 else if (IS_BEGBUF_CONTEXT (context
))
1046 /* It is relatively rare case, then calculate on demand. */
1047 return re_acquire_state_context (err
, dfa
,
1048 dfa
->init_state
->entrance_nodes
,
1052 /* Must not happen? */
1053 return dfa
->init_state
;
1056 return dfa
->init_state
;
1059 /* Check whether the regular expression match input string INPUT or not,
1060 and return the index where the matching end, return -1 if not match,
1061 or return -2 in case of an error.
1062 FL_LONGEST_MATCH means we want the POSIX longest matching.
1063 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1064 next place where we may want to try matching.
1065 Note that the matcher assume that the maching starts from the current
1066 index of the buffer. */
1069 internal_function __attribute_warn_unused_result__
1070 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1073 const re_dfa_t
*const dfa
= mctx
->dfa
;
1076 int match_last
= -1;
1077 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1078 re_dfastate_t
*cur_state
;
1079 int at_init_state
= p_match_first
!= NULL
;
1080 int next_start_idx
= cur_str_idx
;
1083 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1084 /* An initial state must not be NULL (invalid). */
1085 if (BE (cur_state
== NULL
, 0))
1087 assert (err
== REG_ESPACE
);
1091 if (mctx
->state_log
!= NULL
)
1093 mctx
->state_log
[cur_str_idx
] = cur_state
;
1095 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1096 later. E.g. Processing back references. */
1097 if (BE (dfa
->nbackref
, 0))
1100 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1101 if (BE (err
!= REG_NOERROR
, 0))
1104 if (cur_state
->has_backref
)
1106 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1107 if (BE (err
!= REG_NOERROR
, 0))
1113 /* If the RE accepts NULL string. */
1114 if (BE (cur_state
->halt
, 0))
1116 if (!cur_state
->has_constraint
1117 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1119 if (!fl_longest_match
)
1123 match_last
= cur_str_idx
;
1129 while (!re_string_eoi (&mctx
->input
))
1131 re_dfastate_t
*old_state
= cur_state
;
1132 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1134 if ((BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1135 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1136 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1137 && mctx
->input
.valid_len
< mctx
->input
.len
))
1139 err
= extend_buffers (mctx
, next_char_idx
+ 1);
1140 if (BE (err
!= REG_NOERROR
, 0))
1142 assert (err
== REG_ESPACE
);
1147 cur_state
= transit_state (&err
, mctx
, cur_state
);
1148 if (mctx
->state_log
!= NULL
)
1149 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1151 if (cur_state
== NULL
)
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err
!= REG_NOERROR
, 0))
1159 if (mctx
->state_log
== NULL
1160 || (match
&& !fl_longest_match
)
1161 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1165 if (BE (at_init_state
, 0))
1167 if (old_state
== cur_state
)
1168 next_start_idx
= next_char_idx
;
1173 if (cur_state
->halt
)
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state
->has_constraint
1178 || check_halt_state_context (mctx
, cur_state
,
1179 re_string_cur_idx (&mctx
->input
)))
1181 /* We found an appropriate halt state. */
1182 match_last
= re_string_cur_idx (&mctx
->input
);
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first
= NULL
;
1187 if (!fl_longest_match
)
1194 *p_match_first
+= next_start_idx
;
1199 /* Check NODE match the current context. */
1203 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1205 re_token_type_t type
= dfa
->nodes
[node
].type
;
1206 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1207 if (type
!= END_OF_RE
)
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1216 /* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1222 check_halt_state_context (const re_match_context_t
*mctx
,
1223 const re_dfastate_t
*state
, int idx
)
1226 unsigned int context
;
1228 assert (state
->halt
);
1230 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1231 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1232 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1233 return state
->nodes
.elems
[i
];
1237 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1244 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1245 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1246 struct re_fail_stack_t
*fs
)
1248 const re_dfa_t
*const dfa
= mctx
->dfa
;
1250 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1252 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1253 re_node_set
*edests
= &dfa
->edests
[node
];
1255 err
= re_node_set_insert (eps_via_nodes
, node
);
1256 if (BE (err
< 0, 0))
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1261 int candidate
= edests
->elems
[i
];
1262 if (!re_node_set_contains (cur_nodes
, candidate
))
1264 if (dest_node
== -1)
1265 dest_node
= candidate
;
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1276 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1280 /* We know we are going to exit. */
1289 re_token_type_t type
= dfa
->nodes
[node
].type
;
1291 #ifdef RE_ENABLE_I18N
1292 if (dfa
->nodes
[node
].accept_mb
)
1293 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1295 #endif /* RE_ENABLE_I18N */
1296 if (type
== OP_BACK_REF
)
1298 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1299 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1302 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1306 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1307 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1316 err
= re_node_set_insert (eps_via_nodes
, node
);
1317 if (BE (err
< 0, 0))
1319 dest_node
= dfa
->edests
[node
].elems
[0];
1320 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1327 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1329 int dest_node
= dfa
->nexts
[node
];
1330 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1331 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1332 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1335 re_node_set_empty (eps_via_nodes
);
1342 static reg_errcode_t
1343 internal_function __attribute_warn_unused_result__
1344 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1345 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1348 int num
= fs
->num
++;
1349 if (fs
->num
== fs
->alloc
)
1351 struct re_fail_stack_ent_t
*new_array
;
1352 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1354 if (new_array
== NULL
)
1357 fs
->stack
= new_array
;
1359 fs
->stack
[num
].idx
= str_idx
;
1360 fs
->stack
[num
].node
= dest_node
;
1361 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1362 if (fs
->stack
[num
].regs
== NULL
)
1364 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1365 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1371 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1372 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1374 int num
= --fs
->num
;
1376 *pidx
= fs
->stack
[num
].idx
;
1377 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1378 re_node_set_free (eps_via_nodes
);
1379 re_free (fs
->stack
[num
].regs
);
1380 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1381 return fs
->stack
[num
].node
;
1384 /* Set the positions where the subexpressions are starts/ends to registers
1386 Note: We assume that pmatch[0] is already set, and
1387 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1389 static reg_errcode_t
1390 internal_function __attribute_warn_unused_result__
1391 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1392 regmatch_t
*pmatch
, int fl_backtrack
)
1394 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1396 re_node_set eps_via_nodes
;
1397 struct re_fail_stack_t
*fs
;
1398 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1399 regmatch_t
*prev_idx_match
;
1400 int prev_idx_match_malloced
= 0;
1403 assert (nmatch
> 1);
1404 assert (mctx
->state_log
!= NULL
);
1409 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1410 if (fs
->stack
== NULL
)
1416 cur_node
= dfa
->init_node
;
1417 re_node_set_init_empty (&eps_via_nodes
);
1419 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1420 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1423 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1424 if (prev_idx_match
== NULL
)
1426 free_fail_stack_return (fs
);
1429 prev_idx_match_malloced
= 1;
1431 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1433 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1435 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1437 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1442 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1443 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1445 if (reg_idx
== nmatch
)
1447 re_node_set_free (&eps_via_nodes
);
1448 if (prev_idx_match_malloced
)
1449 re_free (prev_idx_match
);
1450 return free_fail_stack_return (fs
);
1452 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1457 re_node_set_free (&eps_via_nodes
);
1458 if (prev_idx_match_malloced
)
1459 re_free (prev_idx_match
);
1464 /* Proceed to next node. */
1465 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1466 &eps_via_nodes
, fs
);
1468 if (BE (cur_node
< 0, 0))
1470 if (BE (cur_node
== -2, 0))
1472 re_node_set_free (&eps_via_nodes
);
1473 if (prev_idx_match_malloced
)
1474 re_free (prev_idx_match
);
1475 free_fail_stack_return (fs
);
1479 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1483 re_node_set_free (&eps_via_nodes
);
1484 if (prev_idx_match_malloced
)
1485 re_free (prev_idx_match
);
1490 re_node_set_free (&eps_via_nodes
);
1491 if (prev_idx_match_malloced
)
1492 re_free (prev_idx_match
);
1493 return free_fail_stack_return (fs
);
1496 static reg_errcode_t
1498 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1503 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1505 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1506 re_free (fs
->stack
[fs_idx
].regs
);
1508 re_free (fs
->stack
);
1515 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1516 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1518 int type
= dfa
->nodes
[cur_node
].type
;
1519 if (type
== OP_OPEN_SUBEXP
)
1521 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1523 /* We are at the first node of this sub expression. */
1524 if (reg_num
< nmatch
)
1526 pmatch
[reg_num
].rm_so
= cur_idx
;
1527 pmatch
[reg_num
].rm_eo
= -1;
1530 else if (type
== OP_CLOSE_SUBEXP
)
1532 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1533 if (reg_num
< nmatch
)
1535 /* We are at the last node of this sub expression. */
1536 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1538 pmatch
[reg_num
].rm_eo
= cur_idx
;
1539 /* This is a non-empty match or we are not inside an optional
1540 subexpression. Accept this right away. */
1541 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1545 if (dfa
->nodes
[cur_node
].opt_subexp
1546 && prev_idx_match
[reg_num
].rm_so
!= -1)
1547 /* We transited through an empty match for an optional
1548 subexpression, like (a?)*, and this is not the subexp's
1549 first match. Copy back the old content of the registers
1550 so that matches of an inner subexpression are undone as
1551 well, like in ((a?))*. */
1552 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1554 /* We completed a subexpression, but it may be part of
1555 an optional one, so do not update PREV_IDX_MATCH. */
1556 pmatch
[reg_num
].rm_eo
= cur_idx
;
1562 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1563 and sift the nodes in each states according to the following rules.
1564 Updated state_log will be wrote to STATE_LOG.
1566 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1567 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1568 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1569 the LAST_NODE, we throw away the node `a'.
1570 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1571 string `s' and transit to `b':
1572 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1574 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1575 thrown away, we throw away the node `a'.
1576 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1577 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1579 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1580 we throw away the node `a'. */
1582 #define STATE_NODE_CONTAINS(state,node) \
1583 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1585 static reg_errcode_t
1587 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1591 int str_idx
= sctx
->last_str_idx
;
1592 re_node_set cur_dest
;
1595 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1598 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1599 transit to the last_node and the last_node itself. */
1600 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1601 if (BE (err
!= REG_NOERROR
, 0))
1603 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1604 if (BE (err
!= REG_NOERROR
, 0))
1607 /* Then check each states in the state_log. */
1610 /* Update counters. */
1611 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1612 if (null_cnt
> mctx
->max_mb_elem_len
)
1614 memset (sctx
->sifted_states
, '\0',
1615 sizeof (re_dfastate_t
*) * str_idx
);
1616 re_node_set_free (&cur_dest
);
1619 re_node_set_empty (&cur_dest
);
1622 if (mctx
->state_log
[str_idx
])
1624 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1625 if (BE (err
!= REG_NOERROR
, 0))
1629 /* Add all the nodes which satisfy the following conditions:
1630 - It can epsilon transit to a node in CUR_DEST.
1632 And update state_log. */
1633 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1634 if (BE (err
!= REG_NOERROR
, 0))
1639 re_node_set_free (&cur_dest
);
1643 static reg_errcode_t
1644 internal_function __attribute_warn_unused_result__
1645 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1646 int str_idx
, re_node_set
*cur_dest
)
1648 const re_dfa_t
*const dfa
= mctx
->dfa
;
1649 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1652 /* Then build the next sifted state.
1653 We build the next sifted state on `cur_dest', and update
1654 `sifted_states[str_idx]' with `cur_dest'.
1656 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1657 `cur_src' points the node_set of the old `state_log[str_idx]'
1658 (with the epsilon nodes pre-filtered out). */
1659 for (i
= 0; i
< cur_src
->nelem
; i
++)
1661 int prev_node
= cur_src
->elems
[i
];
1666 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1667 assert (!IS_EPSILON_NODE (type
));
1669 #ifdef RE_ENABLE_I18N
1670 /* If the node may accept `multi byte'. */
1671 if (dfa
->nodes
[prev_node
].accept_mb
)
1672 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1673 str_idx
, sctx
->last_str_idx
);
1674 #endif /* RE_ENABLE_I18N */
1676 /* We don't check backreferences here.
1677 See update_cur_sifted_state(). */
1679 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1680 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1681 dfa
->nexts
[prev_node
]))
1687 if (sctx
->limits
.nelem
)
1689 int to_idx
= str_idx
+ naccepted
;
1690 if (check_dst_limits (mctx
, &sctx
->limits
,
1691 dfa
->nexts
[prev_node
], to_idx
,
1692 prev_node
, str_idx
))
1695 ret
= re_node_set_insert (cur_dest
, prev_node
);
1696 if (BE (ret
== -1, 0))
1703 /* Helper functions. */
1705 static reg_errcode_t
1707 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1709 int top
= mctx
->state_log_top
;
1711 if ((next_state_log_idx
>= mctx
->input
.bufs_len
1712 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1713 || (next_state_log_idx
>= mctx
->input
.valid_len
1714 && mctx
->input
.valid_len
< mctx
->input
.len
))
1717 err
= extend_buffers (mctx
, next_state_log_idx
+ 1);
1718 if (BE (err
!= REG_NOERROR
, 0))
1722 if (top
< next_state_log_idx
)
1724 memset (mctx
->state_log
+ top
+ 1, '\0',
1725 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1726 mctx
->state_log_top
= next_state_log_idx
;
1731 static reg_errcode_t
1733 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1734 re_dfastate_t
**src
, int num
)
1738 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1740 if (dst
[st_idx
] == NULL
)
1741 dst
[st_idx
] = src
[st_idx
];
1742 else if (src
[st_idx
] != NULL
)
1744 re_node_set merged_set
;
1745 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1746 &src
[st_idx
]->nodes
);
1747 if (BE (err
!= REG_NOERROR
, 0))
1749 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1750 re_node_set_free (&merged_set
);
1751 if (BE (err
!= REG_NOERROR
, 0))
1758 static reg_errcode_t
1760 update_cur_sifted_state (const re_match_context_t
*mctx
,
1761 re_sift_context_t
*sctx
, int str_idx
,
1762 re_node_set
*dest_nodes
)
1764 const re_dfa_t
*const dfa
= mctx
->dfa
;
1765 reg_errcode_t err
= REG_NOERROR
;
1766 const re_node_set
*candidates
;
1767 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1768 : &mctx
->state_log
[str_idx
]->nodes
);
1770 if (dest_nodes
->nelem
== 0)
1771 sctx
->sifted_states
[str_idx
] = NULL
;
1776 /* At first, add the nodes which can epsilon transit to a node in
1778 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1779 if (BE (err
!= REG_NOERROR
, 0))
1782 /* Then, check the limitations in the current sift_context. */
1783 if (sctx
->limits
.nelem
)
1785 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1786 mctx
->bkref_ents
, str_idx
);
1787 if (BE (err
!= REG_NOERROR
, 0))
1792 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1793 if (BE (err
!= REG_NOERROR
, 0))
1797 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1799 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1800 if (BE (err
!= REG_NOERROR
, 0))
1806 static reg_errcode_t
1807 internal_function __attribute_warn_unused_result__
1808 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1809 const re_node_set
*candidates
)
1811 reg_errcode_t err
= REG_NOERROR
;
1814 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1815 if (BE (err
!= REG_NOERROR
, 0))
1818 if (!state
->inveclosure
.alloc
)
1820 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1821 if (BE (err
!= REG_NOERROR
, 0))
1823 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1825 err
= re_node_set_merge (&state
->inveclosure
,
1826 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1827 if (BE (err
!= REG_NOERROR
, 0))
1831 return re_node_set_add_intersect (dest_nodes
, candidates
,
1832 &state
->inveclosure
);
1835 static reg_errcode_t
1837 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1838 const re_node_set
*candidates
)
1842 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1843 re_node_set except_nodes
;
1844 re_node_set_init_empty (&except_nodes
);
1845 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1847 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1848 if (cur_node
== node
)
1850 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1852 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1853 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1854 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1855 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1856 && re_node_set_contains (dest_nodes
, edst1
))
1858 && !re_node_set_contains (inv_eclosure
, edst2
)
1859 && re_node_set_contains (dest_nodes
, edst2
)))
1861 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1862 dfa
->inveclosures
+ cur_node
);
1863 if (BE (err
!= REG_NOERROR
, 0))
1865 re_node_set_free (&except_nodes
);
1871 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1873 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1874 if (!re_node_set_contains (&except_nodes
, cur_node
))
1876 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1877 re_node_set_remove_at (dest_nodes
, idx
);
1880 re_node_set_free (&except_nodes
);
1886 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1887 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1889 const re_dfa_t
*const dfa
= mctx
->dfa
;
1890 int lim_idx
, src_pos
, dst_pos
;
1892 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1893 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1894 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1897 struct re_backref_cache_entry
*ent
;
1898 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1899 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1901 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1902 subexp_idx
, dst_node
, dst_idx
,
1904 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1905 subexp_idx
, src_node
, src_idx
,
1909 <src> <dst> ( <subexp> )
1910 ( <subexp> ) <src> <dst>
1911 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1912 if (src_pos
== dst_pos
)
1913 continue; /* This is unrelated limitation. */
1922 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1923 int subexp_idx
, int from_node
, int bkref_idx
)
1925 const re_dfa_t
*const dfa
= mctx
->dfa
;
1926 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1929 /* Else, we are on the boundary: examine the nodes on the epsilon
1931 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1933 int node
= eclosures
->elems
[node_idx
];
1934 switch (dfa
->nodes
[node
].type
)
1937 if (bkref_idx
!= -1)
1939 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1944 if (ent
->node
!= node
)
1947 if (subexp_idx
< BITSET_WORD_BITS
1948 && !(ent
->eps_reachable_subexps_map
1949 & ((bitset_word_t
) 1 << subexp_idx
)))
1952 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1953 OP_CLOSE_SUBEXP cases below. But, if the
1954 destination node is the same node as the source
1955 node, don't recurse because it would cause an
1956 infinite loop: a regex that exhibits this behavior
1958 dst
= dfa
->edests
[node
].elems
[0];
1959 if (dst
== from_node
)
1963 else /* if (boundaries & 2) */
1968 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1970 if (cpos
== -1 /* && (boundaries & 1) */)
1972 if (cpos
== 0 && (boundaries
& 2))
1975 if (subexp_idx
< BITSET_WORD_BITS
)
1976 ent
->eps_reachable_subexps_map
1977 &= ~((bitset_word_t
) 1 << subexp_idx
);
1979 while (ent
++->more
);
1983 case OP_OPEN_SUBEXP
:
1984 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1988 case OP_CLOSE_SUBEXP
:
1989 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1998 return (boundaries
& 2) ? 1 : 0;
2003 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2004 int subexp_idx
, int from_node
, int str_idx
,
2007 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2010 /* If we are outside the range of the subexpression, return -1 or 1. */
2011 if (str_idx
< lim
->subexp_from
)
2014 if (lim
->subexp_to
< str_idx
)
2017 /* If we are within the subexpression, return 0. */
2018 boundaries
= (str_idx
== lim
->subexp_from
);
2019 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2020 if (boundaries
== 0)
2023 /* Else, examine epsilon closure. */
2024 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2025 from_node
, bkref_idx
);
2028 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2029 which are against limitations from DEST_NODES. */
2031 static reg_errcode_t
2033 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2034 const re_node_set
*candidates
, re_node_set
*limits
,
2035 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2038 int node_idx
, lim_idx
;
2040 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2043 struct re_backref_cache_entry
*ent
;
2044 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2046 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2047 continue; /* This is unrelated limitation. */
2049 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2050 if (ent
->subexp_to
== str_idx
)
2054 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2056 int node
= dest_nodes
->elems
[node_idx
];
2057 re_token_type_t type
= dfa
->nodes
[node
].type
;
2058 if (type
== OP_OPEN_SUBEXP
2059 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2061 else if (type
== OP_CLOSE_SUBEXP
2062 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2066 /* Check the limitation of the open subexpression. */
2067 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2070 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2072 if (BE (err
!= REG_NOERROR
, 0))
2076 /* Check the limitation of the close subexpression. */
2078 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2080 int node
= dest_nodes
->elems
[node_idx
];
2081 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2083 && !re_node_set_contains (dfa
->eclosures
+ node
,
2086 /* It is against this limitation.
2087 Remove it form the current sifted state. */
2088 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2090 if (BE (err
!= REG_NOERROR
, 0))
2096 else /* (ent->subexp_to != str_idx) */
2098 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2100 int node
= dest_nodes
->elems
[node_idx
];
2101 re_token_type_t type
= dfa
->nodes
[node
].type
;
2102 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2104 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2106 /* It is against this limitation.
2107 Remove it form the current sifted state. */
2108 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2110 if (BE (err
!= REG_NOERROR
, 0))
2119 static reg_errcode_t
2120 internal_function __attribute_warn_unused_result__
2121 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2122 int str_idx
, const re_node_set
*candidates
)
2124 const re_dfa_t
*const dfa
= mctx
->dfa
;
2127 re_sift_context_t local_sctx
;
2128 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2130 if (first_idx
== -1)
2133 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2135 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2138 re_token_type_t type
;
2139 struct re_backref_cache_entry
*entry
;
2140 node
= candidates
->elems
[node_idx
];
2141 type
= dfa
->nodes
[node
].type
;
2142 /* Avoid infinite loop for the REs like "()\1+". */
2143 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2145 if (type
!= OP_BACK_REF
)
2148 entry
= mctx
->bkref_ents
+ first_idx
;
2149 enabled_idx
= first_idx
;
2156 re_dfastate_t
*cur_state
;
2158 if (entry
->node
!= node
)
2160 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2161 to_idx
= str_idx
+ subexp_len
;
2162 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2163 : dfa
->edests
[node
].elems
[0]);
2165 if (to_idx
> sctx
->last_str_idx
2166 || sctx
->sifted_states
[to_idx
] == NULL
2167 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2168 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2169 str_idx
, dst_node
, to_idx
))
2172 if (local_sctx
.sifted_states
== NULL
)
2175 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2176 if (BE (err
!= REG_NOERROR
, 0))
2179 local_sctx
.last_node
= node
;
2180 local_sctx
.last_str_idx
= str_idx
;
2181 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2182 if (BE (ret
< 0, 0))
2187 cur_state
= local_sctx
.sifted_states
[str_idx
];
2188 err
= sift_states_backward (mctx
, &local_sctx
);
2189 if (BE (err
!= REG_NOERROR
, 0))
2191 if (sctx
->limited_states
!= NULL
)
2193 err
= merge_state_array (dfa
, sctx
->limited_states
,
2194 local_sctx
.sifted_states
,
2196 if (BE (err
!= REG_NOERROR
, 0))
2199 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2200 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2202 /* mctx->bkref_ents may have changed, reload the pointer. */
2203 entry
= mctx
->bkref_ents
+ enabled_idx
;
2205 while (enabled_idx
++, entry
++->more
);
2209 if (local_sctx
.sifted_states
!= NULL
)
2211 re_node_set_free (&local_sctx
.limits
);
2218 #ifdef RE_ENABLE_I18N
2221 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2222 int node_idx
, int str_idx
, int max_str_idx
)
2224 const re_dfa_t
*const dfa
= mctx
->dfa
;
2226 /* Check the node can accept `multi byte'. */
2227 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2228 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2229 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2230 dfa
->nexts
[node_idx
]))
2231 /* The node can't accept the `multi byte', or the
2232 destination was already thrown away, then the node
2233 could't accept the current input `multi byte'. */
2235 /* Otherwise, it is sure that the node could accept
2236 `naccepted' bytes input. */
2239 #endif /* RE_ENABLE_I18N */
2242 /* Functions for state transition. */
2244 /* Return the next state to which the current state STATE will transit by
2245 accepting the current input byte, and update STATE_LOG if necessary.
2246 If STATE can accept a multibyte char/collating element/back reference
2247 update the destination of STATE_LOG. */
2249 static re_dfastate_t
*
2250 internal_function __attribute_warn_unused_result__
2251 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2252 re_dfastate_t
*state
)
2254 re_dfastate_t
**trtable
;
2257 #ifdef RE_ENABLE_I18N
2258 /* If the current state can accept multibyte. */
2259 if (BE (state
->accept_mb
, 0))
2261 *err
= transit_state_mb (mctx
, state
);
2262 if (BE (*err
!= REG_NOERROR
, 0))
2265 #endif /* RE_ENABLE_I18N */
2267 /* Then decide the next state with the single byte. */
2270 /* don't use transition table */
2271 return transit_state_sb (err
, mctx
, state
);
2274 /* Use transition table */
2275 ch
= re_string_fetch_byte (&mctx
->input
);
2278 trtable
= state
->trtable
;
2279 if (BE (trtable
!= NULL
, 1))
2282 trtable
= state
->word_trtable
;
2283 if (BE (trtable
!= NULL
, 1))
2285 unsigned int context
;
2287 = re_string_context_at (&mctx
->input
,
2288 re_string_cur_idx (&mctx
->input
) - 1,
2290 if (IS_WORD_CONTEXT (context
))
2291 return trtable
[ch
+ SBC_MAX
];
2296 if (!build_trtable (mctx
->dfa
, state
))
2302 /* Retry, we now have a transition table. */
2306 /* Update the state_log if we need */
2309 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2310 re_dfastate_t
*next_state
)
2312 const re_dfa_t
*const dfa
= mctx
->dfa
;
2313 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2315 if (cur_idx
> mctx
->state_log_top
)
2317 mctx
->state_log
[cur_idx
] = next_state
;
2318 mctx
->state_log_top
= cur_idx
;
2320 else if (mctx
->state_log
[cur_idx
] == 0)
2322 mctx
->state_log
[cur_idx
] = next_state
;
2326 re_dfastate_t
*pstate
;
2327 unsigned int context
;
2328 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2329 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2330 the destination of a multibyte char/collating element/
2331 back reference. Then the next state is the union set of
2332 these destinations and the results of the transition table. */
2333 pstate
= mctx
->state_log
[cur_idx
];
2334 log_nodes
= pstate
->entrance_nodes
;
2335 if (next_state
!= NULL
)
2337 table_nodes
= next_state
->entrance_nodes
;
2338 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2340 if (BE (*err
!= REG_NOERROR
, 0))
2344 next_nodes
= *log_nodes
;
2345 /* Note: We already add the nodes of the initial state,
2346 then we don't need to add them here. */
2348 context
= re_string_context_at (&mctx
->input
,
2349 re_string_cur_idx (&mctx
->input
) - 1,
2351 next_state
= mctx
->state_log
[cur_idx
]
2352 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2353 /* We don't need to check errors here, since the return value of
2354 this function is next_state and ERR is already set. */
2356 if (table_nodes
!= NULL
)
2357 re_node_set_free (&next_nodes
);
2360 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2362 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2363 later. We must check them here, since the back references in the
2364 next state might use them. */
2365 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2367 if (BE (*err
!= REG_NOERROR
, 0))
2370 /* If the next state has back references. */
2371 if (next_state
->has_backref
)
2373 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2374 if (BE (*err
!= REG_NOERROR
, 0))
2376 next_state
= mctx
->state_log
[cur_idx
];
2383 /* Skip bytes in the input that correspond to part of a
2384 multi-byte match, then look in the log for a state
2385 from which to restart matching. */
2388 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2390 re_dfastate_t
*cur_state
;
2393 int max
= mctx
->state_log_top
;
2394 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2398 if (++cur_str_idx
> max
)
2400 re_string_skip_bytes (&mctx
->input
, 1);
2402 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2404 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2406 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2410 /* Helper functions for transit_state. */
2412 /* From the node set CUR_NODES, pick up the nodes whose types are
2413 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2414 expression. And register them to use them later for evaluating the
2415 correspoding back references. */
2417 static reg_errcode_t
2419 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2422 const re_dfa_t
*const dfa
= mctx
->dfa
;
2426 /* TODO: This isn't efficient.
2427 Because there might be more than one nodes whose types are
2428 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2431 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2433 int node
= cur_nodes
->elems
[node_idx
];
2434 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2435 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2436 && (dfa
->used_bkref_map
2437 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2439 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2440 if (BE (err
!= REG_NOERROR
, 0))
2448 /* Return the next state to which the current state STATE will transit by
2449 accepting the current input byte. */
2451 static re_dfastate_t
*
2452 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2453 re_dfastate_t
*state
)
2455 const re_dfa_t
*const dfa
= mctx
->dfa
;
2456 re_node_set next_nodes
;
2457 re_dfastate_t
*next_state
;
2458 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2459 unsigned int context
;
2461 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2462 if (BE (*err
!= REG_NOERROR
, 0))
2464 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2466 int cur_node
= state
->nodes
.elems
[node_cnt
];
2467 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2469 *err
= re_node_set_merge (&next_nodes
,
2470 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2471 if (BE (*err
!= REG_NOERROR
, 0))
2473 re_node_set_free (&next_nodes
);
2478 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2479 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2480 /* We don't need to check errors here, since the return value of
2481 this function is next_state and ERR is already set. */
2483 re_node_set_free (&next_nodes
);
2484 re_string_skip_bytes (&mctx
->input
, 1);
2489 #ifdef RE_ENABLE_I18N
2490 static reg_errcode_t
2492 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2494 const re_dfa_t
*const dfa
= mctx
->dfa
;
2498 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2500 re_node_set dest_nodes
, *new_nodes
;
2501 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2502 int naccepted
, dest_idx
;
2503 unsigned int context
;
2504 re_dfastate_t
*dest_state
;
2506 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2509 if (dfa
->nodes
[cur_node_idx
].constraint
)
2511 context
= re_string_context_at (&mctx
->input
,
2512 re_string_cur_idx (&mctx
->input
),
2514 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2519 /* How many bytes the node can accept? */
2520 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2521 re_string_cur_idx (&mctx
->input
));
2525 /* The node can accepts `naccepted' bytes. */
2526 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2527 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2528 : mctx
->max_mb_elem_len
);
2529 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2530 if (BE (err
!= REG_NOERROR
, 0))
2533 assert (dfa
->nexts
[cur_node_idx
] != -1);
2535 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2537 dest_state
= mctx
->state_log
[dest_idx
];
2538 if (dest_state
== NULL
)
2539 dest_nodes
= *new_nodes
;
2542 err
= re_node_set_init_union (&dest_nodes
,
2543 dest_state
->entrance_nodes
, new_nodes
);
2544 if (BE (err
!= REG_NOERROR
, 0))
2547 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2549 mctx
->state_log
[dest_idx
]
2550 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2551 if (dest_state
!= NULL
)
2552 re_node_set_free (&dest_nodes
);
2553 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2558 #endif /* RE_ENABLE_I18N */
2560 static reg_errcode_t
2562 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2564 const re_dfa_t
*const dfa
= mctx
->dfa
;
2567 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2569 for (i
= 0; i
< nodes
->nelem
; ++i
)
2571 int dest_str_idx
, prev_nelem
, bkc_idx
;
2572 int node_idx
= nodes
->elems
[i
];
2573 unsigned int context
;
2574 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2575 re_node_set
*new_dest_nodes
;
2577 /* Check whether `node' is a backreference or not. */
2578 if (node
->type
!= OP_BACK_REF
)
2581 if (node
->constraint
)
2583 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2585 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2589 /* `node' is a backreference.
2590 Check the substring which the substring matched. */
2591 bkc_idx
= mctx
->nbkref_ents
;
2592 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2593 if (BE (err
!= REG_NOERROR
, 0))
2596 /* And add the epsilon closures (which is `new_dest_nodes') of
2597 the backreference to appropriate state_log. */
2599 assert (dfa
->nexts
[node_idx
] != -1);
2601 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2604 re_dfastate_t
*dest_state
;
2605 struct re_backref_cache_entry
*bkref_ent
;
2606 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2607 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2609 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2610 new_dest_nodes
= (subexp_len
== 0
2611 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2612 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2613 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2614 - bkref_ent
->subexp_from
);
2615 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2617 dest_state
= mctx
->state_log
[dest_str_idx
];
2618 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2619 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2620 /* Add `new_dest_node' to state_log. */
2621 if (dest_state
== NULL
)
2623 mctx
->state_log
[dest_str_idx
]
2624 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2626 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2627 && err
!= REG_NOERROR
, 0))
2632 re_node_set dest_nodes
;
2633 err
= re_node_set_init_union (&dest_nodes
,
2634 dest_state
->entrance_nodes
,
2636 if (BE (err
!= REG_NOERROR
, 0))
2638 re_node_set_free (&dest_nodes
);
2641 mctx
->state_log
[dest_str_idx
]
2642 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2643 re_node_set_free (&dest_nodes
);
2644 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2645 && err
!= REG_NOERROR
, 0))
2648 /* We need to check recursively if the backreference can epsilon
2651 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2653 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2655 if (BE (err
!= REG_NOERROR
, 0))
2657 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2658 if (BE (err
!= REG_NOERROR
, 0))
2668 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2669 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2670 Note that we might collect inappropriate candidates here.
2671 However, the cost of checking them strictly here is too high, then we
2672 delay these checking for prune_impossible_nodes(). */
2674 static reg_errcode_t
2675 internal_function __attribute_warn_unused_result__
2676 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2678 const re_dfa_t
*const dfa
= mctx
->dfa
;
2679 int subexp_num
, sub_top_idx
;
2680 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2681 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2682 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2683 if (cache_idx
!= -1)
2685 const struct re_backref_cache_entry
*entry
2686 = mctx
->bkref_ents
+ cache_idx
;
2688 if (entry
->node
== bkref_node
)
2689 return REG_NOERROR
; /* We already checked it. */
2690 while (entry
++->more
);
2693 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2695 /* For each sub expression */
2696 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2699 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2700 re_sub_match_last_t
*sub_last
;
2701 int sub_last_idx
, sl_str
, bkref_str_off
;
2703 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2704 continue; /* It isn't related. */
2706 sl_str
= sub_top
->str_idx
;
2707 bkref_str_off
= bkref_str_idx
;
2708 /* At first, check the last node of sub expressions we already
2710 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2713 sub_last
= sub_top
->lasts
[sub_last_idx
];
2714 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2715 /* The matched string by the sub expression match with the substring
2716 at the back reference? */
2717 if (sl_str_diff
> 0)
2719 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2721 /* Not enough chars for a successful match. */
2722 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2725 err
= clean_state_log_if_needed (mctx
,
2728 if (BE (err
!= REG_NOERROR
, 0))
2730 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2732 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2733 /* We don't need to search this sub expression any more. */
2736 bkref_str_off
+= sl_str_diff
;
2737 sl_str
+= sl_str_diff
;
2738 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2741 /* Reload buf, since the preceding call might have reallocated
2743 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2745 if (err
== REG_NOMATCH
)
2747 if (BE (err
!= REG_NOERROR
, 0))
2751 if (sub_last_idx
< sub_top
->nlasts
)
2753 if (sub_last_idx
> 0)
2755 /* Then, search for the other last nodes of the sub expression. */
2756 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2758 int cls_node
, sl_str_off
;
2759 const re_node_set
*nodes
;
2760 sl_str_off
= sl_str
- sub_top
->str_idx
;
2761 /* The matched string by the sub expression match with the substring
2762 at the back reference? */
2765 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2767 /* If we are at the end of the input, we cannot match. */
2768 if (bkref_str_off
>= mctx
->input
.len
)
2771 err
= extend_buffers (mctx
, bkref_str_off
+ 1);
2772 if (BE (err
!= REG_NOERROR
, 0))
2775 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2777 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2778 break; /* We don't need to search this sub expression
2781 if (mctx
->state_log
[sl_str
] == NULL
)
2783 /* Does this state have a ')' of the sub expression? */
2784 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2785 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2789 if (sub_top
->path
== NULL
)
2791 sub_top
->path
= calloc (sizeof (state_array_t
),
2792 sl_str
- sub_top
->str_idx
+ 1);
2793 if (sub_top
->path
== NULL
)
2796 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2797 in the current context? */
2798 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2799 sub_top
->str_idx
, cls_node
, sl_str
,
2801 if (err
== REG_NOMATCH
)
2803 if (BE (err
!= REG_NOERROR
, 0))
2805 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2806 if (BE (sub_last
== NULL
, 0))
2808 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2810 if (err
== REG_NOMATCH
)
2817 /* Helper functions for get_subexp(). */
2819 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2820 If it can arrive, register the sub expression expressed with SUB_TOP
2823 static reg_errcode_t
2825 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2826 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2830 /* Can the subexpression arrive the back reference? */
2831 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2832 sub_last
->str_idx
, bkref_node
, bkref_str
,
2834 if (err
!= REG_NOERROR
)
2836 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2838 if (BE (err
!= REG_NOERROR
, 0))
2840 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2841 return clean_state_log_if_needed (mctx
, to_idx
);
2844 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2845 Search '(' if FL_OPEN, or search ')' otherwise.
2846 TODO: This function isn't efficient...
2847 Because there might be more than one nodes whose types are
2848 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2854 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2855 int subexp_idx
, int type
)
2858 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2860 int cls_node
= nodes
->elems
[cls_idx
];
2861 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2862 if (node
->type
== type
2863 && node
->opr
.idx
== subexp_idx
)
2869 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2870 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2872 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2874 static reg_errcode_t
2875 internal_function __attribute_warn_unused_result__
2876 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2877 int top_str
, int last_node
, int last_str
, int type
)
2879 const re_dfa_t
*const dfa
= mctx
->dfa
;
2880 reg_errcode_t err
= REG_NOERROR
;
2881 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2882 re_dfastate_t
*cur_state
= NULL
;
2883 re_node_set
*cur_nodes
, next_nodes
;
2884 re_dfastate_t
**backup_state_log
;
2885 unsigned int context
;
2887 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2888 /* Extend the buffer if we need. */
2889 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2891 re_dfastate_t
**new_array
;
2892 int old_alloc
= path
->alloc
;
2893 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2894 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2895 if (BE (new_array
== NULL
, 0))
2897 path
->alloc
= old_alloc
;
2900 path
->array
= new_array
;
2901 memset (new_array
+ old_alloc
, '\0',
2902 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2905 str_idx
= path
->next_idx
?: top_str
;
2907 /* Temporary modify MCTX. */
2908 backup_state_log
= mctx
->state_log
;
2909 backup_cur_idx
= mctx
->input
.cur_idx
;
2910 mctx
->state_log
= path
->array
;
2911 mctx
->input
.cur_idx
= str_idx
;
2913 /* Setup initial node set. */
2914 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2915 if (str_idx
== top_str
)
2917 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2918 if (BE (err
!= REG_NOERROR
, 0))
2920 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2921 if (BE (err
!= REG_NOERROR
, 0))
2923 re_node_set_free (&next_nodes
);
2929 cur_state
= mctx
->state_log
[str_idx
];
2930 if (cur_state
&& cur_state
->has_backref
)
2932 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2933 if (BE (err
!= REG_NOERROR
, 0))
2937 re_node_set_init_empty (&next_nodes
);
2939 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2941 if (next_nodes
.nelem
)
2943 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2945 if (BE (err
!= REG_NOERROR
, 0))
2947 re_node_set_free (&next_nodes
);
2951 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2952 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2954 re_node_set_free (&next_nodes
);
2957 mctx
->state_log
[str_idx
] = cur_state
;
2960 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2962 re_node_set_empty (&next_nodes
);
2963 if (mctx
->state_log
[str_idx
+ 1])
2965 err
= re_node_set_merge (&next_nodes
,
2966 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2967 if (BE (err
!= REG_NOERROR
, 0))
2969 re_node_set_free (&next_nodes
);
2975 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2976 &cur_state
->non_eps_nodes
,
2978 if (BE (err
!= REG_NOERROR
, 0))
2980 re_node_set_free (&next_nodes
);
2985 if (next_nodes
.nelem
)
2987 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2988 if (BE (err
!= REG_NOERROR
, 0))
2990 re_node_set_free (&next_nodes
);
2993 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2995 if (BE (err
!= REG_NOERROR
, 0))
2997 re_node_set_free (&next_nodes
);
3001 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3002 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3003 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3005 re_node_set_free (&next_nodes
);
3008 mctx
->state_log
[str_idx
] = cur_state
;
3009 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3011 re_node_set_free (&next_nodes
);
3012 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3013 : &mctx
->state_log
[last_str
]->nodes
);
3014 path
->next_idx
= str_idx
;
3017 mctx
->state_log
= backup_state_log
;
3018 mctx
->input
.cur_idx
= backup_cur_idx
;
3020 /* Then check the current node set has the node LAST_NODE. */
3021 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3027 /* Helper functions for check_arrival. */
3029 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3031 TODO: This function is similar to the functions transit_state*(),
3032 however this function has many additional works.
3033 Can't we unify them? */
3035 static reg_errcode_t
3036 internal_function __attribute_warn_unused_result__
3037 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3038 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3040 const re_dfa_t
*const dfa
= mctx
->dfa
;
3043 reg_errcode_t err
= REG_NOERROR
;
3044 re_node_set union_set
;
3045 re_node_set_init_empty (&union_set
);
3046 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3049 int cur_node
= cur_nodes
->elems
[cur_idx
];
3051 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3052 assert (!IS_EPSILON_NODE (type
));
3054 #ifdef RE_ENABLE_I18N
3055 /* If the node may accept `multi byte'. */
3056 if (dfa
->nodes
[cur_node
].accept_mb
)
3058 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3062 re_dfastate_t
*dest_state
;
3063 int next_node
= dfa
->nexts
[cur_node
];
3064 int next_idx
= str_idx
+ naccepted
;
3065 dest_state
= mctx
->state_log
[next_idx
];
3066 re_node_set_empty (&union_set
);
3069 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3070 if (BE (err
!= REG_NOERROR
, 0))
3072 re_node_set_free (&union_set
);
3076 result
= re_node_set_insert (&union_set
, next_node
);
3077 if (BE (result
< 0, 0))
3079 re_node_set_free (&union_set
);
3082 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3084 if (BE (mctx
->state_log
[next_idx
] == NULL
3085 && err
!= REG_NOERROR
, 0))
3087 re_node_set_free (&union_set
);
3092 #endif /* RE_ENABLE_I18N */
3094 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3096 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3097 if (BE (result
< 0, 0))
3099 re_node_set_free (&union_set
);
3104 re_node_set_free (&union_set
);
3108 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3109 CUR_NODES, however exclude the nodes which are:
3110 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3111 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3114 static reg_errcode_t
3116 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3117 int ex_subexp
, int type
)
3120 int idx
, outside_node
;
3121 re_node_set new_nodes
;
3123 assert (cur_nodes
->nelem
);
3125 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3126 if (BE (err
!= REG_NOERROR
, 0))
3128 /* Create a new node set NEW_NODES with the nodes which are epsilon
3129 closures of the node in CUR_NODES. */
3131 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3133 int cur_node
= cur_nodes
->elems
[idx
];
3134 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3135 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3136 if (outside_node
== -1)
3138 /* There are no problematic nodes, just merge them. */
3139 err
= re_node_set_merge (&new_nodes
, eclosure
);
3140 if (BE (err
!= REG_NOERROR
, 0))
3142 re_node_set_free (&new_nodes
);
3148 /* There are problematic nodes, re-calculate incrementally. */
3149 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3151 if (BE (err
!= REG_NOERROR
, 0))
3153 re_node_set_free (&new_nodes
);
3158 re_node_set_free (cur_nodes
);
3159 *cur_nodes
= new_nodes
;
3163 /* Helper function for check_arrival_expand_ecl.
3164 Check incrementally the epsilon closure of TARGET, and if it isn't
3165 problematic append it to DST_NODES. */
3167 static reg_errcode_t
3168 internal_function __attribute_warn_unused_result__
3169 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3170 int target
, int ex_subexp
, int type
)
3173 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3177 if (dfa
->nodes
[cur_node
].type
== type
3178 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3180 if (type
== OP_CLOSE_SUBEXP
)
3182 err
= re_node_set_insert (dst_nodes
, cur_node
);
3183 if (BE (err
== -1, 0))
3188 err
= re_node_set_insert (dst_nodes
, cur_node
);
3189 if (BE (err
== -1, 0))
3191 if (dfa
->edests
[cur_node
].nelem
== 0)
3193 if (dfa
->edests
[cur_node
].nelem
== 2)
3195 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3196 dfa
->edests
[cur_node
].elems
[1],
3198 if (BE (err
!= REG_NOERROR
, 0))
3201 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3207 /* For all the back references in the current state, calculate the
3208 destination of the back references by the appropriate entry
3209 in MCTX->BKREF_ENTS. */
3211 static reg_errcode_t
3212 internal_function __attribute_warn_unused_result__
3213 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3214 int cur_str
, int subexp_num
, int type
)
3216 const re_dfa_t
*const dfa
= mctx
->dfa
;
3218 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3219 struct re_backref_cache_entry
*ent
;
3221 if (cache_idx_start
== -1)
3225 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3228 int to_idx
, next_node
;
3230 /* Is this entry ENT is appropriate? */
3231 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3234 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3235 /* Calculate the destination of the back reference, and append it
3236 to MCTX->STATE_LOG. */
3237 if (to_idx
== cur_str
)
3239 /* The backreference did epsilon transit, we must re-check all the
3240 node in the current state. */
3241 re_node_set new_dests
;
3242 reg_errcode_t err2
, err3
;
3243 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3244 if (re_node_set_contains (cur_nodes
, next_node
))
3246 err
= re_node_set_init_1 (&new_dests
, next_node
);
3247 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3248 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3249 re_node_set_free (&new_dests
);
3250 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3251 || err3
!= REG_NOERROR
, 0))
3253 err
= (err
!= REG_NOERROR
? err
3254 : (err2
!= REG_NOERROR
? err2
: err3
));
3257 /* TODO: It is still inefficient... */
3262 re_node_set union_set
;
3263 next_node
= dfa
->nexts
[ent
->node
];
3264 if (mctx
->state_log
[to_idx
])
3267 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3270 err
= re_node_set_init_copy (&union_set
,
3271 &mctx
->state_log
[to_idx
]->nodes
);
3272 ret
= re_node_set_insert (&union_set
, next_node
);
3273 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3275 re_node_set_free (&union_set
);
3276 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3282 err
= re_node_set_init_1 (&union_set
, next_node
);
3283 if (BE (err
!= REG_NOERROR
, 0))
3286 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3287 re_node_set_free (&union_set
);
3288 if (BE (mctx
->state_log
[to_idx
] == NULL
3289 && err
!= REG_NOERROR
, 0))
3293 while (ent
++->more
);
3297 /* Build transition table for the state.
3298 Return 1 if succeeded, otherwise return NULL. */
3302 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3305 int i
, j
, ch
, need_word_trtable
= 0;
3306 bitset_word_t elem
, mask
;
3307 bool dests_node_malloced
= false;
3308 bool dest_states_malloced
= false;
3309 int ndests
; /* Number of the destination states from `state'. */
3310 re_dfastate_t
**trtable
;
3311 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3312 re_node_set follows
, *dests_node
;
3314 bitset_t acceptable
;
3318 re_node_set dests_node
[SBC_MAX
];
3319 bitset_t dests_ch
[SBC_MAX
];
3322 /* We build DFA states which corresponds to the destination nodes
3323 from `state'. `dests_node[i]' represents the nodes which i-th
3324 destination state contains, and `dests_ch[i]' represents the
3325 characters which i-th destination state accepts. */
3326 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3327 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3330 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3331 if (BE (dests_alloc
== NULL
, 0))
3333 dests_node_malloced
= true;
3335 dests_node
= dests_alloc
->dests_node
;
3336 dests_ch
= dests_alloc
->dests_ch
;
3338 /* Initialize transiton table. */
3339 state
->word_trtable
= state
->trtable
= NULL
;
3341 /* At first, group all nodes belonging to `state' into several
3343 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3344 if (BE (ndests
<= 0, 0))
3346 if (dests_node_malloced
)
3348 /* Return 0 in case of an error, 1 otherwise. */
3351 state
->trtable
= (re_dfastate_t
**)
3352 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3353 if (BE (state
->trtable
== NULL
, 0))
3360 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3361 if (BE (err
!= REG_NOERROR
, 0))
3364 /* Avoid arithmetic overflow in size calculation. */
3365 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3366 / (3 * sizeof (re_dfastate_t
*)))
3371 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3372 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3373 dest_states
= (re_dfastate_t
**)
3374 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3377 dest_states
= (re_dfastate_t
**)
3378 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3379 if (BE (dest_states
== NULL
, 0))
3382 if (dest_states_malloced
)
3384 re_node_set_free (&follows
);
3385 for (i
= 0; i
< ndests
; ++i
)
3386 re_node_set_free (dests_node
+ i
);
3387 if (dests_node_malloced
)
3391 dest_states_malloced
= true;
3393 dest_states_word
= dest_states
+ ndests
;
3394 dest_states_nl
= dest_states_word
+ ndests
;
3395 bitset_empty (acceptable
);
3397 /* Then build the states for all destinations. */
3398 for (i
= 0; i
< ndests
; ++i
)
3401 re_node_set_empty (&follows
);
3402 /* Merge the follows of this destination states. */
3403 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3405 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3406 if (next_node
!= -1)
3408 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3409 if (BE (err
!= REG_NOERROR
, 0))
3413 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3414 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3416 /* If the new state has context constraint,
3417 build appropriate states for these contexts. */
3418 if (dest_states
[i
]->has_constraint
)
3420 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3422 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3425 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3426 need_word_trtable
= 1;
3428 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3430 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3435 dest_states_word
[i
] = dest_states
[i
];
3436 dest_states_nl
[i
] = dest_states
[i
];
3438 bitset_merge (acceptable
, dests_ch
[i
]);
3441 if (!BE (need_word_trtable
, 0))
3443 /* We don't care about whether the following character is a word
3444 character, or we are in a single-byte character set so we can
3445 discern by looking at the character code: allocate a
3446 256-entry transition table. */
3447 trtable
= state
->trtable
=
3448 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3449 if (BE (trtable
== NULL
, 0))
3452 /* For all characters ch...: */
3453 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3454 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3456 mask
<<= 1, elem
>>= 1, ++ch
)
3457 if (BE (elem
& 1, 0))
3459 /* There must be exactly one destination which accepts
3460 character ch. See group_nodes_into_DFAstates. */
3461 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3464 /* j-th destination accepts the word character ch. */
3465 if (dfa
->word_char
[i
] & mask
)
3466 trtable
[ch
] = dest_states_word
[j
];
3468 trtable
[ch
] = dest_states
[j
];
3473 /* We care about whether the following character is a word
3474 character, and we are in a multi-byte character set: discern
3475 by looking at the character code: build two 256-entry
3476 transition tables, one starting at trtable[0] and one
3477 starting at trtable[SBC_MAX]. */
3478 trtable
= state
->word_trtable
=
3479 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3480 if (BE (trtable
== NULL
, 0))
3483 /* For all characters ch...: */
3484 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3485 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3487 mask
<<= 1, elem
>>= 1, ++ch
)
3488 if (BE (elem
& 1, 0))
3490 /* There must be exactly one destination which accepts
3491 character ch. See group_nodes_into_DFAstates. */
3492 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3495 /* j-th destination accepts the word character ch. */
3496 trtable
[ch
] = dest_states
[j
];
3497 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3502 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3504 /* The current state accepts newline character. */
3505 for (j
= 0; j
< ndests
; ++j
)
3506 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3508 /* k-th destination accepts newline character. */
3509 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3510 if (need_word_trtable
)
3511 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3512 /* There must be only one destination which accepts
3513 newline. See group_nodes_into_DFAstates. */
3518 if (dest_states_malloced
)
3521 re_node_set_free (&follows
);
3522 for (i
= 0; i
< ndests
; ++i
)
3523 re_node_set_free (dests_node
+ i
);
3525 if (dests_node_malloced
)
3531 /* Group all nodes belonging to STATE into several destinations.
3532 Then for all destinations, set the nodes belonging to the destination
3533 to DESTS_NODE[i] and set the characters accepted by the destination
3534 to DEST_CH[i]. This function return the number of destinations. */
3538 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3539 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3544 int ndests
; /* Number of the destinations from `state'. */
3545 bitset_t accepts
; /* Characters a node can accept. */
3546 const re_node_set
*cur_nodes
= &state
->nodes
;
3547 bitset_empty (accepts
);
3550 /* For all the nodes belonging to `state', */
3551 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3553 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3554 re_token_type_t type
= node
->type
;
3555 unsigned int constraint
= node
->constraint
;
3557 /* Enumerate all single byte character this node can accept. */
3558 if (type
== CHARACTER
)
3559 bitset_set (accepts
, node
->opr
.c
);
3560 else if (type
== SIMPLE_BRACKET
)
3562 bitset_merge (accepts
, node
->opr
.sbcset
);
3564 else if (type
== OP_PERIOD
)
3566 #ifdef RE_ENABLE_I18N
3567 if (dfa
->mb_cur_max
> 1)
3568 bitset_merge (accepts
, dfa
->sb_char
);
3571 bitset_set_all (accepts
);
3572 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3573 bitset_clear (accepts
, '\n');
3574 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3575 bitset_clear (accepts
, '\0');
3577 #ifdef RE_ENABLE_I18N
3578 else if (type
== OP_UTF8_PERIOD
)
3580 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3581 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3582 bitset_clear (accepts
, '\n');
3583 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3584 bitset_clear (accepts
, '\0');
3590 /* Check the `accepts' and sift the characters which are not
3591 match it the context. */
3594 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3596 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3597 bitset_empty (accepts
);
3598 if (accepts_newline
)
3599 bitset_set (accepts
, NEWLINE_CHAR
);
3603 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3605 bitset_empty (accepts
);
3609 if (constraint
& NEXT_WORD_CONSTRAINT
)
3611 bitset_word_t any_set
= 0;
3612 if (type
== CHARACTER
&& !node
->word_char
)
3614 bitset_empty (accepts
);
3617 #ifdef RE_ENABLE_I18N
3618 if (dfa
->mb_cur_max
> 1)
3619 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3620 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3623 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3624 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3628 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3630 bitset_word_t any_set
= 0;
3631 if (type
== CHARACTER
&& node
->word_char
)
3633 bitset_empty (accepts
);
3636 #ifdef RE_ENABLE_I18N
3637 if (dfa
->mb_cur_max
> 1)
3638 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3639 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3642 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3643 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3649 /* Then divide `accepts' into DFA states, or create a new
3650 state. Above, we make sure that accepts is not empty. */
3651 for (j
= 0; j
< ndests
; ++j
)
3653 bitset_t intersec
; /* Intersection sets, see below. */
3655 /* Flags, see below. */
3656 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3658 /* Optimization, skip if this state doesn't accept the character. */
3659 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3662 /* Enumerate the intersection set of this state and `accepts'. */
3664 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3665 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3666 /* And skip if the intersection set is empty. */
3670 /* Then check if this state is a subset of `accepts'. */
3671 not_subset
= not_consumed
= 0;
3672 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3674 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3675 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3678 /* If this state isn't a subset of `accepts', create a
3679 new group state, which has the `remains'. */
3682 bitset_copy (dests_ch
[ndests
], remains
);
3683 bitset_copy (dests_ch
[j
], intersec
);
3684 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3685 if (BE (err
!= REG_NOERROR
, 0))
3690 /* Put the position in the current group. */
3691 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3692 if (BE (result
< 0, 0))
3695 /* If all characters are consumed, go to next node. */
3699 /* Some characters remain, create a new group. */
3702 bitset_copy (dests_ch
[ndests
], accepts
);
3703 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3704 if (BE (err
!= REG_NOERROR
, 0))
3707 bitset_empty (accepts
);
3712 for (j
= 0; j
< ndests
; ++j
)
3713 re_node_set_free (dests_node
+ j
);
3717 #ifdef RE_ENABLE_I18N
3718 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3719 Return the number of the bytes the node accepts.
3720 STR_IDX is the current index of the input string.
3722 This function handles the nodes which can accept one character, or
3723 one collating element like '.', '[a-z]', opposite to the other nodes
3724 can only accept one byte. */
3727 # include <locale/weight.h>
3732 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3733 const re_string_t
*input
, int str_idx
)
3735 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3736 int char_len
, elem_len
;
3739 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3741 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3742 if (BE (c
< 0xc2, 1))
3745 if (str_idx
+ 2 > input
->len
)
3748 d
= re_string_byte_at (input
, str_idx
+ 1);
3750 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3754 if (c
== 0xe0 && d
< 0xa0)
3760 if (c
== 0xf0 && d
< 0x90)
3766 if (c
== 0xf8 && d
< 0x88)
3772 if (c
== 0xfc && d
< 0x84)
3778 if (str_idx
+ char_len
> input
->len
)
3781 for (i
= 1; i
< char_len
; ++i
)
3783 d
= re_string_byte_at (input
, str_idx
+ i
);
3784 if (d
< 0x80 || d
> 0xbf)
3790 char_len
= re_string_char_size_at (input
, str_idx
);
3791 if (node
->type
== OP_PERIOD
)
3795 /* FIXME: I don't think this if is needed, as both '\n'
3796 and '\0' are char_len == 1. */
3797 /* '.' accepts any one character except the following two cases. */
3798 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3799 re_string_byte_at (input
, str_idx
) == '\n') ||
3800 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3801 re_string_byte_at (input
, str_idx
) == '\0'))
3806 elem_len
= re_string_elem_size_at (input
, str_idx
);
3807 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3810 if (node
->type
== COMPLEX_BRACKET
)
3812 const re_charset_t
*cset
= node
->opr
.mbcset
;
3814 const unsigned char *pin
3815 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3820 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3821 ? re_string_wchar_at (input
, str_idx
) : 0);
3823 /* match with multibyte character? */
3824 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3825 if (wc
== cset
->mbchars
[i
])
3827 match_len
= char_len
;
3828 goto check_node_accept_bytes_match
;
3830 /* match with character_class? */
3831 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3833 wctype_t wt
= cset
->char_classes
[i
];
3834 if (__iswctype (wc
, wt
))
3836 match_len
= char_len
;
3837 goto check_node_accept_bytes_match
;
3842 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3845 unsigned int in_collseq
= 0;
3846 const int32_t *table
, *indirect
;
3847 const unsigned char *weights
, *extra
;
3848 const char *collseqwc
;
3850 /* match with collating_symbol? */
3851 if (cset
->ncoll_syms
)
3852 extra
= (const unsigned char *)
3853 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3854 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3856 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3857 /* Compare the length of input collating element and
3858 the length of current collating element. */
3859 if (*coll_sym
!= elem_len
)
3861 /* Compare each bytes. */
3862 for (j
= 0; j
< *coll_sym
; j
++)
3863 if (pin
[j
] != coll_sym
[1 + j
])
3867 /* Match if every bytes is equal. */
3869 goto check_node_accept_bytes_match
;
3875 if (elem_len
<= char_len
)
3877 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3878 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3881 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3883 /* match with range expression? */
3884 for (i
= 0; i
< cset
->nranges
; ++i
)
3885 if (cset
->range_starts
[i
] <= in_collseq
3886 && in_collseq
<= cset
->range_ends
[i
])
3888 match_len
= elem_len
;
3889 goto check_node_accept_bytes_match
;
3892 /* match with equivalence_class? */
3893 if (cset
->nequiv_classes
)
3895 const unsigned char *cp
= pin
;
3896 table
= (const int32_t *)
3897 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3898 weights
= (const unsigned char *)
3899 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3900 extra
= (const unsigned char *)
3901 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3902 indirect
= (const int32_t *)
3903 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3904 int32_t idx
= findidx (table
, indirect
, extra
, &cp
, elem_len
);
3906 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3908 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3909 size_t weight_len
= weights
[idx
& 0xffffff];
3910 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3911 && (idx
>> 24) == (equiv_class_idx
>> 24))
3916 equiv_class_idx
&= 0xffffff;
3918 while (cnt
<= weight_len
3919 && (weights
[equiv_class_idx
+ 1 + cnt
]
3920 == weights
[idx
+ 1 + cnt
]))
3922 if (cnt
> weight_len
)
3924 match_len
= elem_len
;
3925 goto check_node_accept_bytes_match
;
3934 /* match with range expression? */
3936 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3938 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3941 for (i
= 0; i
< cset
->nranges
; ++i
)
3943 cmp_buf
[0] = cset
->range_starts
[i
];
3944 cmp_buf
[4] = cset
->range_ends
[i
];
3945 if (__wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3946 && __wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3948 match_len
= char_len
;
3949 goto check_node_accept_bytes_match
;
3953 check_node_accept_bytes_match
:
3954 if (!cset
->non_match
)
3961 return (elem_len
> char_len
) ? elem_len
: char_len
;
3970 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3972 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3977 /* No valid character. Match it as a single byte character. */
3978 const unsigned char *collseq
= (const unsigned char *)
3979 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3980 return collseq
[mbs
[0]];
3987 const unsigned char *extra
= (const unsigned char *)
3988 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3989 int32_t extrasize
= (const unsigned char *)
3990 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3992 for (idx
= 0; idx
< extrasize
;)
3994 int mbs_cnt
, found
= 0;
3995 int32_t elem_mbs_len
;
3996 /* Skip the name of collating element name. */
3997 idx
= idx
+ extra
[idx
] + 1;
3998 elem_mbs_len
= extra
[idx
++];
3999 if (mbs_len
== elem_mbs_len
)
4001 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4002 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4004 if (mbs_cnt
== elem_mbs_len
)
4005 /* Found the entry. */
4008 /* Skip the byte sequence of the collating element. */
4009 idx
+= elem_mbs_len
;
4010 /* Adjust for the alignment. */
4011 idx
= (idx
+ 3) & ~3;
4012 /* Skip the collation sequence value. */
4013 idx
+= sizeof (uint32_t);
4014 /* Skip the wide char sequence of the collating element. */
4015 idx
= idx
+ sizeof (uint32_t) * (*(int32_t *) (extra
+ idx
) + 1);
4016 /* If we found the entry, return the sequence value. */
4018 return *(uint32_t *) (extra
+ idx
);
4019 /* Skip the collation sequence value. */
4020 idx
+= sizeof (uint32_t);
4026 #endif /* RE_ENABLE_I18N */
4028 /* Check whether the node accepts the byte which is IDX-th
4029 byte of the INPUT. */
4033 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4037 ch
= re_string_byte_at (&mctx
->input
, idx
);
4041 if (node
->opr
.c
!= ch
)
4045 case SIMPLE_BRACKET
:
4046 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4050 #ifdef RE_ENABLE_I18N
4051 case OP_UTF8_PERIOD
:
4057 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4058 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4066 if (node
->constraint
)
4068 /* The node has constraints. Check whether the current context
4069 satisfies the constraints. */
4070 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4072 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4079 /* Extend the buffers, if the buffers have run out. */
4081 static reg_errcode_t
4082 internal_function __attribute_warn_unused_result__
4083 extend_buffers (re_match_context_t
*mctx
, int min_len
)
4086 re_string_t
*pstr
= &mctx
->input
;
4088 /* Avoid overflow. */
4089 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4092 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4093 ret
= re_string_realloc_buffers (pstr
,
4095 MIN (pstr
->len
, pstr
->bufs_len
* 2)));
4096 if (BE (ret
!= REG_NOERROR
, 0))
4099 if (mctx
->state_log
!= NULL
)
4101 /* And double the length of state_log. */
4102 /* XXX We have no indication of the size of this buffer. If this
4103 allocation fail we have no indication that the state_log array
4104 does not have the right size. */
4105 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4106 pstr
->bufs_len
+ 1);
4107 if (BE (new_array
== NULL
, 0))
4109 mctx
->state_log
= new_array
;
4112 /* Then reconstruct the buffers. */
4115 #ifdef RE_ENABLE_I18N
4116 if (pstr
->mb_cur_max
> 1)
4118 ret
= build_wcs_upper_buffer (pstr
);
4119 if (BE (ret
!= REG_NOERROR
, 0))
4123 #endif /* RE_ENABLE_I18N */
4124 build_upper_buffer (pstr
);
4128 #ifdef RE_ENABLE_I18N
4129 if (pstr
->mb_cur_max
> 1)
4130 build_wcs_buffer (pstr
);
4132 #endif /* RE_ENABLE_I18N */
4134 if (pstr
->trans
!= NULL
)
4135 re_string_translate_buffer (pstr
);
4142 /* Functions for matching context. */
4144 /* Initialize MCTX. */
4146 static reg_errcode_t
4147 internal_function __attribute_warn_unused_result__
4148 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4150 mctx
->eflags
= eflags
;
4151 mctx
->match_last
= -1;
4154 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4155 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4156 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4159 /* Already zero-ed by the caller.
4161 mctx->bkref_ents = NULL;
4162 mctx->nbkref_ents = 0;
4163 mctx->nsub_tops = 0; */
4164 mctx
->abkref_ents
= n
;
4165 mctx
->max_mb_elem_len
= 1;
4166 mctx
->asub_tops
= n
;
4170 /* Clean the entries which depend on the current input in MCTX.
4171 This function must be invoked when the matcher changes the start index
4172 of the input, or changes the input string. */
4176 match_ctx_clean (re_match_context_t
*mctx
)
4179 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4182 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4183 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4185 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4186 re_free (last
->path
.array
);
4189 re_free (top
->lasts
);
4192 re_free (top
->path
->array
);
4193 re_free (top
->path
);
4198 mctx
->nsub_tops
= 0;
4199 mctx
->nbkref_ents
= 0;
4202 /* Free all the memory associated with MCTX. */
4206 match_ctx_free (re_match_context_t
*mctx
)
4208 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4209 match_ctx_clean (mctx
);
4210 re_free (mctx
->sub_tops
);
4211 re_free (mctx
->bkref_ents
);
4214 /* Add a new backreference entry to MCTX.
4215 Note that we assume that caller never call this function with duplicate
4216 entry, and call with STR_IDX which isn't smaller than any existing entry.
4219 static reg_errcode_t
4220 internal_function __attribute_warn_unused_result__
4221 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4224 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4226 struct re_backref_cache_entry
* new_entry
;
4227 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4228 mctx
->abkref_ents
* 2);
4229 if (BE (new_entry
== NULL
, 0))
4231 re_free (mctx
->bkref_ents
);
4234 mctx
->bkref_ents
= new_entry
;
4235 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4236 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4237 mctx
->abkref_ents
*= 2;
4239 if (mctx
->nbkref_ents
> 0
4240 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4241 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4243 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4244 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4245 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4246 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4248 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4249 If bit N is clear, means that this entry won't epsilon-transition to
4250 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4251 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4254 A backreference does not epsilon-transition unless it is empty, so set
4255 to all zeros if FROM != TO. */
4256 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4257 = (from
== to
? ~0 : 0);
4259 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4260 if (mctx
->max_mb_elem_len
< to
- from
)
4261 mctx
->max_mb_elem_len
= to
- from
;
4265 /* Search for the first entry which has the same str_idx, or -1 if none is
4266 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4270 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4272 int left
, right
, mid
, last
;
4273 last
= right
= mctx
->nbkref_ents
;
4274 for (left
= 0; left
< right
;)
4276 mid
= (left
+ right
) / 2;
4277 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4282 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4288 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4291 static reg_errcode_t
4292 internal_function __attribute_warn_unused_result__
4293 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4296 assert (mctx
->sub_tops
!= NULL
);
4297 assert (mctx
->asub_tops
> 0);
4299 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4301 int new_asub_tops
= mctx
->asub_tops
* 2;
4302 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4303 re_sub_match_top_t
*,
4305 if (BE (new_array
== NULL
, 0))
4307 mctx
->sub_tops
= new_array
;
4308 mctx
->asub_tops
= new_asub_tops
;
4310 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4311 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4313 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4314 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4318 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4319 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4321 static re_sub_match_last_t
*
4323 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4325 re_sub_match_last_t
*new_entry
;
4326 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4328 int new_alasts
= 2 * subtop
->alasts
+ 1;
4329 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4330 re_sub_match_last_t
*,
4332 if (BE (new_array
== NULL
, 0))
4334 subtop
->lasts
= new_array
;
4335 subtop
->alasts
= new_alasts
;
4337 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4338 if (BE (new_entry
!= NULL
, 1))
4340 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4341 new_entry
->node
= node
;
4342 new_entry
->str_idx
= str_idx
;
4350 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4351 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4353 sctx
->sifted_states
= sifted_sts
;
4354 sctx
->limited_states
= limited_sts
;
4355 sctx
->last_node
= last_node
;
4356 sctx
->last_str_idx
= last_str_idx
;
4357 re_node_set_init_empty (&sctx
->limits
);