1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static inline re_dfastate_t
*acquire_init_state_context
56 (reg_errcode_t
*err
, const re_match_context_t
*mctx
, int idx
)
57 __attribute ((always_inline
)) internal_function
;
58 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
60 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
63 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
64 unsigned int context
) internal_function
;
65 static int check_halt_state_context (const re_match_context_t
*mctx
,
66 const re_dfastate_t
*state
, int idx
)
68 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
,
69 regmatch_t
*prev_idx_match
, int cur_node
,
70 int cur_idx
, int nmatch
) internal_function
;
71 static int proceed_next_node (const re_match_context_t
*mctx
,
72 int nregs
, regmatch_t
*regs
,
73 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
74 struct re_fail_stack_t
*fs
) internal_function
;
75 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
76 int str_idx
, int dest_node
, int nregs
,
78 re_node_set
*eps_via_nodes
) internal_function
;
79 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
80 regmatch_t
*regs
, re_node_set
*eps_via_nodes
) internal_function
;
81 static reg_errcode_t
set_regs (const regex_t
*preg
,
82 const re_match_context_t
*mctx
,
83 size_t nmatch
, regmatch_t
*pmatch
,
84 int fl_backtrack
) internal_function
;
85 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
) internal_function
;
88 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
89 re_sift_context_t
*sctx
,
90 int node_idx
, int str_idx
, int max_str_idx
) internal_function
;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t
sift_states_backward (re_match_context_t
*mctx
,
93 re_sift_context_t
*sctx
) internal_function
;
94 static reg_errcode_t
build_sifted_states (re_match_context_t
*mctx
,
95 re_sift_context_t
*sctx
, int str_idx
,
96 re_node_set
*cur_dest
) internal_function
;
97 static reg_errcode_t
update_cur_sifted_state (re_match_context_t
*mctx
,
98 re_sift_context_t
*sctx
,
100 re_node_set
*dest_nodes
) internal_function
;
101 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
102 re_node_set
*dest_nodes
,
103 const re_node_set
*candidates
) internal_function
;
104 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
105 re_node_set
*dest_nodes
,
106 const re_node_set
*and_nodes
) internal_function
;
107 static int check_dst_limits (re_match_context_t
*mctx
, re_node_set
*limits
,
108 int dst_node
, int dst_idx
, int src_node
,
109 int src_idx
) internal_function
;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t
*mctx
,
111 int boundaries
, int subexp_idx
,
112 int from_node
, int bkref_idx
) internal_function
;
113 static int check_dst_limits_calc_pos (re_match_context_t
*mctx
,
114 int limit
, int subexp_idx
,
115 int node
, int str_idx
,
116 int bkref_idx
) internal_function
;
117 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
118 re_node_set
*dest_nodes
,
119 const re_node_set
*candidates
,
121 struct re_backref_cache_entry
*bkref_ents
,
122 int str_idx
) internal_function
;
123 static reg_errcode_t
sift_states_bkref (re_match_context_t
*mctx
,
124 re_sift_context_t
*sctx
,
125 int str_idx
, const re_node_set
*candidates
) internal_function
;
126 static reg_errcode_t
clean_state_log_if_needed (re_match_context_t
*mctx
,
127 int next_state_log_idx
) internal_function
;
128 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
129 re_dfastate_t
**src
, int num
) internal_function
;
130 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
) internal_function
;
132 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
133 re_match_context_t
*mctx
,
134 re_dfastate_t
*state
) internal_function
;
135 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
136 re_match_context_t
*mctx
,
137 re_dfastate_t
*next_state
) internal_function
;
138 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
139 re_node_set
*cur_nodes
,
140 int str_idx
) internal_function
;
142 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
143 re_match_context_t
*mctx
,
144 re_dfastate_t
*pstate
) internal_function
;
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
148 re_dfastate_t
*pstate
) internal_function
;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
151 const re_node_set
*nodes
) internal_function
;
152 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
153 int bkref_node
, int bkref_str_idx
) internal_function
;
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
) internal_function
;
158 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
159 int subexp_idx
, int type
) internal_function
;
160 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
161 state_array_t
*path
, int top_node
,
162 int top_str
, int last_node
, int last_str
,
163 int type
) internal_function
;
164 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
166 re_node_set
*cur_nodes
,
167 re_node_set
*next_nodes
) internal_function
;
168 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
169 re_node_set
*cur_nodes
,
170 int ex_subexp
, int type
) internal_function
;
171 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
172 re_node_set
*dst_nodes
,
173 int target
, int ex_subexp
,
174 int type
) internal_function
;
175 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
176 re_node_set
*cur_nodes
, int cur_str
,
177 int subexp_num
, int type
) internal_function
;
178 static int build_trtable (re_dfa_t
*dfa
,
179 re_dfastate_t
*state
) internal_function
;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t
*dfa
, int node_idx
,
182 const re_string_t
*input
, int idx
) internal_function
;
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
185 size_t name_len
) internal_function
;
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t
*dfa
,
189 const re_dfastate_t
*state
,
190 re_node_set
*states_node
,
191 bitset
*states_ch
) internal_function
;
192 static int check_node_accept (const re_match_context_t
*mctx
,
193 const re_token_t
*node
, int idx
) internal_function
;
194 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
) internal_function
;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
214 const regex_t
*__restrict preg
;
215 const char *__restrict string
;
223 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
226 if (eflags
& REG_STARTEND
)
228 start
= pmatch
[0].rm_so
;
229 length
= pmatch
[0].rm_eo
;
234 length
= strlen (string
);
237 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
238 length
, 0, NULL
, eflags
);
240 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
241 length
, nmatch
, pmatch
, eflags
);
242 return err
!= REG_NOERROR
;
246 # include <shlib-compat.h>
247 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
250 __typeof__ (__regexec
) __compat_regexec
;
253 attribute_compat_text_section
254 __compat_regexec (const regex_t
*__restrict preg
,
255 const char *__restrict string
, size_t nmatch
,
256 regmatch_t pmatch
[], int eflags
)
258 return regexec (preg
, string
, nmatch
, pmatch
,
259 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
261 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
265 /* Entry points for GNU code. */
267 /* re_match, re_search, re_match_2, re_search_2
269 The former two functions operate on STRING with length LENGTH,
270 while the later two operate on concatenation of STRING1 and STRING2
271 with lengths LENGTH1 and LENGTH2, respectively.
273 re_match() matches the compiled pattern in BUFP against the string,
274 starting at index START.
276 re_search() first tries matching at index START, then it tries to match
277 starting from index START + 1, and so on. The last start position tried
278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
282 the first STOP characters of the concatenation of the strings should be
285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
287 computed relative to the concatenation, not relative to the individual
290 On success, re_match* functions return the length of the match, re_search*
291 return the position of the start of the match. Return value -1 means no
292 match was found and -2 indicates an internal error. */
295 re_match (bufp
, string
, length
, start
, regs
)
296 struct re_pattern_buffer
*bufp
;
299 struct re_registers
*regs
;
301 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
304 weak_alias (__re_match
, re_match
)
308 re_search (bufp
, string
, length
, start
, range
, regs
)
309 struct re_pattern_buffer
*bufp
;
311 int length
, start
, range
;
312 struct re_registers
*regs
;
314 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
317 weak_alias (__re_search
, re_search
)
321 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
322 struct re_pattern_buffer
*bufp
;
323 const char *string1
, *string2
;
324 int length1
, length2
, start
, stop
;
325 struct re_registers
*regs
;
327 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
328 start
, 0, regs
, stop
, 1);
331 weak_alias (__re_match_2
, re_match_2
)
335 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
336 struct re_pattern_buffer
*bufp
;
337 const char *string1
, *string2
;
338 int length1
, length2
, start
, range
, stop
;
339 struct re_registers
*regs
;
341 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
342 start
, range
, regs
, stop
, 0);
345 weak_alias (__re_search_2
, re_search_2
)
349 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
351 struct re_pattern_buffer
*bufp
;
352 const char *string1
, *string2
;
353 int length1
, length2
, start
, range
, stop
, ret_len
;
354 struct re_registers
*regs
;
358 int len
= length1
+ length2
;
361 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
364 /* Concatenate the strings. */
368 char *s
= re_malloc (char, len
);
370 if (BE (s
== NULL
, 0))
372 memcpy (s
, string1
, length1
);
373 memcpy (s
+ length1
, string2
, length2
);
382 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
385 re_free ((char *) str
);
389 /* The parameters have the same meaning as those of re_search.
390 Additional parameters:
391 If RET_LEN is nonzero the length of the match is returned (re_match style);
392 otherwise the position of the match is returned. */
395 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
396 struct re_pattern_buffer
*bufp
;
398 int length
, start
, range
, stop
, ret_len
;
399 struct re_registers
*regs
;
401 reg_errcode_t result
;
406 /* Check for out-of-range. */
407 if (BE (start
< 0 || start
> length
, 0))
409 if (BE (start
+ range
> length
, 0))
410 range
= length
- start
;
411 else if (BE (start
+ range
< 0, 0))
414 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
415 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
417 /* Compile fastmap if we haven't yet. */
418 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
419 re_compile_fastmap (bufp
);
421 if (BE (bufp
->no_sub
, 0))
424 /* We need at least 1 register. */
427 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
428 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
430 nregs
= regs
->num_regs
;
431 if (BE (nregs
< 1, 0))
433 /* Nothing can be copied to regs. */
439 nregs
= bufp
->re_nsub
+ 1;
440 pmatch
= re_malloc (regmatch_t
, nregs
);
441 if (BE (pmatch
== NULL
, 0))
444 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
445 nregs
, pmatch
, eflags
);
449 /* I hope we needn't fill ther regs with -1's when no match was found. */
450 if (result
!= REG_NOERROR
)
452 else if (regs
!= NULL
)
454 /* If caller wants register contents data back, copy them. */
455 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
456 bufp
->regs_allocated
);
457 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
461 if (BE (rval
== 0, 1))
465 assert (pmatch
[0].rm_so
== start
);
466 rval
= pmatch
[0].rm_eo
- start
;
469 rval
= pmatch
[0].rm_so
;
476 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
477 struct re_registers
*regs
;
479 int nregs
, regs_allocated
;
481 int rval
= REGS_REALLOCATE
;
483 int need_regs
= nregs
+ 1;
484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
487 /* Have the register data arrays been allocated? */
488 if (regs_allocated
== REGS_UNALLOCATED
)
489 { /* No. So allocate them with malloc. */
490 regs
->start
= re_malloc (regoff_t
, need_regs
);
491 regs
->end
= re_malloc (regoff_t
, need_regs
);
492 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
493 return REGS_UNALLOCATED
;
494 regs
->num_regs
= need_regs
;
496 else if (regs_allocated
== REGS_REALLOCATE
)
497 { /* Yes. If we need more elements than were already
498 allocated, reallocate them. If we need fewer, just
500 if (BE (need_regs
> regs
->num_regs
, 0))
502 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
503 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
504 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
505 return REGS_UNALLOCATED
;
506 regs
->start
= new_start
;
508 regs
->num_regs
= need_regs
;
513 assert (regs_allocated
== REGS_FIXED
);
514 /* This function may not be called with REGS_FIXED and nregs too big. */
515 assert (regs
->num_regs
>= nregs
);
520 for (i
= 0; i
< nregs
; ++i
)
522 regs
->start
[i
] = pmatch
[i
].rm_so
;
523 regs
->end
[i
] = pmatch
[i
].rm_eo
;
525 for ( ; i
< regs
->num_regs
; ++i
)
526 regs
->start
[i
] = regs
->end
[i
] = -1;
531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
533 this memory for recording register information. STARTS and ENDS
534 must be allocated using the malloc library routine, and must each
535 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537 If NUM_REGS == 0, then subsequent matches should allocate their own
540 Unless this function is called, the first search or match using
541 PATTERN_BUFFER will allocate its own register data, without
542 freeing the old data. */
545 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
546 struct re_pattern_buffer
*bufp
;
547 struct re_registers
*regs
;
549 regoff_t
*starts
, *ends
;
553 bufp
->regs_allocated
= REGS_REALLOCATE
;
554 regs
->num_regs
= num_regs
;
555 regs
->start
= starts
;
560 bufp
->regs_allocated
= REGS_UNALLOCATED
;
562 regs
->start
= regs
->end
= (regoff_t
*) 0;
566 weak_alias (__re_set_registers
, re_set_registers
)
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
580 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
582 #endif /* _REGEX_RE_COMP */
584 /* Internal entry point. */
586 /* Searches for a compiled pattern PREG in the string STRING, whose
587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
588 mingings with regexec. START, and RANGE have the same meanings
590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
591 otherwise return the error code.
592 Note: We assume front end functions already check ranges.
593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
596 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
600 int length
, start
, range
, stop
, eflags
;
605 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
606 int left_lim
, right_lim
, incr
;
607 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
610 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
611 re_match_context_t mctx
= { .dfa
= dfa
};
613 re_match_context_t mctx
;
615 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
616 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
617 unsigned RE_TRANSLATE_TYPE t
= (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
620 memset (&mctx
, '\0', sizeof (re_match_context_t
));
624 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
625 nmatch
-= extra_nmatch
;
627 /* Check if the DFA haven't been compiled. */
628 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
629 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
630 || dfa
->init_state_begbuf
== NULL
, 0))
634 /* We assume front-end functions already check them. */
635 assert (start
+ range
>= 0 && start
+ range
<= length
);
638 /* If initial states with non-begbuf contexts have no elements,
639 the regex must be anchored. If preg->newline_anchor is set,
640 we'll never use init_state_nl, so do not check it. */
641 if (dfa
->init_state
->nodes
.nelem
== 0
642 && dfa
->init_state_word
->nodes
.nelem
== 0
643 && (dfa
->init_state_nl
->nodes
.nelem
== 0
644 || !preg
->newline_anchor
))
646 if (start
!= 0 && start
+ range
!= 0)
651 /* We must check the longest matching, if nmatch > 0. */
652 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
654 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
655 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
656 if (BE (err
!= REG_NOERROR
, 0))
658 mctx
.input
.stop
= stop
;
659 mctx
.input
.raw_stop
= stop
;
660 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
662 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
663 if (BE (err
!= REG_NOERROR
, 0))
666 /* We will log all the DFA states through which the dfa pass,
667 if nmatch > 1, or this dfa has "multibyte node", which is a
668 back-reference or a node which can accept multibyte character or
669 multi character collating element. */
670 if (nmatch
> 1 || dfa
->has_mb_node
)
672 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
673 if (BE (mctx
.state_log
== NULL
, 0))
680 mctx
.state_log
= NULL
;
683 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
684 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
686 /* Check incrementally whether of not the input string match. */
687 incr
= (range
< 0) ? -1 : 1;
688 left_lim
= (range
< 0) ? start
+ range
: start
;
689 right_lim
= (range
< 0) ? start
: start
+ range
;
690 sb
= dfa
->mb_cur_max
== 1;
693 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
694 | (range
>= 0 ? 2 : 0)
695 | (t
!= NULL
? 1 : 0))
698 for (;; match_first
+= incr
)
701 if (match_first
< left_lim
|| right_lim
< match_first
)
704 /* Advance as rapidly as possible through the string, until we
705 find a plausible place to start matching. This may be done
706 with varying efficiency, so there are various possibilities:
707 only the most common of them are specialized, in order to
708 save on code size. We use a switch statement for speed. */
716 /* Fastmap with single-byte translation, match forward. */
717 while (BE (match_first
< right_lim
, 1)
718 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
720 goto forward_match_found_start_or_reached_end
;
723 /* Fastmap without translation, match forward. */
724 while (BE (match_first
< right_lim
, 1)
725 && !fastmap
[(unsigned char) string
[match_first
]])
728 forward_match_found_start_or_reached_end
:
729 if (BE (match_first
== right_lim
, 0))
731 ch
= match_first
>= length
732 ? 0 : (unsigned char) string
[match_first
];
733 if (!fastmap
[t
? t
[ch
] : ch
])
740 /* Fastmap without multi-byte translation, match backwards. */
741 while (match_first
>= left_lim
)
743 ch
= match_first
>= length
744 ? 0 : (unsigned char) string
[match_first
];
745 if (fastmap
[t
? t
[ch
] : ch
])
749 if (match_first
< left_lim
)
754 /* In this case, we can't determine easily the current byte,
755 since it might be a component byte of a multibyte
756 character. Then we use the constructed buffer instead. */
759 /* If MATCH_FIRST is out of the valid range, reconstruct the
761 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
762 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
764 err
= re_string_reconstruct (&mctx
.input
, match_first
,
766 if (BE (err
!= REG_NOERROR
, 0))
769 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
771 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
772 Note that MATCH_FIRST must not be smaller than 0. */
773 ch
= (match_first
>= length
774 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
778 if (match_first
< left_lim
|| match_first
> right_lim
)
787 /* Reconstruct the buffers so that the matcher can assume that
788 the matching starts from the beginning of the buffer. */
789 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
790 if (BE (err
!= REG_NOERROR
, 0))
793 #ifdef RE_ENABLE_I18N
794 /* Don't consider this char as a possible match start if it part,
795 yet isn't the head, of a multibyte character. */
796 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
800 /* It seems to be appropriate one, then use the matcher. */
801 /* We assume that the matching starts from 0. */
802 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
803 match_last
= check_matching (&mctx
, fl_longest_match
,
804 range
>= 0 ? &match_first
: NULL
);
805 if (match_last
!= -1)
807 if (BE (match_last
== -2, 0))
814 mctx
.match_last
= match_last
;
815 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
817 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
818 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
821 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
824 err
= prune_impossible_nodes (&mctx
);
825 if (err
== REG_NOERROR
)
827 if (BE (err
!= REG_NOMATCH
, 0))
832 break; /* We found a match. */
836 match_ctx_clean (&mctx
);
840 assert (match_last
!= -1);
841 assert (err
== REG_NOERROR
);
844 /* Set pmatch[] if we need. */
849 /* Initialize registers. */
850 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
851 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
853 /* Set the points where matching start/end. */
855 pmatch
[0].rm_eo
= mctx
.match_last
;
857 if (!preg
->no_sub
&& nmatch
> 1)
859 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
860 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
861 if (BE (err
!= REG_NOERROR
, 0))
865 /* At last, add the offset to the each registers, since we slided
866 the buffers so that we could assume that the matching starts
868 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
869 if (pmatch
[reg_idx
].rm_so
!= -1)
871 #ifdef RE_ENABLE_I18N
872 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
874 if (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
)
875 pmatch
[reg_idx
].rm_so
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
877 pmatch
[reg_idx
].rm_so
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
];
878 if (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
)
879 pmatch
[reg_idx
].rm_eo
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
881 pmatch
[reg_idx
].rm_eo
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
];
884 assert (mctx
.input
.offsets_needed
== 0);
886 pmatch
[reg_idx
].rm_so
+= match_first
;
887 pmatch
[reg_idx
].rm_eo
+= match_first
;
889 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
891 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
892 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
896 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
897 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
899 pmatch
[reg_idx
+ 1].rm_so
900 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
901 pmatch
[reg_idx
+ 1].rm_eo
902 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
907 re_free (mctx
.state_log
);
909 match_ctx_free (&mctx
);
910 re_string_destruct (&mctx
.input
);
915 prune_impossible_nodes (mctx
)
916 re_match_context_t
*mctx
;
918 re_dfa_t
*const dfa
= mctx
->dfa
;
919 int halt_node
, match_last
;
921 re_dfastate_t
**sifted_states
;
922 re_dfastate_t
**lim_states
= NULL
;
923 re_sift_context_t sctx
;
925 assert (mctx
->state_log
!= NULL
);
927 match_last
= mctx
->match_last
;
928 halt_node
= mctx
->last_node
;
929 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
930 if (BE (sifted_states
== NULL
, 0))
937 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
938 if (BE (lim_states
== NULL
, 0))
945 memset (lim_states
, '\0',
946 sizeof (re_dfastate_t
*) * (match_last
+ 1));
947 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
949 ret
= sift_states_backward (mctx
, &sctx
);
950 re_node_set_free (&sctx
.limits
);
951 if (BE (ret
!= REG_NOERROR
, 0))
953 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
963 } while (mctx
->state_log
[match_last
] == NULL
964 || !mctx
->state_log
[match_last
]->halt
);
965 halt_node
= check_halt_state_context (mctx
,
966 mctx
->state_log
[match_last
],
969 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
971 re_free (lim_states
);
973 if (BE (ret
!= REG_NOERROR
, 0))
978 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
979 ret
= sift_states_backward (mctx
, &sctx
);
980 re_node_set_free (&sctx
.limits
);
981 if (BE (ret
!= REG_NOERROR
, 0))
984 re_free (mctx
->state_log
);
985 mctx
->state_log
= sifted_states
;
986 sifted_states
= NULL
;
987 mctx
->last_node
= halt_node
;
988 mctx
->match_last
= match_last
;
991 re_free (sifted_states
);
992 re_free (lim_states
);
996 /* Acquire an initial state and return it.
997 We must select appropriate initial state depending on the context,
998 since initial states may have constraints like "\<", "^", etc.. */
1000 static inline re_dfastate_t
*
1001 acquire_init_state_context (err
, mctx
, idx
)
1003 const re_match_context_t
*mctx
;
1006 re_dfa_t
*const dfa
= mctx
->dfa
;
1007 if (dfa
->init_state
->has_constraint
)
1009 unsigned int context
;
1010 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1011 if (IS_WORD_CONTEXT (context
))
1012 return dfa
->init_state_word
;
1013 else if (IS_ORDINARY_CONTEXT (context
))
1014 return dfa
->init_state
;
1015 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1016 return dfa
->init_state_begbuf
;
1017 else if (IS_NEWLINE_CONTEXT (context
))
1018 return dfa
->init_state_nl
;
1019 else if (IS_BEGBUF_CONTEXT (context
))
1021 /* It is relatively rare case, then calculate on demand. */
1022 return re_acquire_state_context (err
, dfa
,
1023 dfa
->init_state
->entrance_nodes
,
1027 /* Must not happen? */
1028 return dfa
->init_state
;
1031 return dfa
->init_state
;
1034 /* Check whether the regular expression match input string INPUT or not,
1035 and return the index where the matching end, return -1 if not match,
1036 or return -2 in case of an error.
1037 FL_LONGEST_MATCH means we want the POSIX longest matching.
1038 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1039 next place where we may want to try matching.
1040 Note that the matcher assume that the maching starts from the current
1041 index of the buffer. */
1044 check_matching (mctx
, fl_longest_match
, p_match_first
)
1045 re_match_context_t
*mctx
;
1046 int fl_longest_match
;
1049 re_dfa_t
*const dfa
= mctx
->dfa
;
1052 int match_last
= -1;
1053 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1054 re_dfastate_t
*cur_state
;
1055 int at_init_state
= p_match_first
!= NULL
;
1056 int next_start_idx
= cur_str_idx
;
1059 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1060 /* An initial state must not be NULL (invalid). */
1061 if (BE (cur_state
== NULL
, 0))
1063 assert (err
== REG_ESPACE
);
1067 if (mctx
->state_log
!= NULL
)
1069 mctx
->state_log
[cur_str_idx
] = cur_state
;
1071 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1072 later. E.g. Processing back references. */
1073 if (BE (dfa
->nbackref
, 0))
1076 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1077 if (BE (err
!= REG_NOERROR
, 0))
1080 if (cur_state
->has_backref
)
1082 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1083 if (BE (err
!= REG_NOERROR
, 0))
1089 /* If the RE accepts NULL string. */
1090 if (BE (cur_state
->halt
, 0))
1092 if (!cur_state
->has_constraint
1093 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1095 if (!fl_longest_match
)
1099 match_last
= cur_str_idx
;
1105 while (!re_string_eoi (&mctx
->input
))
1107 re_dfastate_t
*old_state
= cur_state
;
1108 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1110 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1111 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1112 && mctx
->input
.valid_len
< mctx
->input
.len
))
1114 err
= extend_buffers (mctx
);
1115 if (BE (err
!= REG_NOERROR
, 0))
1117 assert (err
== REG_ESPACE
);
1122 cur_state
= transit_state (&err
, mctx
, cur_state
);
1123 if (mctx
->state_log
!= NULL
)
1124 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1126 if (cur_state
== NULL
)
1128 /* Reached the invalid state or an error. Try to recover a valid
1129 state using the state log, if available and if we have not
1130 already found a valid (even if not the longest) match. */
1131 if (BE (err
!= REG_NOERROR
, 0))
1134 if (mctx
->state_log
== NULL
1135 || (match
&& !fl_longest_match
)
1136 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1140 if (BE (at_init_state
, 0))
1142 if (old_state
== cur_state
)
1143 next_start_idx
= next_char_idx
;
1148 if (cur_state
->halt
)
1150 /* Reached a halt state.
1151 Check the halt state can satisfy the current context. */
1152 if (!cur_state
->has_constraint
1153 || check_halt_state_context (mctx
, cur_state
,
1154 re_string_cur_idx (&mctx
->input
)))
1156 /* We found an appropriate halt state. */
1157 match_last
= re_string_cur_idx (&mctx
->input
);
1160 /* We found a match, do not modify match_first below. */
1161 p_match_first
= NULL
;
1162 if (!fl_longest_match
)
1169 *p_match_first
+= next_start_idx
;
1174 /* Check NODE match the current context. */
1176 static int check_halt_node_context (dfa
, node
, context
)
1177 const re_dfa_t
*dfa
;
1179 unsigned int context
;
1181 re_token_type_t type
= dfa
->nodes
[node
].type
;
1182 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1183 if (type
!= END_OF_RE
)
1187 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1192 /* Check the halt state STATE match the current context.
1193 Return 0 if not match, if the node, STATE has, is a halt node and
1194 match the context, return the node. */
1197 check_halt_state_context (mctx
, state
, idx
)
1198 const re_match_context_t
*mctx
;
1199 const re_dfastate_t
*state
;
1203 unsigned int context
;
1205 assert (state
->halt
);
1207 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1208 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1209 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1210 return state
->nodes
.elems
[i
];
1214 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1215 corresponding to the DFA).
1216 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1220 proceed_next_node (mctx
, nregs
, regs
, pidx
, node
, eps_via_nodes
, fs
)
1221 const re_match_context_t
*mctx
;
1223 int nregs
, *pidx
, node
;
1224 re_node_set
*eps_via_nodes
;
1225 struct re_fail_stack_t
*fs
;
1227 re_dfa_t
*const dfa
= mctx
->dfa
;
1228 int i
, err
, dest_node
;
1230 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1232 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1233 re_node_set
*edests
= &dfa
->edests
[node
];
1235 err
= re_node_set_insert (eps_via_nodes
, node
);
1236 if (BE (err
< 0, 0))
1238 /* Pick up a valid destination, or return -1 if none is found. */
1239 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1241 int candidate
= edests
->elems
[i
];
1242 if (!re_node_set_contains (cur_nodes
, candidate
))
1244 if (dest_node
== -1)
1245 dest_node
= candidate
;
1249 /* In order to avoid infinite loop like "(a*)*", return the second
1250 epsilon-transition if the first was already considered. */
1251 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1254 /* Otherwise, push the second epsilon-transition on the fail stack. */
1256 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1260 /* We know we are going to exit. */
1269 re_token_type_t type
= dfa
->nodes
[node
].type
;
1271 #ifdef RE_ENABLE_I18N
1272 if (dfa
->nodes
[node
].accept_mb
)
1273 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1275 #endif /* RE_ENABLE_I18N */
1276 if (type
== OP_BACK_REF
)
1278 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1279 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1282 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1286 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1287 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1295 err
= re_node_set_insert (eps_via_nodes
, node
);
1296 if (BE (err
< 0, 0))
1298 dest_node
= dfa
->edests
[node
].elems
[0];
1299 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1306 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1308 dest_node
= dfa
->nexts
[node
];
1309 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1310 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1311 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1314 re_node_set_empty (eps_via_nodes
);
1321 static reg_errcode_t
1322 push_fail_stack (fs
, str_idx
, dest_node
, nregs
, regs
, eps_via_nodes
)
1323 struct re_fail_stack_t
*fs
;
1324 int str_idx
, dest_node
, nregs
;
1326 re_node_set
*eps_via_nodes
;
1329 int num
= fs
->num
++;
1330 if (fs
->num
== fs
->alloc
)
1332 struct re_fail_stack_ent_t
*new_array
;
1333 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1335 if (new_array
== NULL
)
1338 fs
->stack
= new_array
;
1340 fs
->stack
[num
].idx
= str_idx
;
1341 fs
->stack
[num
].node
= dest_node
;
1342 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1343 if (fs
->stack
[num
].regs
== NULL
)
1345 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1346 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1351 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1352 struct re_fail_stack_t
*fs
;
1355 re_node_set
*eps_via_nodes
;
1357 int num
= --fs
->num
;
1359 *pidx
= fs
->stack
[num
].idx
;
1360 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1361 re_node_set_free (eps_via_nodes
);
1362 re_free (fs
->stack
[num
].regs
);
1363 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1364 return fs
->stack
[num
].node
;
1367 /* Set the positions where the subexpressions are starts/ends to registers
1369 Note: We assume that pmatch[0] is already set, and
1370 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1372 static reg_errcode_t
1373 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1374 const regex_t
*preg
;
1375 const re_match_context_t
*mctx
;
1380 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1382 re_node_set eps_via_nodes
;
1383 struct re_fail_stack_t
*fs
;
1384 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1385 regmatch_t
*prev_idx_match
;
1388 assert (nmatch
> 1);
1389 assert (mctx
->state_log
!= NULL
);
1394 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1395 if (fs
->stack
== NULL
)
1401 cur_node
= dfa
->init_node
;
1402 re_node_set_init_empty (&eps_via_nodes
);
1404 prev_idx_match
= (regmatch_t
*) alloca (sizeof (regmatch_t
) * nmatch
);
1405 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1407 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1409 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1411 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1416 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1417 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1419 if (reg_idx
== nmatch
)
1421 re_node_set_free (&eps_via_nodes
);
1422 return free_fail_stack_return (fs
);
1424 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1429 re_node_set_free (&eps_via_nodes
);
1434 /* Proceed to next node. */
1435 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1436 &eps_via_nodes
, fs
);
1438 if (BE (cur_node
< 0, 0))
1440 if (BE (cur_node
== -2, 0))
1442 re_node_set_free (&eps_via_nodes
);
1443 free_fail_stack_return (fs
);
1447 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1451 re_node_set_free (&eps_via_nodes
);
1456 re_node_set_free (&eps_via_nodes
);
1457 return free_fail_stack_return (fs
);
1460 static reg_errcode_t
1461 free_fail_stack_return (fs
)
1462 struct re_fail_stack_t
*fs
;
1467 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1469 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1470 re_free (fs
->stack
[fs_idx
].regs
);
1472 re_free (fs
->stack
);
1478 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, cur_idx
, nmatch
)
1480 regmatch_t
*pmatch
, *prev_idx_match
;
1481 int cur_node
, cur_idx
, nmatch
;
1483 int type
= dfa
->nodes
[cur_node
].type
;
1484 if (type
== OP_OPEN_SUBEXP
)
1486 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1488 /* We are at the first node of this sub expression. */
1489 if (reg_num
< nmatch
)
1491 pmatch
[reg_num
].rm_so
= cur_idx
;
1492 pmatch
[reg_num
].rm_eo
= -1;
1495 else if (type
== OP_CLOSE_SUBEXP
)
1497 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1498 if (reg_num
< nmatch
)
1500 /* We are at the last node of this sub expression. */
1501 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1503 pmatch
[reg_num
].rm_eo
= cur_idx
;
1504 /* This is a non-empty match or we are not inside an optional
1505 subexpression. Accept this right away. */
1506 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1510 if (dfa
->nodes
[cur_node
].opt_subexp
1511 && prev_idx_match
[reg_num
].rm_so
!= -1)
1512 /* We transited through an empty match for an optional
1513 subexpression, like (a?)*, and this is not the subexp's
1514 first match. Copy back the old content of the registers
1515 so that matches of an inner subexpression are undone as
1516 well, like in ((a?))*. */
1517 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1519 /* We completed a subexpression, but it may be part of
1520 an optional one, so do not update PREV_IDX_MATCH. */
1521 pmatch
[reg_num
].rm_eo
= cur_idx
;
1527 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1528 and sift the nodes in each states according to the following rules.
1529 Updated state_log will be wrote to STATE_LOG.
1531 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1533 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534 the LAST_NODE, we throw away the node `a'.
1535 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1536 string `s' and transit to `b':
1537 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1539 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1540 thrown away, we throw away the node `a'.
1541 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1542 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1544 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1545 we throw away the node `a'. */
1547 #define STATE_NODE_CONTAINS(state,node) \
1548 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1550 static reg_errcode_t
1551 sift_states_backward (mctx
, sctx
)
1552 re_match_context_t
*mctx
;
1553 re_sift_context_t
*sctx
;
1557 int str_idx
= sctx
->last_str_idx
;
1558 re_node_set cur_dest
;
1561 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1564 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1565 transit to the last_node and the last_node itself. */
1566 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1567 if (BE (err
!= REG_NOERROR
, 0))
1569 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1570 if (BE (err
!= REG_NOERROR
, 0))
1573 /* Then check each states in the state_log. */
1576 /* Update counters. */
1577 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1578 if (null_cnt
> mctx
->max_mb_elem_len
)
1580 memset (sctx
->sifted_states
, '\0',
1581 sizeof (re_dfastate_t
*) * str_idx
);
1582 re_node_set_free (&cur_dest
);
1585 re_node_set_empty (&cur_dest
);
1588 if (mctx
->state_log
[str_idx
])
1590 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1591 if (BE (err
!= REG_NOERROR
, 0))
1595 /* Add all the nodes which satisfy the following conditions:
1596 - It can epsilon transit to a node in CUR_DEST.
1598 And update state_log. */
1599 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1600 if (BE (err
!= REG_NOERROR
, 0))
1605 re_node_set_free (&cur_dest
);
1609 static reg_errcode_t
1610 build_sifted_states (mctx
, sctx
, str_idx
, cur_dest
)
1611 re_match_context_t
*mctx
;
1612 re_sift_context_t
*sctx
;
1614 re_node_set
*cur_dest
;
1616 re_dfa_t
*const dfa
= mctx
->dfa
;
1617 re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1620 /* Then build the next sifted state.
1621 We build the next sifted state on `cur_dest', and update
1622 `sifted_states[str_idx]' with `cur_dest'.
1624 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1625 `cur_src' points the node_set of the old `state_log[str_idx]'
1626 (with the epsilon nodes pre-filtered out). */
1627 for (i
= 0; i
< cur_src
->nelem
; i
++)
1629 int prev_node
= cur_src
->elems
[i
];
1634 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1635 assert (!IS_EPSILON_NODE (type
));
1637 #ifdef RE_ENABLE_I18N
1638 /* If the node may accept `multi byte'. */
1639 if (dfa
->nodes
[prev_node
].accept_mb
)
1640 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1641 str_idx
, sctx
->last_str_idx
);
1642 #endif /* RE_ENABLE_I18N */
1644 /* We don't check backreferences here.
1645 See update_cur_sifted_state(). */
1647 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1648 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1649 dfa
->nexts
[prev_node
]))
1655 if (sctx
->limits
.nelem
)
1657 int to_idx
= str_idx
+ naccepted
;
1658 if (check_dst_limits (mctx
, &sctx
->limits
,
1659 dfa
->nexts
[prev_node
], to_idx
,
1660 prev_node
, str_idx
))
1663 ret
= re_node_set_insert (cur_dest
, prev_node
);
1664 if (BE (ret
== -1, 0))
1671 /* Helper functions. */
1673 static reg_errcode_t
1674 clean_state_log_if_needed (mctx
, next_state_log_idx
)
1675 re_match_context_t
*mctx
;
1676 int next_state_log_idx
;
1678 int top
= mctx
->state_log_top
;
1680 if (next_state_log_idx
>= mctx
->input
.bufs_len
1681 || (next_state_log_idx
>= mctx
->input
.valid_len
1682 && mctx
->input
.valid_len
< mctx
->input
.len
))
1685 err
= extend_buffers (mctx
);
1686 if (BE (err
!= REG_NOERROR
, 0))
1690 if (top
< next_state_log_idx
)
1692 memset (mctx
->state_log
+ top
+ 1, '\0',
1693 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1694 mctx
->state_log_top
= next_state_log_idx
;
1699 static reg_errcode_t
1700 merge_state_array (dfa
, dst
, src
, num
)
1702 re_dfastate_t
**dst
;
1703 re_dfastate_t
**src
;
1708 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1710 if (dst
[st_idx
] == NULL
)
1711 dst
[st_idx
] = src
[st_idx
];
1712 else if (src
[st_idx
] != NULL
)
1714 re_node_set merged_set
;
1715 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1716 &src
[st_idx
]->nodes
);
1717 if (BE (err
!= REG_NOERROR
, 0))
1719 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1720 re_node_set_free (&merged_set
);
1721 if (BE (err
!= REG_NOERROR
, 0))
1728 static reg_errcode_t
1729 update_cur_sifted_state (mctx
, sctx
, str_idx
, dest_nodes
)
1730 re_match_context_t
*mctx
;
1731 re_sift_context_t
*sctx
;
1733 re_node_set
*dest_nodes
;
1735 re_dfa_t
*const dfa
= mctx
->dfa
;
1737 const re_node_set
*candidates
;
1738 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1739 : &mctx
->state_log
[str_idx
]->nodes
);
1741 if (dest_nodes
->nelem
== 0)
1742 sctx
->sifted_states
[str_idx
] = NULL
;
1747 /* At first, add the nodes which can epsilon transit to a node in
1749 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1750 if (BE (err
!= REG_NOERROR
, 0))
1753 /* Then, check the limitations in the current sift_context. */
1754 if (sctx
->limits
.nelem
)
1756 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1757 mctx
->bkref_ents
, str_idx
);
1758 if (BE (err
!= REG_NOERROR
, 0))
1763 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1764 if (BE (err
!= REG_NOERROR
, 0))
1768 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1770 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1771 if (BE (err
!= REG_NOERROR
, 0))
1777 static reg_errcode_t
1778 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1780 re_node_set
*dest_nodes
;
1781 const re_node_set
*candidates
;
1783 reg_errcode_t err
= REG_NOERROR
;
1786 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1787 if (BE (err
!= REG_NOERROR
, 0))
1790 if (!state
->inveclosure
.alloc
)
1792 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1793 if (BE (err
!= REG_NOERROR
, 0))
1795 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1796 re_node_set_merge (&state
->inveclosure
,
1797 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1799 return re_node_set_add_intersect (dest_nodes
, candidates
,
1800 &state
->inveclosure
);
1803 static reg_errcode_t
1804 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1807 re_node_set
*dest_nodes
;
1808 const re_node_set
*candidates
;
1812 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1813 re_node_set except_nodes
;
1814 re_node_set_init_empty (&except_nodes
);
1815 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1817 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1818 if (cur_node
== node
)
1820 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1822 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1823 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1824 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1825 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1826 && re_node_set_contains (dest_nodes
, edst1
))
1828 && !re_node_set_contains (inv_eclosure
, edst2
)
1829 && re_node_set_contains (dest_nodes
, edst2
)))
1831 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1832 dfa
->inveclosures
+ cur_node
);
1833 if (BE (err
!= REG_NOERROR
, 0))
1835 re_node_set_free (&except_nodes
);
1841 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1843 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1844 if (!re_node_set_contains (&except_nodes
, cur_node
))
1846 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1847 re_node_set_remove_at (dest_nodes
, idx
);
1850 re_node_set_free (&except_nodes
);
1855 check_dst_limits (mctx
, limits
, dst_node
, dst_idx
, src_node
, src_idx
)
1856 re_match_context_t
*mctx
;
1857 re_node_set
*limits
;
1858 int dst_node
, dst_idx
, src_node
, src_idx
;
1860 re_dfa_t
*const dfa
= mctx
->dfa
;
1861 int lim_idx
, src_pos
, dst_pos
;
1863 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1864 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1865 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1868 struct re_backref_cache_entry
*ent
;
1869 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1870 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1872 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1873 subexp_idx
, dst_node
, dst_idx
,
1875 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1876 subexp_idx
, src_node
, src_idx
,
1880 <src> <dst> ( <subexp> )
1881 ( <subexp> ) <src> <dst>
1882 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1883 if (src_pos
== dst_pos
)
1884 continue; /* This is unrelated limitation. */
1892 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
, from_node
, bkref_idx
)
1893 re_match_context_t
*mctx
;
1894 int boundaries
, subexp_idx
, from_node
, bkref_idx
;
1896 re_dfa_t
*const dfa
= mctx
->dfa
;
1897 re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1900 /* Else, we are on the boundary: examine the nodes on the epsilon
1902 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1904 int node
= eclosures
->elems
[node_idx
];
1905 switch (dfa
->nodes
[node
].type
)
1908 if (bkref_idx
!= -1)
1910 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1915 if (ent
->node
!= node
)
1918 if (subexp_idx
<= 8 * sizeof (ent
->eps_reachable_subexps_map
)
1919 && !(ent
->eps_reachable_subexps_map
& (1 << subexp_idx
)))
1922 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1923 OP_CLOSE_SUBEXP cases below. But, if the
1924 destination node is the same node as the source
1925 node, don't recurse because it would cause an
1926 infinite loop: a regex that exhibits this behavior
1928 dst
= dfa
->edests
[node
].elems
[0];
1929 if (dst
== from_node
)
1933 else /* if (boundaries & 2) */
1938 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1940 if (cpos
== -1 /* && (boundaries & 1) */)
1942 if (cpos
== 0 && (boundaries
& 2))
1945 ent
->eps_reachable_subexps_map
&= ~(1 << subexp_idx
);
1947 while (ent
++->more
);
1951 case OP_OPEN_SUBEXP
:
1952 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1956 case OP_CLOSE_SUBEXP
:
1957 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1966 return (boundaries
& 2) ? 1 : 0;
1970 check_dst_limits_calc_pos (mctx
, limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
)
1971 re_match_context_t
*mctx
;
1972 int limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
;
1974 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1977 /* If we are outside the range of the subexpression, return -1 or 1. */
1978 if (str_idx
< lim
->subexp_from
)
1981 if (lim
->subexp_to
< str_idx
)
1984 /* If we are within the subexpression, return 0. */
1985 boundaries
= (str_idx
== lim
->subexp_from
);
1986 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
1987 if (boundaries
== 0)
1990 /* Else, examine epsilon closure. */
1991 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1992 from_node
, bkref_idx
);
1995 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1996 which are against limitations from DEST_NODES. */
1998 static reg_errcode_t
1999 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
2001 re_node_set
*dest_nodes
;
2002 const re_node_set
*candidates
;
2003 re_node_set
*limits
;
2004 struct re_backref_cache_entry
*bkref_ents
;
2008 int node_idx
, lim_idx
;
2010 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2013 struct re_backref_cache_entry
*ent
;
2014 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2016 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2017 continue; /* This is unrelated limitation. */
2019 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2020 if (ent
->subexp_to
== str_idx
)
2024 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2026 int node
= dest_nodes
->elems
[node_idx
];
2027 re_token_type_t type
= dfa
->nodes
[node
].type
;
2028 if (type
== OP_OPEN_SUBEXP
2029 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2031 else if (type
== OP_CLOSE_SUBEXP
2032 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2036 /* Check the limitation of the open subexpression. */
2037 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2040 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2042 if (BE (err
!= REG_NOERROR
, 0))
2046 /* Check the limitation of the close subexpression. */
2048 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2050 int node
= dest_nodes
->elems
[node_idx
];
2051 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2053 && !re_node_set_contains (dfa
->eclosures
+ node
,
2056 /* It is against this limitation.
2057 Remove it form the current sifted state. */
2058 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2060 if (BE (err
!= REG_NOERROR
, 0))
2066 else /* (ent->subexp_to != str_idx) */
2068 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2070 int node
= dest_nodes
->elems
[node_idx
];
2071 re_token_type_t type
= dfa
->nodes
[node
].type
;
2072 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2074 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2076 /* It is against this limitation.
2077 Remove it form the current sifted state. */
2078 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2080 if (BE (err
!= REG_NOERROR
, 0))
2089 static reg_errcode_t
2090 sift_states_bkref (mctx
, sctx
, str_idx
, candidates
)
2091 re_match_context_t
*mctx
;
2092 re_sift_context_t
*sctx
;
2094 const re_node_set
*candidates
;
2096 re_dfa_t
*const dfa
= mctx
->dfa
;
2099 re_sift_context_t local_sctx
;
2100 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2102 if (first_idx
== -1)
2105 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2107 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2110 re_token_type_t type
;
2111 struct re_backref_cache_entry
*entry
;
2112 node
= candidates
->elems
[node_idx
];
2113 type
= dfa
->nodes
[node
].type
;
2114 /* Avoid infinite loop for the REs like "()\1+". */
2115 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2117 if (type
!= OP_BACK_REF
)
2120 entry
= mctx
->bkref_ents
+ first_idx
;
2121 enabled_idx
= first_idx
;
2124 int subexp_len
, to_idx
, dst_node
;
2125 re_dfastate_t
*cur_state
;
2127 if (entry
->node
!= node
)
2129 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2130 to_idx
= str_idx
+ subexp_len
;
2131 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2132 : dfa
->edests
[node
].elems
[0]);
2134 if (to_idx
> sctx
->last_str_idx
2135 || sctx
->sifted_states
[to_idx
] == NULL
2136 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2137 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2138 str_idx
, dst_node
, to_idx
))
2141 if (local_sctx
.sifted_states
== NULL
)
2144 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2145 if (BE (err
!= REG_NOERROR
, 0))
2148 local_sctx
.last_node
= node
;
2149 local_sctx
.last_str_idx
= str_idx
;
2150 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2151 if (BE (err
< 0, 0))
2156 cur_state
= local_sctx
.sifted_states
[str_idx
];
2157 err
= sift_states_backward (mctx
, &local_sctx
);
2158 if (BE (err
!= REG_NOERROR
, 0))
2160 if (sctx
->limited_states
!= NULL
)
2162 err
= merge_state_array (dfa
, sctx
->limited_states
,
2163 local_sctx
.sifted_states
,
2165 if (BE (err
!= REG_NOERROR
, 0))
2168 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2169 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2171 /* mctx->bkref_ents may have changed, reload the pointer. */
2172 entry
= mctx
->bkref_ents
+ enabled_idx
;
2174 while (enabled_idx
++, entry
++->more
);
2178 if (local_sctx
.sifted_states
!= NULL
)
2180 re_node_set_free (&local_sctx
.limits
);
2187 #ifdef RE_ENABLE_I18N
2189 sift_states_iter_mb (mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2190 const re_match_context_t
*mctx
;
2191 re_sift_context_t
*sctx
;
2192 int node_idx
, str_idx
, max_str_idx
;
2194 re_dfa_t
*const dfa
= mctx
->dfa
;
2196 /* Check the node can accept `multi byte'. */
2197 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2198 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2199 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2200 dfa
->nexts
[node_idx
]))
2201 /* The node can't accept the `multi byte', or the
2202 destination was already thrown away, then the node
2203 could't accept the current input `multi byte'. */
2205 /* Otherwise, it is sure that the node could accept
2206 `naccepted' bytes input. */
2209 #endif /* RE_ENABLE_I18N */
2212 /* Functions for state transition. */
2214 /* Return the next state to which the current state STATE will transit by
2215 accepting the current input byte, and update STATE_LOG if necessary.
2216 If STATE can accept a multibyte char/collating element/back reference
2217 update the destination of STATE_LOG. */
2219 static re_dfastate_t
*
2220 transit_state (err
, mctx
, state
)
2222 re_match_context_t
*mctx
;
2223 re_dfastate_t
*state
;
2225 re_dfastate_t
**trtable
;
2228 #ifdef RE_ENABLE_I18N
2229 /* If the current state can accept multibyte. */
2230 if (BE (state
->accept_mb
, 0))
2232 *err
= transit_state_mb (mctx
, state
);
2233 if (BE (*err
!= REG_NOERROR
, 0))
2236 #endif /* RE_ENABLE_I18N */
2238 /* Then decide the next state with the single byte. */
2241 /* don't use transition table */
2242 return transit_state_sb (err
, mctx
, state
);
2245 /* Use transition table */
2246 ch
= re_string_fetch_byte (&mctx
->input
);
2249 trtable
= state
->trtable
;
2250 if (BE (trtable
!= NULL
, 1))
2253 trtable
= state
->word_trtable
;
2254 if (BE (trtable
!= NULL
, 1))
2256 unsigned int context
;
2258 = re_string_context_at (&mctx
->input
,
2259 re_string_cur_idx (&mctx
->input
) - 1,
2261 if (IS_WORD_CONTEXT (context
))
2262 return trtable
[ch
+ SBC_MAX
];
2267 if (!build_trtable (mctx
->dfa
, state
))
2273 /* Retry, we now have a transition table. */
2277 /* Update the state_log if we need */
2279 merge_state_with_log (err
, mctx
, next_state
)
2281 re_match_context_t
*mctx
;
2282 re_dfastate_t
*next_state
;
2284 re_dfa_t
*const dfa
= mctx
->dfa
;
2285 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2287 if (cur_idx
> mctx
->state_log_top
)
2289 mctx
->state_log
[cur_idx
] = next_state
;
2290 mctx
->state_log_top
= cur_idx
;
2292 else if (mctx
->state_log
[cur_idx
] == 0)
2294 mctx
->state_log
[cur_idx
] = next_state
;
2298 re_dfastate_t
*pstate
;
2299 unsigned int context
;
2300 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 the destination of a multibyte char/collating element/
2303 back reference. Then the next state is the union set of
2304 these destinations and the results of the transition table. */
2305 pstate
= mctx
->state_log
[cur_idx
];
2306 log_nodes
= pstate
->entrance_nodes
;
2307 if (next_state
!= NULL
)
2309 table_nodes
= next_state
->entrance_nodes
;
2310 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2312 if (BE (*err
!= REG_NOERROR
, 0))
2316 next_nodes
= *log_nodes
;
2317 /* Note: We already add the nodes of the initial state,
2318 then we don't need to add them here. */
2320 context
= re_string_context_at (&mctx
->input
,
2321 re_string_cur_idx (&mctx
->input
) - 1,
2323 next_state
= mctx
->state_log
[cur_idx
]
2324 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2325 /* We don't need to check errors here, since the return value of
2326 this function is next_state and ERR is already set. */
2328 if (table_nodes
!= NULL
)
2329 re_node_set_free (&next_nodes
);
2332 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 later. We must check them here, since the back references in the
2336 next state might use them. */
2337 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2339 if (BE (*err
!= REG_NOERROR
, 0))
2342 /* If the next state has back references. */
2343 if (next_state
->has_backref
)
2345 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2346 if (BE (*err
!= REG_NOERROR
, 0))
2348 next_state
= mctx
->state_log
[cur_idx
];
2355 /* Skip bytes in the input that correspond to part of a
2356 multi-byte match, then look in the log for a state
2357 from which to restart matching. */
2359 find_recover_state (err
, mctx
)
2361 re_match_context_t
*mctx
;
2363 re_dfastate_t
*cur_state
= NULL
;
2366 int max
= mctx
->state_log_top
;
2367 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2371 if (++cur_str_idx
> max
)
2373 re_string_skip_bytes (&mctx
->input
, 1);
2375 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2377 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2379 while (err
== REG_NOERROR
&& cur_state
== NULL
);
2383 /* Helper functions for transit_state. */
2385 /* From the node set CUR_NODES, pick up the nodes whose types are
2386 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2387 expression. And register them to use them later for evaluating the
2388 correspoding back references. */
2390 static reg_errcode_t
2391 check_subexp_matching_top (mctx
, cur_nodes
, str_idx
)
2392 re_match_context_t
*mctx
;
2393 re_node_set
*cur_nodes
;
2396 re_dfa_t
*const dfa
= mctx
->dfa
;
2400 /* TODO: This isn't efficient.
2401 Because there might be more than one nodes whose types are
2402 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2405 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2407 int node
= cur_nodes
->elems
[node_idx
];
2408 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2409 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2410 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2412 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2413 if (BE (err
!= REG_NOERROR
, 0))
2421 /* Return the next state to which the current state STATE will transit by
2422 accepting the current input byte. */
2424 static re_dfastate_t
*
2425 transit_state_sb (err
, mctx
, state
)
2427 re_match_context_t
*mctx
;
2428 re_dfastate_t
*state
;
2430 re_dfa_t
*const dfa
= mctx
->dfa
;
2431 re_node_set next_nodes
;
2432 re_dfastate_t
*next_state
;
2433 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2434 unsigned int context
;
2436 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2437 if (BE (*err
!= REG_NOERROR
, 0))
2439 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2441 int cur_node
= state
->nodes
.elems
[node_cnt
];
2442 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2444 *err
= re_node_set_merge (&next_nodes
,
2445 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2446 if (BE (*err
!= REG_NOERROR
, 0))
2448 re_node_set_free (&next_nodes
);
2453 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2454 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2455 /* We don't need to check errors here, since the return value of
2456 this function is next_state and ERR is already set. */
2458 re_node_set_free (&next_nodes
);
2459 re_string_skip_bytes (&mctx
->input
, 1);
2464 #ifdef RE_ENABLE_I18N
2465 static reg_errcode_t
2466 transit_state_mb (mctx
, pstate
)
2467 re_match_context_t
*mctx
;
2468 re_dfastate_t
*pstate
;
2470 re_dfa_t
*const dfa
= mctx
->dfa
;
2474 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2476 re_node_set dest_nodes
, *new_nodes
;
2477 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2478 int naccepted
, dest_idx
;
2479 unsigned int context
;
2480 re_dfastate_t
*dest_state
;
2482 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2485 if (dfa
->nodes
[cur_node_idx
].constraint
)
2487 context
= re_string_context_at (&mctx
->input
,
2488 re_string_cur_idx (&mctx
->input
),
2490 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2495 /* How many bytes the node can accept? */
2496 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2497 re_string_cur_idx (&mctx
->input
));
2501 /* The node can accepts `naccepted' bytes. */
2502 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2503 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2504 : mctx
->max_mb_elem_len
);
2505 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2506 if (BE (err
!= REG_NOERROR
, 0))
2509 assert (dfa
->nexts
[cur_node_idx
] != -1);
2511 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2513 dest_state
= mctx
->state_log
[dest_idx
];
2514 if (dest_state
== NULL
)
2515 dest_nodes
= *new_nodes
;
2518 err
= re_node_set_init_union (&dest_nodes
,
2519 dest_state
->entrance_nodes
, new_nodes
);
2520 if (BE (err
!= REG_NOERROR
, 0))
2523 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1, mctx
->eflags
);
2524 mctx
->state_log
[dest_idx
]
2525 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2526 if (dest_state
!= NULL
)
2527 re_node_set_free (&dest_nodes
);
2528 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2533 #endif /* RE_ENABLE_I18N */
2535 static reg_errcode_t
2536 transit_state_bkref (mctx
, nodes
)
2537 re_match_context_t
*mctx
;
2538 const re_node_set
*nodes
;
2540 re_dfa_t
*const dfa
= mctx
->dfa
;
2543 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2545 for (i
= 0; i
< nodes
->nelem
; ++i
)
2547 int dest_str_idx
, prev_nelem
, bkc_idx
;
2548 int node_idx
= nodes
->elems
[i
];
2549 unsigned int context
;
2550 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2551 re_node_set
*new_dest_nodes
;
2553 /* Check whether `node' is a backreference or not. */
2554 if (node
->type
!= OP_BACK_REF
)
2557 if (node
->constraint
)
2559 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2561 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2565 /* `node' is a backreference.
2566 Check the substring which the substring matched. */
2567 bkc_idx
= mctx
->nbkref_ents
;
2568 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2569 if (BE (err
!= REG_NOERROR
, 0))
2572 /* And add the epsilon closures (which is `new_dest_nodes') of
2573 the backreference to appropriate state_log. */
2575 assert (dfa
->nexts
[node_idx
] != -1);
2577 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2580 re_dfastate_t
*dest_state
;
2581 struct re_backref_cache_entry
*bkref_ent
;
2582 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2583 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2585 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2586 new_dest_nodes
= (subexp_len
== 0
2587 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2588 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2589 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2590 - bkref_ent
->subexp_from
);
2591 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2593 dest_state
= mctx
->state_log
[dest_str_idx
];
2594 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2595 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2596 /* Add `new_dest_node' to state_log. */
2597 if (dest_state
== NULL
)
2599 mctx
->state_log
[dest_str_idx
]
2600 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2602 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2603 && err
!= REG_NOERROR
, 0))
2608 re_node_set dest_nodes
;
2609 err
= re_node_set_init_union (&dest_nodes
,
2610 dest_state
->entrance_nodes
,
2612 if (BE (err
!= REG_NOERROR
, 0))
2614 re_node_set_free (&dest_nodes
);
2617 mctx
->state_log
[dest_str_idx
]
2618 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2619 re_node_set_free (&dest_nodes
);
2620 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2621 && err
!= REG_NOERROR
, 0))
2624 /* We need to check recursively if the backreference can epsilon
2627 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2629 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2631 if (BE (err
!= REG_NOERROR
, 0))
2633 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2634 if (BE (err
!= REG_NOERROR
, 0))
2644 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2645 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2646 Note that we might collect inappropriate candidates here.
2647 However, the cost of checking them strictly here is too high, then we
2648 delay these checking for prune_impossible_nodes(). */
2650 static reg_errcode_t
2651 get_subexp (mctx
, bkref_node
, bkref_str_idx
)
2652 re_match_context_t
*mctx
;
2653 int bkref_node
, bkref_str_idx
;
2655 re_dfa_t
*const dfa
= mctx
->dfa
;
2656 int subexp_num
, sub_top_idx
;
2657 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2658 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2659 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2660 if (cache_idx
!= -1)
2662 const struct re_backref_cache_entry
*entry
= mctx
->bkref_ents
+ cache_idx
;
2664 if (entry
->node
== bkref_node
)
2665 return REG_NOERROR
; /* We already checked it. */
2666 while (entry
++->more
);
2669 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2671 /* For each sub expression */
2672 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2675 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2676 re_sub_match_last_t
*sub_last
;
2677 int sub_last_idx
, sl_str
, bkref_str_off
;
2679 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2680 continue; /* It isn't related. */
2682 sl_str
= sub_top
->str_idx
;
2683 bkref_str_off
= bkref_str_idx
;
2684 /* At first, check the last node of sub expressions we already
2686 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2689 sub_last
= sub_top
->lasts
[sub_last_idx
];
2690 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2691 /* The matched string by the sub expression match with the substring
2692 at the back reference? */
2693 if (sl_str_diff
> 0)
2695 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2697 /* Not enough chars for a successful match. */
2698 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2701 err
= clean_state_log_if_needed (mctx
,
2704 if (BE (err
!= REG_NOERROR
, 0))
2706 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2708 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2709 break; /* We don't need to search this sub expression any more. */
2711 bkref_str_off
+= sl_str_diff
;
2712 sl_str
+= sl_str_diff
;
2713 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2716 /* Reload buf, since the preceding call might have reallocated
2718 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2720 if (err
== REG_NOMATCH
)
2722 if (BE (err
!= REG_NOERROR
, 0))
2726 if (sub_last_idx
< sub_top
->nlasts
)
2728 if (sub_last_idx
> 0)
2730 /* Then, search for the other last nodes of the sub expression. */
2731 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2733 int cls_node
, sl_str_off
;
2734 const re_node_set
*nodes
;
2735 sl_str_off
= sl_str
- sub_top
->str_idx
;
2736 /* The matched string by the sub expression match with the substring
2737 at the back reference? */
2740 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2742 /* If we are at the end of the input, we cannot match. */
2743 if (bkref_str_off
>= mctx
->input
.len
)
2746 err
= extend_buffers (mctx
);
2747 if (BE (err
!= REG_NOERROR
, 0))
2750 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2752 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2753 break; /* We don't need to search this sub expression
2756 if (mctx
->state_log
[sl_str
] == NULL
)
2758 /* Does this state have a ')' of the sub expression? */
2759 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2760 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2763 if (sub_top
->path
== NULL
)
2765 sub_top
->path
= calloc (sizeof (state_array_t
),
2766 sl_str
- sub_top
->str_idx
+ 1);
2767 if (sub_top
->path
== NULL
)
2770 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2771 in the current context? */
2772 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2773 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2774 if (err
== REG_NOMATCH
)
2776 if (BE (err
!= REG_NOERROR
, 0))
2778 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2779 if (BE (sub_last
== NULL
, 0))
2781 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2783 if (err
== REG_NOMATCH
)
2790 /* Helper functions for get_subexp(). */
2792 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2793 If it can arrive, register the sub expression expressed with SUB_TOP
2796 static reg_errcode_t
2797 get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2798 re_match_context_t
*mctx
;
2799 const re_sub_match_top_t
*sub_top
;
2800 re_sub_match_last_t
*sub_last
;
2801 int bkref_node
, bkref_str
;
2805 /* Can the subexpression arrive the back reference? */
2806 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2807 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2808 if (err
!= REG_NOERROR
)
2810 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2812 if (BE (err
!= REG_NOERROR
, 0))
2814 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2815 return clean_state_log_if_needed (mctx
, to_idx
);
2818 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2819 Search '(' if FL_OPEN, or search ')' otherwise.
2820 TODO: This function isn't efficient...
2821 Because there might be more than one nodes whose types are
2822 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2827 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2828 const re_dfa_t
*dfa
;
2829 const re_node_set
*nodes
;
2830 int subexp_idx
, type
;
2833 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2835 int cls_node
= nodes
->elems
[cls_idx
];
2836 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2837 if (node
->type
== type
2838 && node
->opr
.idx
== subexp_idx
)
2844 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2845 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2847 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2849 static reg_errcode_t
2850 check_arrival (mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2852 re_match_context_t
*mctx
;
2853 state_array_t
*path
;
2854 int top_node
, top_str
, last_node
, last_str
, type
;
2856 re_dfa_t
*const dfa
= mctx
->dfa
;
2858 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2859 re_dfastate_t
*cur_state
= NULL
;
2860 re_node_set
*cur_nodes
, next_nodes
;
2861 re_dfastate_t
**backup_state_log
;
2862 unsigned int context
;
2864 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2865 /* Extend the buffer if we need. */
2866 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2868 re_dfastate_t
**new_array
;
2869 int old_alloc
= path
->alloc
;
2870 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2871 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2872 if (new_array
== NULL
)
2874 path
->alloc
= old_alloc
;
2877 path
->array
= new_array
;
2878 memset (new_array
+ old_alloc
, '\0',
2879 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2882 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2884 /* Temporary modify MCTX. */
2885 backup_state_log
= mctx
->state_log
;
2886 backup_cur_idx
= mctx
->input
.cur_idx
;
2887 mctx
->state_log
= path
->array
;
2888 mctx
->input
.cur_idx
= str_idx
;
2890 /* Setup initial node set. */
2891 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2892 if (str_idx
== top_str
)
2894 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2895 if (BE (err
!= REG_NOERROR
, 0))
2897 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2898 if (BE (err
!= REG_NOERROR
, 0))
2900 re_node_set_free (&next_nodes
);
2906 cur_state
= mctx
->state_log
[str_idx
];
2907 if (cur_state
&& cur_state
->has_backref
)
2909 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2910 if (BE ( err
!= REG_NOERROR
, 0))
2914 re_node_set_init_empty (&next_nodes
);
2916 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2918 if (next_nodes
.nelem
)
2920 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2922 if (BE ( err
!= REG_NOERROR
, 0))
2924 re_node_set_free (&next_nodes
);
2928 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2929 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2931 re_node_set_free (&next_nodes
);
2934 mctx
->state_log
[str_idx
] = cur_state
;
2937 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2939 re_node_set_empty (&next_nodes
);
2940 if (mctx
->state_log
[str_idx
+ 1])
2942 err
= re_node_set_merge (&next_nodes
,
2943 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2944 if (BE (err
!= REG_NOERROR
, 0))
2946 re_node_set_free (&next_nodes
);
2952 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2953 &cur_state
->non_eps_nodes
, &next_nodes
);
2954 if (BE (err
!= REG_NOERROR
, 0))
2956 re_node_set_free (&next_nodes
);
2961 if (next_nodes
.nelem
)
2963 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2964 if (BE (err
!= REG_NOERROR
, 0))
2966 re_node_set_free (&next_nodes
);
2969 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2971 if (BE ( err
!= REG_NOERROR
, 0))
2973 re_node_set_free (&next_nodes
);
2977 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2978 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2979 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2981 re_node_set_free (&next_nodes
);
2984 mctx
->state_log
[str_idx
] = cur_state
;
2985 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2987 re_node_set_free (&next_nodes
);
2988 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2989 : &mctx
->state_log
[last_str
]->nodes
);
2990 path
->next_idx
= str_idx
;
2993 mctx
->state_log
= backup_state_log
;
2994 mctx
->input
.cur_idx
= backup_cur_idx
;
2996 /* Then check the current node set has the node LAST_NODE. */
2997 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3003 /* Helper functions for check_arrival. */
3005 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3007 TODO: This function is similar to the functions transit_state*(),
3008 however this function has many additional works.
3009 Can't we unify them? */
3011 static reg_errcode_t
3012 check_arrival_add_next_nodes (mctx
, str_idx
, cur_nodes
, next_nodes
)
3013 re_match_context_t
*mctx
;
3015 re_node_set
*cur_nodes
, *next_nodes
;
3017 re_dfa_t
*const dfa
= mctx
->dfa
;
3021 re_node_set union_set
;
3022 re_node_set_init_empty (&union_set
);
3023 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3026 int cur_node
= cur_nodes
->elems
[cur_idx
];
3028 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3029 assert (!IS_EPSILON_NODE (type
));
3031 #ifdef RE_ENABLE_I18N
3032 /* If the node may accept `multi byte'. */
3033 if (dfa
->nodes
[cur_node
].accept_mb
)
3035 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3039 re_dfastate_t
*dest_state
;
3040 int next_node
= dfa
->nexts
[cur_node
];
3041 int next_idx
= str_idx
+ naccepted
;
3042 dest_state
= mctx
->state_log
[next_idx
];
3043 re_node_set_empty (&union_set
);
3046 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3047 if (BE (err
!= REG_NOERROR
, 0))
3049 re_node_set_free (&union_set
);
3053 result
= re_node_set_insert (&union_set
, next_node
);
3054 if (BE (result
< 0, 0))
3056 re_node_set_free (&union_set
);
3059 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3061 if (BE (mctx
->state_log
[next_idx
] == NULL
3062 && err
!= REG_NOERROR
, 0))
3064 re_node_set_free (&union_set
);
3069 #endif /* RE_ENABLE_I18N */
3071 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3073 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3074 if (BE (result
< 0, 0))
3076 re_node_set_free (&union_set
);
3081 re_node_set_free (&union_set
);
3085 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3086 CUR_NODES, however exclude the nodes which are:
3087 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3088 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3091 static reg_errcode_t
3092 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
3094 re_node_set
*cur_nodes
;
3095 int ex_subexp
, type
;
3098 int idx
, outside_node
;
3099 re_node_set new_nodes
;
3101 assert (cur_nodes
->nelem
);
3103 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3104 if (BE (err
!= REG_NOERROR
, 0))
3106 /* Create a new node set NEW_NODES with the nodes which are epsilon
3107 closures of the node in CUR_NODES. */
3109 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3111 int cur_node
= cur_nodes
->elems
[idx
];
3112 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3113 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3114 if (outside_node
== -1)
3116 /* There are no problematic nodes, just merge them. */
3117 err
= re_node_set_merge (&new_nodes
, eclosure
);
3118 if (BE (err
!= REG_NOERROR
, 0))
3120 re_node_set_free (&new_nodes
);
3126 /* There are problematic nodes, re-calculate incrementally. */
3127 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3129 if (BE (err
!= REG_NOERROR
, 0))
3131 re_node_set_free (&new_nodes
);
3136 re_node_set_free (cur_nodes
);
3137 *cur_nodes
= new_nodes
;
3141 /* Helper function for check_arrival_expand_ecl.
3142 Check incrementally the epsilon closure of TARGET, and if it isn't
3143 problematic append it to DST_NODES. */
3145 static reg_errcode_t
3146 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
3148 int target
, ex_subexp
, type
;
3149 re_node_set
*dst_nodes
;
3152 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3156 if (dfa
->nodes
[cur_node
].type
== type
3157 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3159 if (type
== OP_CLOSE_SUBEXP
)
3161 err
= re_node_set_insert (dst_nodes
, cur_node
);
3162 if (BE (err
== -1, 0))
3167 err
= re_node_set_insert (dst_nodes
, cur_node
);
3168 if (BE (err
== -1, 0))
3170 if (dfa
->edests
[cur_node
].nelem
== 0)
3172 if (dfa
->edests
[cur_node
].nelem
== 2)
3174 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3175 dfa
->edests
[cur_node
].elems
[1],
3177 if (BE (err
!= REG_NOERROR
, 0))
3180 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3186 /* For all the back references in the current state, calculate the
3187 destination of the back references by the appropriate entry
3188 in MCTX->BKREF_ENTS. */
3190 static reg_errcode_t
3191 expand_bkref_cache (mctx
, cur_nodes
, cur_str
, subexp_num
,
3193 re_match_context_t
*mctx
;
3194 int cur_str
, subexp_num
, type
;
3195 re_node_set
*cur_nodes
;
3197 re_dfa_t
*const dfa
= mctx
->dfa
;
3199 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3200 struct re_backref_cache_entry
*ent
;
3202 if (cache_idx_start
== -1)
3206 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3209 int to_idx
, next_node
;
3211 /* Is this entry ENT is appropriate? */
3212 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3215 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3216 /* Calculate the destination of the back reference, and append it
3217 to MCTX->STATE_LOG. */
3218 if (to_idx
== cur_str
)
3220 /* The backreference did epsilon transit, we must re-check all the
3221 node in the current state. */
3222 re_node_set new_dests
;
3223 reg_errcode_t err2
, err3
;
3224 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3225 if (re_node_set_contains (cur_nodes
, next_node
))
3227 err
= re_node_set_init_1 (&new_dests
, next_node
);
3228 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3229 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3230 re_node_set_free (&new_dests
);
3231 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3232 || err3
!= REG_NOERROR
, 0))
3234 err
= (err
!= REG_NOERROR
? err
3235 : (err2
!= REG_NOERROR
? err2
: err3
));
3238 /* TODO: It is still inefficient... */
3243 re_node_set union_set
;
3244 next_node
= dfa
->nexts
[ent
->node
];
3245 if (mctx
->state_log
[to_idx
])
3248 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3251 err
= re_node_set_init_copy (&union_set
,
3252 &mctx
->state_log
[to_idx
]->nodes
);
3253 ret
= re_node_set_insert (&union_set
, next_node
);
3254 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3256 re_node_set_free (&union_set
);
3257 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3263 err
= re_node_set_init_1 (&union_set
, next_node
);
3264 if (BE (err
!= REG_NOERROR
, 0))
3267 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3268 re_node_set_free (&union_set
);
3269 if (BE (mctx
->state_log
[to_idx
] == NULL
3270 && err
!= REG_NOERROR
, 0))
3274 while (ent
++->more
);
3278 /* Build transition table for the state.
3279 Return 1 if succeeded, otherwise return NULL. */
3282 build_trtable (dfa
, state
)
3284 re_dfastate_t
*state
;
3287 int i
, j
, ch
, need_word_trtable
= 0;
3288 unsigned int elem
, mask
;
3289 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3290 int ndests
; /* Number of the destination states from `state'. */
3291 re_dfastate_t
**trtable
;
3292 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3293 re_node_set follows
, *dests_node
;
3297 /* We build DFA states which corresponds to the destination nodes
3298 from `state'. `dests_node[i]' represents the nodes which i-th
3299 destination state contains, and `dests_ch[i]' represents the
3300 characters which i-th destination state accepts. */
3302 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3303 dests_node
= (re_node_set
*)
3304 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3308 dests_node
= (re_node_set
*)
3309 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3310 if (BE (dests_node
== NULL
, 0))
3312 dests_node_malloced
= 1;
3314 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3316 /* Initialize transiton table. */
3317 state
->word_trtable
= state
->trtable
= NULL
;
3319 /* At first, group all nodes belonging to `state' into several
3321 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3322 if (BE (ndests
<= 0, 0))
3324 if (dests_node_malloced
)
3326 /* Return 0 in case of an error, 1 otherwise. */
3329 state
->trtable
= (re_dfastate_t
**)
3330 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3336 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3337 if (BE (err
!= REG_NOERROR
, 0))
3341 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3342 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3343 dest_states
= (re_dfastate_t
**)
3344 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3348 dest_states
= (re_dfastate_t
**)
3349 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3350 if (BE (dest_states
== NULL
, 0))
3353 if (dest_states_malloced
)
3355 re_node_set_free (&follows
);
3356 for (i
= 0; i
< ndests
; ++i
)
3357 re_node_set_free (dests_node
+ i
);
3358 if (dests_node_malloced
)
3362 dest_states_malloced
= 1;
3364 dest_states_word
= dest_states
+ ndests
;
3365 dest_states_nl
= dest_states_word
+ ndests
;
3366 bitset_empty (acceptable
);
3368 /* Then build the states for all destinations. */
3369 for (i
= 0; i
< ndests
; ++i
)
3372 re_node_set_empty (&follows
);
3373 /* Merge the follows of this destination states. */
3374 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3376 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3377 if (next_node
!= -1)
3379 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3380 if (BE (err
!= REG_NOERROR
, 0))
3384 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3385 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3387 /* If the new state has context constraint,
3388 build appropriate states for these contexts. */
3389 if (dest_states
[i
]->has_constraint
)
3391 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3393 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3396 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3397 need_word_trtable
= 1;
3399 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3401 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3406 dest_states_word
[i
] = dest_states
[i
];
3407 dest_states_nl
[i
] = dest_states
[i
];
3409 bitset_merge (acceptable
, dests_ch
[i
]);
3412 if (!BE (need_word_trtable
, 0))
3414 /* We don't care about whether the following character is a word
3415 character, or we are in a single-byte character set so we can
3416 discern by looking at the character code: allocate a
3417 256-entry transition table. */
3418 trtable
= state
->trtable
=
3419 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3420 if (BE (trtable
== NULL
, 0))
3423 /* For all characters ch...: */
3424 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3425 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3427 mask
<<= 1, elem
>>= 1, ++ch
)
3428 if (BE (elem
& 1, 0))
3430 /* There must be exactly one destination which accepts
3431 character ch. See group_nodes_into_DFAstates. */
3432 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3435 /* j-th destination accepts the word character ch. */
3436 if (dfa
->word_char
[i
] & mask
)
3437 trtable
[ch
] = dest_states_word
[j
];
3439 trtable
[ch
] = dest_states
[j
];
3444 /* We care about whether the following character is a word
3445 character, and we are in a multi-byte character set: discern
3446 by looking at the character code: build two 256-entry
3447 transition tables, one starting at trtable[0] and one
3448 starting at trtable[SBC_MAX]. */
3449 trtable
= state
->word_trtable
=
3450 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3451 if (BE (trtable
== NULL
, 0))
3454 /* For all characters ch...: */
3455 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3456 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3458 mask
<<= 1, elem
>>= 1, ++ch
)
3459 if (BE (elem
& 1, 0))
3461 /* There must be exactly one destination which accepts
3462 character ch. See group_nodes_into_DFAstates. */
3463 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3466 /* j-th destination accepts the word character ch. */
3467 trtable
[ch
] = dest_states
[j
];
3468 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3473 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3475 /* The current state accepts newline character. */
3476 for (j
= 0; j
< ndests
; ++j
)
3477 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3479 /* k-th destination accepts newline character. */
3480 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3481 if (need_word_trtable
)
3482 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3483 /* There must be only one destination which accepts
3484 newline. See group_nodes_into_DFAstates. */
3489 if (dest_states_malloced
)
3492 re_node_set_free (&follows
);
3493 for (i
= 0; i
< ndests
; ++i
)
3494 re_node_set_free (dests_node
+ i
);
3496 if (dests_node_malloced
)
3502 /* Group all nodes belonging to STATE into several destinations.
3503 Then for all destinations, set the nodes belonging to the destination
3504 to DESTS_NODE[i] and set the characters accepted by the destination
3505 to DEST_CH[i]. This function return the number of destinations. */
3508 group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
)
3510 const re_dfastate_t
*state
;
3511 re_node_set
*dests_node
;
3517 int ndests
; /* Number of the destinations from `state'. */
3518 bitset accepts
; /* Characters a node can accept. */
3519 const re_node_set
*cur_nodes
= &state
->nodes
;
3520 bitset_empty (accepts
);
3523 /* For all the nodes belonging to `state', */
3524 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3526 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3527 re_token_type_t type
= node
->type
;
3528 unsigned int constraint
= node
->constraint
;
3530 /* Enumerate all single byte character this node can accept. */
3531 if (type
== CHARACTER
)
3532 bitset_set (accepts
, node
->opr
.c
);
3533 else if (type
== SIMPLE_BRACKET
)
3535 bitset_merge (accepts
, node
->opr
.sbcset
);
3537 else if (type
== OP_PERIOD
)
3539 #ifdef RE_ENABLE_I18N
3540 if (dfa
->mb_cur_max
> 1)
3541 bitset_merge (accepts
, dfa
->sb_char
);
3544 bitset_set_all (accepts
);
3545 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3546 bitset_clear (accepts
, '\n');
3547 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3548 bitset_clear (accepts
, '\0');
3550 #ifdef RE_ENABLE_I18N
3551 else if (type
== OP_UTF8_PERIOD
)
3553 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3554 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3555 bitset_clear (accepts
, '\n');
3556 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3557 bitset_clear (accepts
, '\0');
3563 /* Check the `accepts' and sift the characters which are not
3564 match it the context. */
3567 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3569 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3570 bitset_empty (accepts
);
3571 if (accepts_newline
)
3572 bitset_set (accepts
, NEWLINE_CHAR
);
3576 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3578 bitset_empty (accepts
);
3582 if (constraint
& NEXT_WORD_CONSTRAINT
)
3584 unsigned int any_set
= 0;
3585 if (type
== CHARACTER
&& !node
->word_char
)
3587 bitset_empty (accepts
);
3590 #ifdef RE_ENABLE_I18N
3591 if (dfa
->mb_cur_max
> 1)
3592 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3593 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3596 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3597 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3601 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3603 unsigned int any_set
= 0;
3604 if (type
== CHARACTER
&& node
->word_char
)
3606 bitset_empty (accepts
);
3609 #ifdef RE_ENABLE_I18N
3610 if (dfa
->mb_cur_max
> 1)
3611 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3612 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3615 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3616 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3622 /* Then divide `accepts' into DFA states, or create a new
3623 state. Above, we make sure that accepts is not empty. */
3624 for (j
= 0; j
< ndests
; ++j
)
3626 bitset intersec
; /* Intersection sets, see below. */
3628 /* Flags, see below. */
3629 int has_intersec
, not_subset
, not_consumed
;
3631 /* Optimization, skip if this state doesn't accept the character. */
3632 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3635 /* Enumerate the intersection set of this state and `accepts'. */
3637 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3638 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3639 /* And skip if the intersection set is empty. */
3643 /* Then check if this state is a subset of `accepts'. */
3644 not_subset
= not_consumed
= 0;
3645 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3647 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3648 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3651 /* If this state isn't a subset of `accepts', create a
3652 new group state, which has the `remains'. */
3655 bitset_copy (dests_ch
[ndests
], remains
);
3656 bitset_copy (dests_ch
[j
], intersec
);
3657 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3658 if (BE (err
!= REG_NOERROR
, 0))
3663 /* Put the position in the current group. */
3664 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3665 if (BE (result
< 0, 0))
3668 /* If all characters are consumed, go to next node. */
3672 /* Some characters remain, create a new group. */
3675 bitset_copy (dests_ch
[ndests
], accepts
);
3676 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3677 if (BE (err
!= REG_NOERROR
, 0))
3680 bitset_empty (accepts
);
3685 for (j
= 0; j
< ndests
; ++j
)
3686 re_node_set_free (dests_node
+ j
);
3690 #ifdef RE_ENABLE_I18N
3691 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3692 Return the number of the bytes the node accepts.
3693 STR_IDX is the current index of the input string.
3695 This function handles the nodes which can accept one character, or
3696 one collating element like '.', '[a-z]', opposite to the other nodes
3697 can only accept one byte. */
3700 check_node_accept_bytes (dfa
, node_idx
, input
, str_idx
)
3702 int node_idx
, str_idx
;
3703 const re_string_t
*input
;
3705 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3706 int char_len
, elem_len
;
3709 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3711 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3712 if (BE (c
< 0xc2, 1))
3715 if (str_idx
+ 2 > input
->len
)
3718 d
= re_string_byte_at (input
, str_idx
+ 1);
3720 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3724 if (c
== 0xe0 && d
< 0xa0)
3730 if (c
== 0xf0 && d
< 0x90)
3736 if (c
== 0xf8 && d
< 0x88)
3742 if (c
== 0xfc && d
< 0x84)
3748 if (str_idx
+ char_len
> input
->len
)
3751 for (i
= 1; i
< char_len
; ++i
)
3753 d
= re_string_byte_at (input
, str_idx
+ i
);
3754 if (d
< 0x80 || d
> 0xbf)
3760 char_len
= re_string_char_size_at (input
, str_idx
);
3761 if (node
->type
== OP_PERIOD
)
3765 /* FIXME: I don't think this if is needed, as both '\n'
3766 and '\0' are char_len == 1. */
3767 /* '.' accepts any one character except the following two cases. */
3768 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3769 re_string_byte_at (input
, str_idx
) == '\n') ||
3770 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3771 re_string_byte_at (input
, str_idx
) == '\0'))
3776 elem_len
= re_string_elem_size_at (input
, str_idx
);
3777 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3780 if (node
->type
== COMPLEX_BRACKET
)
3782 const re_charset_t
*cset
= node
->opr
.mbcset
;
3784 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3790 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3791 ? re_string_wchar_at (input
, str_idx
) : 0);
3793 /* match with multibyte character? */
3794 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3795 if (wc
== cset
->mbchars
[i
])
3797 match_len
= char_len
;
3798 goto check_node_accept_bytes_match
;
3800 /* match with character_class? */
3801 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3803 wctype_t wt
= cset
->char_classes
[i
];
3804 if (__iswctype (wc
, wt
))
3806 match_len
= char_len
;
3807 goto check_node_accept_bytes_match
;
3812 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3815 unsigned int in_collseq
= 0;
3816 const int32_t *table
, *indirect
;
3817 const unsigned char *weights
, *extra
;
3818 const char *collseqwc
;
3820 /* This #include defines a local function! */
3821 # include <locale/weight.h>
3823 /* match with collating_symbol? */
3824 if (cset
->ncoll_syms
)
3825 extra
= (const unsigned char *)
3826 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3827 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3829 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3830 /* Compare the length of input collating element and
3831 the length of current collating element. */
3832 if (*coll_sym
!= elem_len
)
3834 /* Compare each bytes. */
3835 for (j
= 0; j
< *coll_sym
; j
++)
3836 if (pin
[j
] != coll_sym
[1 + j
])
3840 /* Match if every bytes is equal. */
3842 goto check_node_accept_bytes_match
;
3848 if (elem_len
<= char_len
)
3850 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3851 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3854 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3856 /* match with range expression? */
3857 for (i
= 0; i
< cset
->nranges
; ++i
)
3858 if (cset
->range_starts
[i
] <= in_collseq
3859 && in_collseq
<= cset
->range_ends
[i
])
3861 match_len
= elem_len
;
3862 goto check_node_accept_bytes_match
;
3865 /* match with equivalence_class? */
3866 if (cset
->nequiv_classes
)
3868 const unsigned char *cp
= pin
;
3869 table
= (const int32_t *)
3870 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3871 weights
= (const unsigned char *)
3872 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3873 extra
= (const unsigned char *)
3874 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3875 indirect
= (const int32_t *)
3876 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3877 idx
= findidx (&cp
);
3879 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3881 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3882 size_t weight_len
= weights
[idx
];
3883 if (weight_len
== weights
[equiv_class_idx
])
3886 while (cnt
<= weight_len
3887 && (weights
[equiv_class_idx
+ 1 + cnt
]
3888 == weights
[idx
+ 1 + cnt
]))
3890 if (cnt
> weight_len
)
3892 match_len
= elem_len
;
3893 goto check_node_accept_bytes_match
;
3902 /* match with range expression? */
3904 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3906 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3909 for (i
= 0; i
< cset
->nranges
; ++i
)
3911 cmp_buf
[0] = cset
->range_starts
[i
];
3912 cmp_buf
[4] = cset
->range_ends
[i
];
3913 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3914 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3916 match_len
= char_len
;
3917 goto check_node_accept_bytes_match
;
3921 check_node_accept_bytes_match
:
3922 if (!cset
->non_match
)
3929 return (elem_len
> char_len
) ? elem_len
: char_len
;
3937 find_collation_sequence_value (mbs
, mbs_len
)
3938 const unsigned char *mbs
;
3941 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3946 /* No valid character. Match it as a single byte character. */
3947 const unsigned char *collseq
= (const unsigned char *)
3948 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3949 return collseq
[mbs
[0]];
3956 const unsigned char *extra
= (const unsigned char *)
3957 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3958 int32_t extrasize
= (const unsigned char *)
3959 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3961 for (idx
= 0; idx
< extrasize
;)
3963 int mbs_cnt
, found
= 0;
3964 int32_t elem_mbs_len
;
3965 /* Skip the name of collating element name. */
3966 idx
= idx
+ extra
[idx
] + 1;
3967 elem_mbs_len
= extra
[idx
++];
3968 if (mbs_len
== elem_mbs_len
)
3970 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3971 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3973 if (mbs_cnt
== elem_mbs_len
)
3974 /* Found the entry. */
3977 /* Skip the byte sequence of the collating element. */
3978 idx
+= elem_mbs_len
;
3979 /* Adjust for the alignment. */
3980 idx
= (idx
+ 3) & ~3;
3981 /* Skip the collation sequence value. */
3982 idx
+= sizeof (uint32_t);
3983 /* Skip the wide char sequence of the collating element. */
3984 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3985 /* If we found the entry, return the sequence value. */
3987 return *(uint32_t *) (extra
+ idx
);
3988 /* Skip the collation sequence value. */
3989 idx
+= sizeof (uint32_t);
3995 #endif /* RE_ENABLE_I18N */
3997 /* Check whether the node accepts the byte which is IDX-th
3998 byte of the INPUT. */
4001 check_node_accept (mctx
, node
, idx
)
4002 const re_match_context_t
*mctx
;
4003 const re_token_t
*node
;
4007 ch
= re_string_byte_at (&mctx
->input
, idx
);
4011 if (node
->opr
.c
!= ch
)
4015 case SIMPLE_BRACKET
:
4016 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4020 #ifdef RE_ENABLE_I18N
4021 case OP_UTF8_PERIOD
:
4027 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4028 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4036 if (node
->constraint
)
4038 /* The node has constraints. Check whether the current context
4039 satisfies the constraints. */
4040 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4042 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4049 /* Extend the buffers, if the buffers have run out. */
4051 static reg_errcode_t
4052 extend_buffers (mctx
)
4053 re_match_context_t
*mctx
;
4056 re_string_t
*pstr
= &mctx
->input
;
4058 /* Double the lengthes of the buffers. */
4059 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4060 if (BE (ret
!= REG_NOERROR
, 0))
4063 if (mctx
->state_log
!= NULL
)
4065 /* And double the length of state_log. */
4066 /* XXX We have no indication of the size of this buffer. If this
4067 allocation fail we have no indication that the state_log array
4068 does not have the right size. */
4069 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4070 pstr
->bufs_len
+ 1);
4071 if (BE (new_array
== NULL
, 0))
4073 mctx
->state_log
= new_array
;
4076 /* Then reconstruct the buffers. */
4079 #ifdef RE_ENABLE_I18N
4080 if (pstr
->mb_cur_max
> 1)
4082 ret
= build_wcs_upper_buffer (pstr
);
4083 if (BE (ret
!= REG_NOERROR
, 0))
4087 #endif /* RE_ENABLE_I18N */
4088 build_upper_buffer (pstr
);
4092 #ifdef RE_ENABLE_I18N
4093 if (pstr
->mb_cur_max
> 1)
4094 build_wcs_buffer (pstr
);
4096 #endif /* RE_ENABLE_I18N */
4098 if (pstr
->trans
!= NULL
)
4099 re_string_translate_buffer (pstr
);
4106 /* Functions for matching context. */
4108 /* Initialize MCTX. */
4110 static reg_errcode_t
4111 match_ctx_init (mctx
, eflags
, n
)
4112 re_match_context_t
*mctx
;
4115 mctx
->eflags
= eflags
;
4116 mctx
->match_last
= -1;
4119 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4120 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4121 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4124 /* Already zero-ed by the caller.
4126 mctx->bkref_ents = NULL;
4127 mctx->nbkref_ents = 0;
4128 mctx->nsub_tops = 0; */
4129 mctx
->abkref_ents
= n
;
4130 mctx
->max_mb_elem_len
= 1;
4131 mctx
->asub_tops
= n
;
4135 /* Clean the entries which depend on the current input in MCTX.
4136 This function must be invoked when the matcher changes the start index
4137 of the input, or changes the input string. */
4140 match_ctx_clean (mctx
)
4141 re_match_context_t
*mctx
;
4144 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4147 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4148 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4150 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4151 re_free (last
->path
.array
);
4154 re_free (top
->lasts
);
4157 re_free (top
->path
->array
);
4158 re_free (top
->path
);
4163 mctx
->nsub_tops
= 0;
4164 mctx
->nbkref_ents
= 0;
4167 /* Free all the memory associated with MCTX. */
4170 match_ctx_free (mctx
)
4171 re_match_context_t
*mctx
;
4173 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4174 match_ctx_clean (mctx
);
4175 re_free (mctx
->sub_tops
);
4176 re_free (mctx
->bkref_ents
);
4179 /* Add a new backreference entry to MCTX.
4180 Note that we assume that caller never call this function with duplicate
4181 entry, and call with STR_IDX which isn't smaller than any existing entry.
4184 static reg_errcode_t
4185 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4186 re_match_context_t
*mctx
;
4187 int node
, str_idx
, from
, to
;
4189 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4191 struct re_backref_cache_entry
* new_entry
;
4192 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4193 mctx
->abkref_ents
* 2);
4194 if (BE (new_entry
== NULL
, 0))
4196 re_free (mctx
->bkref_ents
);
4199 mctx
->bkref_ents
= new_entry
;
4200 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4201 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4202 mctx
->abkref_ents
*= 2;
4204 if (mctx
->nbkref_ents
> 0
4205 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4206 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4208 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4209 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4210 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4211 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4213 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4214 If bit N is clear, means that this entry won't epsilon-transition to
4215 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4216 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4219 A backreference does not epsilon-transition unless it is empty, so set
4220 to all zeros if FROM != TO. */
4221 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4222 = (from
== to
? ~0 : 0);
4224 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4225 if (mctx
->max_mb_elem_len
< to
- from
)
4226 mctx
->max_mb_elem_len
= to
- from
;
4230 /* Search for the first entry which has the same str_idx, or -1 if none is
4231 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4234 search_cur_bkref_entry (mctx
, str_idx
)
4235 re_match_context_t
*mctx
;
4238 int left
, right
, mid
, last
;
4239 last
= right
= mctx
->nbkref_ents
;
4240 for (left
= 0; left
< right
;)
4242 mid
= (left
+ right
) / 2;
4243 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4248 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4254 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257 static reg_errcode_t
4258 match_ctx_add_subtop (mctx
, node
, str_idx
)
4259 re_match_context_t
*mctx
;
4263 assert (mctx
->sub_tops
!= NULL
);
4264 assert (mctx
->asub_tops
> 0);
4266 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4268 int new_asub_tops
= mctx
->asub_tops
* 2;
4269 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4270 re_sub_match_top_t
*,
4272 if (BE (new_array
== NULL
, 0))
4274 mctx
->sub_tops
= new_array
;
4275 mctx
->asub_tops
= new_asub_tops
;
4277 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4278 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4280 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4281 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4285 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4286 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4288 static re_sub_match_last_t
*
4289 match_ctx_add_sublast (subtop
, node
, str_idx
)
4290 re_sub_match_top_t
*subtop
;
4293 re_sub_match_last_t
*new_entry
;
4294 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4296 int new_alasts
= 2 * subtop
->alasts
+ 1;
4297 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4298 re_sub_match_last_t
*,
4300 if (BE (new_array
== NULL
, 0))
4302 subtop
->lasts
= new_array
;
4303 subtop
->alasts
= new_alasts
;
4305 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4306 if (BE (new_entry
!= NULL
, 1))
4308 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4309 new_entry
->node
= node
;
4310 new_entry
->str_idx
= str_idx
;
4317 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
)
4318 re_sift_context_t
*sctx
;
4319 re_dfastate_t
**sifted_sts
, **limited_sts
;
4320 int last_node
, last_str_idx
;
4322 sctx
->sifted_states
= sifted_sts
;
4323 sctx
->limited_states
= limited_sts
;
4324 sctx
->last_node
= last_node
;
4325 sctx
->last_str_idx
= last_str_idx
;
4326 re_node_set_init_empty (&sctx
->limits
);