1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static inline re_dfastate_t
*acquire_init_state_context
56 (reg_errcode_t
*err
, const re_match_context_t
*mctx
, int idx
)
57 __attribute ((always_inline
)) internal_function
;
58 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
60 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
63 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
64 unsigned int context
) internal_function
;
65 static int check_halt_state_context (const re_match_context_t
*mctx
,
66 const re_dfastate_t
*state
, int idx
)
68 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
,
69 regmatch_t
*prev_idx_match
, int cur_node
,
70 int cur_idx
, int nmatch
) internal_function
;
71 static int proceed_next_node (const re_match_context_t
*mctx
,
72 int nregs
, regmatch_t
*regs
,
73 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
74 struct re_fail_stack_t
*fs
) internal_function
;
75 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
76 int str_idx
, int *dests
, int nregs
,
78 re_node_set
*eps_via_nodes
) internal_function
;
79 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
80 regmatch_t
*regs
, re_node_set
*eps_via_nodes
) internal_function
;
81 static reg_errcode_t
set_regs (const regex_t
*preg
,
82 const re_match_context_t
*mctx
,
83 size_t nmatch
, regmatch_t
*pmatch
,
84 int fl_backtrack
) internal_function
;
85 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
) internal_function
;
88 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
89 re_sift_context_t
*sctx
,
90 int node_idx
, int str_idx
, int max_str_idx
) internal_function
;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t
sift_states_backward (re_match_context_t
*mctx
,
93 re_sift_context_t
*sctx
) internal_function
;
94 static reg_errcode_t
build_sifted_states (re_match_context_t
*mctx
,
95 re_sift_context_t
*sctx
, int str_idx
,
96 re_node_set
*cur_dest
) internal_function
;
97 static reg_errcode_t
update_cur_sifted_state (re_match_context_t
*mctx
,
98 re_sift_context_t
*sctx
,
100 re_node_set
*dest_nodes
) internal_function
;
101 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
102 re_node_set
*dest_nodes
,
103 const re_node_set
*candidates
) internal_function
;
104 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
105 re_node_set
*dest_nodes
,
106 const re_node_set
*and_nodes
) internal_function
;
107 static int check_dst_limits (re_match_context_t
*mctx
, re_node_set
*limits
,
108 int dst_node
, int dst_idx
, int src_node
,
109 int src_idx
) internal_function
;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t
*mctx
,
111 int boundaries
, int subexp_idx
,
112 int from_node
, int bkref_idx
) internal_function
;
113 static int check_dst_limits_calc_pos (re_match_context_t
*mctx
,
114 int limit
, int subexp_idx
,
115 int node
, int str_idx
,
116 int bkref_idx
) internal_function
;
117 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
118 re_node_set
*dest_nodes
,
119 const re_node_set
*candidates
,
121 struct re_backref_cache_entry
*bkref_ents
,
122 int str_idx
) internal_function
;
123 static reg_errcode_t
sift_states_bkref (re_match_context_t
*mctx
,
124 re_sift_context_t
*sctx
,
125 int str_idx
, const re_node_set
*candidates
) internal_function
;
126 static reg_errcode_t
clean_state_log_if_needed (re_match_context_t
*mctx
,
127 int next_state_log_idx
) internal_function
;
128 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
129 re_dfastate_t
**src
, int num
) internal_function
;
130 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
) internal_function
;
132 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
133 re_match_context_t
*mctx
,
134 re_dfastate_t
*state
) internal_function
;
135 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
136 re_match_context_t
*mctx
,
137 re_dfastate_t
*next_state
) internal_function
;
138 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
139 re_node_set
*cur_nodes
,
140 int str_idx
) internal_function
;
142 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
143 re_match_context_t
*mctx
,
144 re_dfastate_t
*pstate
) internal_function
;
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
148 re_dfastate_t
*pstate
) internal_function
;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
151 const re_node_set
*nodes
) internal_function
;
152 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
153 int bkref_node
, int bkref_str_idx
) internal_function
;
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
) internal_function
;
158 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
159 int subexp_idx
, int type
) internal_function
;
160 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
161 state_array_t
*path
, int top_node
,
162 int top_str
, int last_node
, int last_str
,
163 int type
) internal_function
;
164 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
166 re_node_set
*cur_nodes
,
167 re_node_set
*next_nodes
) internal_function
;
168 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
169 re_node_set
*cur_nodes
,
170 int ex_subexp
, int type
) internal_function
;
171 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
172 re_node_set
*dst_nodes
,
173 int target
, int ex_subexp
,
174 int type
) internal_function
;
175 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
176 re_node_set
*cur_nodes
, int cur_str
,
177 int subexp_num
, int type
) internal_function
;
178 static re_dfastate_t
**build_trtable (re_dfa_t
*dfa
,
179 re_dfastate_t
*state
) internal_function
;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t
*dfa
, int node_idx
,
182 const re_string_t
*input
, int idx
) internal_function
;
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
185 size_t name_len
) internal_function
;
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t
*dfa
,
189 const re_dfastate_t
*state
,
190 re_node_set
*states_node
,
191 bitset
*states_ch
) internal_function
;
192 static int check_node_accept (const re_match_context_t
*mctx
,
193 const re_token_t
*node
, int idx
) internal_function
;
194 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
) internal_function
;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
214 const regex_t
*__restrict preg
;
215 const char *__restrict string
;
223 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
226 if (eflags
& REG_STARTEND
)
228 start
= pmatch
[0].rm_so
;
229 length
= pmatch
[0].rm_eo
;
234 length
= strlen (string
);
237 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
238 length
, 0, NULL
, eflags
);
240 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
241 length
, nmatch
, pmatch
, eflags
);
242 return err
!= REG_NOERROR
;
246 # include <shlib-compat.h>
247 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
250 __typeof__ (__regexec
) __compat_regexec
;
253 attribute_compat_text_section
254 __compat_regexec (const regex_t
*__restrict preg
,
255 const char *__restrict string
, size_t nmatch
,
256 regmatch_t pmatch
[], int eflags
)
258 return regexec (preg
, string
, nmatch
, pmatch
,
259 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
261 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
265 /* Entry points for GNU code. */
267 /* re_match, re_search, re_match_2, re_search_2
269 The former two functions operate on STRING with length LENGTH,
270 while the later two operate on concatenation of STRING1 and STRING2
271 with lengths LENGTH1 and LENGTH2, respectively.
273 re_match() matches the compiled pattern in BUFP against the string,
274 starting at index START.
276 re_search() first tries matching at index START, then it tries to match
277 starting from index START + 1, and so on. The last start position tried
278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
282 the first STOP characters of the concatenation of the strings should be
285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
287 computed relative to the concatenation, not relative to the individual
290 On success, re_match* functions return the length of the match, re_search*
291 return the position of the start of the match. Return value -1 means no
292 match was found and -2 indicates an internal error. */
295 re_match (bufp
, string
, length
, start
, regs
)
296 struct re_pattern_buffer
*bufp
;
299 struct re_registers
*regs
;
301 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
304 weak_alias (__re_match
, re_match
)
308 re_search (bufp
, string
, length
, start
, range
, regs
)
309 struct re_pattern_buffer
*bufp
;
311 int length
, start
, range
;
312 struct re_registers
*regs
;
314 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
317 weak_alias (__re_search
, re_search
)
321 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
322 struct re_pattern_buffer
*bufp
;
323 const char *string1
, *string2
;
324 int length1
, length2
, start
, stop
;
325 struct re_registers
*regs
;
327 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
328 start
, 0, regs
, stop
, 1);
331 weak_alias (__re_match_2
, re_match_2
)
335 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
336 struct re_pattern_buffer
*bufp
;
337 const char *string1
, *string2
;
338 int length1
, length2
, start
, range
, stop
;
339 struct re_registers
*regs
;
341 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
342 start
, range
, regs
, stop
, 0);
345 weak_alias (__re_search_2
, re_search_2
)
349 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
351 struct re_pattern_buffer
*bufp
;
352 const char *string1
, *string2
;
353 int length1
, length2
, start
, range
, stop
, ret_len
;
354 struct re_registers
*regs
;
358 int len
= length1
+ length2
;
361 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
364 /* Concatenate the strings. */
368 char *s
= re_malloc (char, len
);
370 if (BE (s
== NULL
, 0))
372 memcpy (s
, string1
, length1
);
373 memcpy (s
+ length1
, string2
, length2
);
382 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
385 re_free ((char *) str
);
389 /* The parameters have the same meaning as those of re_search.
390 Additional parameters:
391 If RET_LEN is nonzero the length of the match is returned (re_match style);
392 otherwise the position of the match is returned. */
395 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
396 struct re_pattern_buffer
*bufp
;
398 int length
, start
, range
, stop
, ret_len
;
399 struct re_registers
*regs
;
401 reg_errcode_t result
;
406 /* Check for out-of-range. */
407 if (BE (start
< 0 || start
> length
, 0))
409 if (BE (start
+ range
> length
, 0))
410 range
= length
- start
;
411 else if (BE (start
+ range
< 0, 0))
414 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
415 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
417 /* Compile fastmap if we haven't yet. */
418 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
419 re_compile_fastmap (bufp
);
421 if (BE (bufp
->no_sub
, 0))
424 /* We need at least 1 register. */
427 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
428 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
430 nregs
= regs
->num_regs
;
431 if (BE (nregs
< 1, 0))
433 /* Nothing can be copied to regs. */
439 nregs
= bufp
->re_nsub
+ 1;
440 pmatch
= re_malloc (regmatch_t
, nregs
);
441 if (BE (pmatch
== NULL
, 0))
444 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
445 nregs
, pmatch
, eflags
);
449 /* I hope we needn't fill ther regs with -1's when no match was found. */
450 if (result
!= REG_NOERROR
)
452 else if (regs
!= NULL
)
454 /* If caller wants register contents data back, copy them. */
455 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
456 bufp
->regs_allocated
);
457 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
461 if (BE (rval
== 0, 1))
465 assert (pmatch
[0].rm_so
== start
);
466 rval
= pmatch
[0].rm_eo
- start
;
469 rval
= pmatch
[0].rm_so
;
476 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
477 struct re_registers
*regs
;
479 int nregs
, regs_allocated
;
481 int rval
= REGS_REALLOCATE
;
483 int need_regs
= nregs
+ 1;
484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
487 /* Have the register data arrays been allocated? */
488 if (regs_allocated
== REGS_UNALLOCATED
)
489 { /* No. So allocate them with malloc. */
490 regs
->start
= re_malloc (regoff_t
, need_regs
);
491 regs
->end
= re_malloc (regoff_t
, need_regs
);
492 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
493 return REGS_UNALLOCATED
;
494 regs
->num_regs
= need_regs
;
496 else if (regs_allocated
== REGS_REALLOCATE
)
497 { /* Yes. If we need more elements than were already
498 allocated, reallocate them. If we need fewer, just
500 if (BE (need_regs
> regs
->num_regs
, 0))
502 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
503 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
504 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
505 return REGS_UNALLOCATED
;
506 regs
->start
= new_start
;
508 regs
->num_regs
= need_regs
;
513 assert (regs_allocated
== REGS_FIXED
);
514 /* This function may not be called with REGS_FIXED and nregs too big. */
515 assert (regs
->num_regs
>= nregs
);
520 for (i
= 0; i
< nregs
; ++i
)
522 regs
->start
[i
] = pmatch
[i
].rm_so
;
523 regs
->end
[i
] = pmatch
[i
].rm_eo
;
525 for ( ; i
< regs
->num_regs
; ++i
)
526 regs
->start
[i
] = regs
->end
[i
] = -1;
531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
533 this memory for recording register information. STARTS and ENDS
534 must be allocated using the malloc library routine, and must each
535 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537 If NUM_REGS == 0, then subsequent matches should allocate their own
540 Unless this function is called, the first search or match using
541 PATTERN_BUFFER will allocate its own register data, without
542 freeing the old data. */
545 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
546 struct re_pattern_buffer
*bufp
;
547 struct re_registers
*regs
;
549 regoff_t
*starts
, *ends
;
553 bufp
->regs_allocated
= REGS_REALLOCATE
;
554 regs
->num_regs
= num_regs
;
555 regs
->start
= starts
;
560 bufp
->regs_allocated
= REGS_UNALLOCATED
;
562 regs
->start
= regs
->end
= (regoff_t
*) 0;
566 weak_alias (__re_set_registers
, re_set_registers
)
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
580 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
582 #endif /* _REGEX_RE_COMP */
584 /* Internal entry point. */
586 /* Searches for a compiled pattern PREG in the string STRING, whose
587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
588 mingings with regexec. START, and RANGE have the same meanings
590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
591 otherwise return the error code.
592 Note: We assume front end functions already check ranges.
593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
596 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
600 int length
, start
, range
, stop
, eflags
;
605 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
606 int left_lim
, right_lim
, incr
;
607 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
610 re_match_context_t mctx
= { .dfa
= dfa
};
612 re_match_context_t mctx
;
614 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
615 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
616 unsigned RE_TRANSLATE_TYPE t
= (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
618 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
619 memset (&mctx
, '\0', sizeof (re_match_context_t
));
623 /* Check if the DFA haven't been compiled. */
624 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
625 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
626 || dfa
->init_state_begbuf
== NULL
, 0))
630 /* We assume front-end functions already check them. */
631 assert (start
+ range
>= 0 && start
+ range
<= length
);
634 /* If initial states with non-begbuf contexts have no elements,
635 the regex must be anchored. If preg->newline_anchor is set,
636 we'll never use init_state_nl, so do not check it. */
637 if (dfa
->init_state
->nodes
.nelem
== 0
638 && dfa
->init_state_word
->nodes
.nelem
== 0
639 && (dfa
->init_state_nl
->nodes
.nelem
== 0
640 || !preg
->newline_anchor
))
642 if (start
!= 0 && start
+ range
!= 0)
647 /* We must check the longest matching, if nmatch > 0. */
648 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
650 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
651 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
652 if (BE (err
!= REG_NOERROR
, 0))
654 mctx
.input
.stop
= stop
;
655 mctx
.input
.raw_stop
= stop
;
656 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
658 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
659 if (BE (err
!= REG_NOERROR
, 0))
662 /* We will log all the DFA states through which the dfa pass,
663 if nmatch > 1, or this dfa has "multibyte node", which is a
664 back-reference or a node which can accept multibyte character or
665 multi character collating element. */
666 if (nmatch
> 1 || dfa
->has_mb_node
)
668 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
669 if (BE (mctx
.state_log
== NULL
, 0))
676 mctx
.state_log
= NULL
;
679 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
680 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
682 /* Check incrementally whether of not the input string match. */
683 incr
= (range
< 0) ? -1 : 1;
684 left_lim
= (range
< 0) ? start
+ range
: start
;
685 right_lim
= (range
< 0) ? start
: start
+ range
;
686 sb
= dfa
->mb_cur_max
== 1;
689 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
690 | (range
>= 0 ? 2 : 0)
691 | (t
!= NULL
? 1 : 0))
694 for (;; match_first
+= incr
)
697 if (match_first
< left_lim
|| right_lim
< match_first
)
700 /* Advance as rapidly as possible through the string, until we
701 find a plausible place to start matching. This may be done
702 with varying efficiency, so there are various possibilities:
703 only the most common of them are specialized, in order to
704 save on code size. We use a switch statement for speed. */
712 /* Fastmap with single-byte translation, match forward. */
713 while (BE (match_first
< right_lim
, 1)
714 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
716 goto forward_match_found_start_or_reached_end
;
719 /* Fastmap without translation, match forward. */
720 while (BE (match_first
< right_lim
, 1)
721 && !fastmap
[(unsigned char) string
[match_first
]])
724 forward_match_found_start_or_reached_end
:
725 if (BE (match_first
== right_lim
, 0))
727 ch
= match_first
>= length
728 ? 0 : (unsigned char) string
[match_first
];
729 if (!fastmap
[t
? t
[ch
] : ch
])
736 /* Fastmap without multi-byte translation, match backwards. */
737 while (match_first
>= left_lim
)
739 ch
= match_first
>= length
740 ? 0 : (unsigned char) string
[match_first
];
741 if (fastmap
[t
? t
[ch
] : ch
])
745 if (match_first
< left_lim
)
750 /* In this case, we can't determine easily the current byte,
751 since it might be a component byte of a multibyte
752 character. Then we use the constructed buffer instead. */
755 /* If MATCH_FIRST is out of the valid range, reconstruct the
757 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
758 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
760 err
= re_string_reconstruct (&mctx
.input
, match_first
,
762 if (BE (err
!= REG_NOERROR
, 0))
765 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
767 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
768 Note that MATCH_FIRST must not be smaller than 0. */
769 ch
= (match_first
>= length
770 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
774 if (match_first
< left_lim
|| match_first
> right_lim
)
783 /* Reconstruct the buffers so that the matcher can assume that
784 the matching starts from the beginning of the buffer. */
785 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
786 if (BE (err
!= REG_NOERROR
, 0))
789 #ifdef RE_ENABLE_I18N
790 /* Don't consider this char as a possible match start if it part,
791 yet isn't the head, of a multibyte character. */
792 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
796 /* It seems to be appropriate one, then use the matcher. */
797 /* We assume that the matching starts from 0. */
798 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
799 match_last
= check_matching (&mctx
, fl_longest_match
,
800 range
>= 0 ? &match_first
: NULL
);
801 if (match_last
!= -1)
803 if (BE (match_last
== -2, 0))
810 mctx
.match_last
= match_last
;
811 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
813 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
814 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
817 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
820 err
= prune_impossible_nodes (&mctx
);
821 if (err
== REG_NOERROR
)
823 if (BE (err
!= REG_NOMATCH
, 0))
828 break; /* We found a match. */
832 match_ctx_clean (&mctx
);
836 assert (match_last
!= -1);
837 assert (err
== REG_NOERROR
);
840 /* Set pmatch[] if we need. */
845 /* Initialize registers. */
846 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
847 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
849 /* Set the points where matching start/end. */
851 pmatch
[0].rm_eo
= mctx
.match_last
;
853 if (!preg
->no_sub
&& nmatch
> 1)
855 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
856 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
857 if (BE (err
!= REG_NOERROR
, 0))
861 /* At last, add the offset to the each registers, since we slided
862 the buffers so that we could assume that the matching starts
864 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
865 if (pmatch
[reg_idx
].rm_so
!= -1)
867 #ifdef RE_ENABLE_I18N
868 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
870 if (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
)
871 pmatch
[reg_idx
].rm_so
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
873 pmatch
[reg_idx
].rm_so
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
];
874 if (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
)
875 pmatch
[reg_idx
].rm_eo
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
877 pmatch
[reg_idx
].rm_eo
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
];
880 assert (mctx
.input
.offsets_needed
== 0);
882 pmatch
[reg_idx
].rm_so
+= match_first
;
883 pmatch
[reg_idx
].rm_eo
+= match_first
;
888 reg_idx
+ 1 < nmatch
&& reg_idx
< preg
->re_nsub
;
890 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
892 pmatch
[reg_idx
+ 1].rm_so
893 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
894 pmatch
[reg_idx
+ 1].rm_eo
895 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
900 re_free (mctx
.state_log
);
902 match_ctx_free (&mctx
);
903 re_string_destruct (&mctx
.input
);
908 prune_impossible_nodes (mctx
)
909 re_match_context_t
*mctx
;
911 re_dfa_t
*const dfa
= mctx
->dfa
;
912 int halt_node
, match_last
;
914 re_dfastate_t
**sifted_states
;
915 re_dfastate_t
**lim_states
= NULL
;
916 re_sift_context_t sctx
;
918 assert (mctx
->state_log
!= NULL
);
920 match_last
= mctx
->match_last
;
921 halt_node
= mctx
->last_node
;
922 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
923 if (BE (sifted_states
== NULL
, 0))
930 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
931 if (BE (lim_states
== NULL
, 0))
938 memset (lim_states
, '\0',
939 sizeof (re_dfastate_t
*) * (match_last
+ 1));
940 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
942 ret
= sift_states_backward (mctx
, &sctx
);
943 re_node_set_free (&sctx
.limits
);
944 if (BE (ret
!= REG_NOERROR
, 0))
946 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
956 } while (mctx
->state_log
[match_last
] == NULL
957 || !mctx
->state_log
[match_last
]->halt
);
958 halt_node
= check_halt_state_context (mctx
,
959 mctx
->state_log
[match_last
],
962 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
964 re_free (lim_states
);
966 if (BE (ret
!= REG_NOERROR
, 0))
971 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
972 ret
= sift_states_backward (mctx
, &sctx
);
973 re_node_set_free (&sctx
.limits
);
974 if (BE (ret
!= REG_NOERROR
, 0))
977 re_free (mctx
->state_log
);
978 mctx
->state_log
= sifted_states
;
979 sifted_states
= NULL
;
980 mctx
->last_node
= halt_node
;
981 mctx
->match_last
= match_last
;
984 re_free (sifted_states
);
985 re_free (lim_states
);
989 /* Acquire an initial state and return it.
990 We must select appropriate initial state depending on the context,
991 since initial states may have constraints like "\<", "^", etc.. */
993 static inline re_dfastate_t
*
994 acquire_init_state_context (err
, mctx
, idx
)
996 const re_match_context_t
*mctx
;
999 re_dfa_t
*const dfa
= mctx
->dfa
;
1000 if (dfa
->init_state
->has_constraint
)
1002 unsigned int context
;
1003 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1004 if (IS_WORD_CONTEXT (context
))
1005 return dfa
->init_state_word
;
1006 else if (IS_ORDINARY_CONTEXT (context
))
1007 return dfa
->init_state
;
1008 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1009 return dfa
->init_state_begbuf
;
1010 else if (IS_NEWLINE_CONTEXT (context
))
1011 return dfa
->init_state_nl
;
1012 else if (IS_BEGBUF_CONTEXT (context
))
1014 /* It is relatively rare case, then calculate on demand. */
1015 return re_acquire_state_context (err
, dfa
,
1016 dfa
->init_state
->entrance_nodes
,
1020 /* Must not happen? */
1021 return dfa
->init_state
;
1024 return dfa
->init_state
;
1027 /* Check whether the regular expression match input string INPUT or not,
1028 and return the index where the matching end, return -1 if not match,
1029 or return -2 in case of an error.
1030 FL_LONGEST_MATCH means we want the POSIX longest matching.
1031 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1032 next place where we may want to try matching.
1033 Note that the matcher assume that the maching starts from the current
1034 index of the buffer. */
1037 check_matching (mctx
, fl_longest_match
, p_match_first
)
1038 re_match_context_t
*mctx
;
1039 int fl_longest_match
;
1042 re_dfa_t
*const dfa
= mctx
->dfa
;
1045 int match_last
= -1;
1046 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1047 re_dfastate_t
*cur_state
;
1048 int at_init_state
= p_match_first
!= NULL
;
1049 int next_start_idx
= cur_str_idx
;
1052 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1053 /* An initial state must not be NULL (invalid). */
1054 if (BE (cur_state
== NULL
, 0))
1056 assert (err
== REG_ESPACE
);
1060 if (mctx
->state_log
!= NULL
)
1062 mctx
->state_log
[cur_str_idx
] = cur_state
;
1064 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1065 later. E.g. Processing back references. */
1066 if (BE (dfa
->nbackref
, 0))
1069 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1070 if (BE (err
!= REG_NOERROR
, 0))
1073 if (cur_state
->has_backref
)
1075 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1076 if (BE (err
!= REG_NOERROR
, 0))
1082 /* If the RE accepts NULL string. */
1083 if (BE (cur_state
->halt
, 0))
1085 if (!cur_state
->has_constraint
1086 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1088 if (!fl_longest_match
)
1092 match_last
= cur_str_idx
;
1098 while (!re_string_eoi (&mctx
->input
))
1100 re_dfastate_t
*old_state
= cur_state
;
1101 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1103 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1104 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1105 && mctx
->input
.valid_len
< mctx
->input
.len
))
1107 err
= extend_buffers (mctx
);
1108 if (BE (err
!= REG_NOERROR
, 0))
1110 assert (err
== REG_ESPACE
);
1115 cur_state
= transit_state (&err
, mctx
, cur_state
);
1116 if (mctx
->state_log
!= NULL
)
1117 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1119 if (cur_state
== NULL
)
1121 /* Reached the invalid state or an error. Try to recover a valid
1122 state using the state log, if available and if we have not
1123 already found a valid (even if not the longest) match. */
1124 if (BE (err
!= REG_NOERROR
, 0))
1127 if (mctx
->state_log
== NULL
1128 || (match
&& !fl_longest_match
)
1129 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1133 if (BE (at_init_state
, 0))
1135 if (old_state
== cur_state
)
1136 next_start_idx
= next_char_idx
;
1141 if (cur_state
->halt
)
1143 /* Reached a halt state.
1144 Check the halt state can satisfy the current context. */
1145 if (!cur_state
->has_constraint
1146 || check_halt_state_context (mctx
, cur_state
,
1147 re_string_cur_idx (&mctx
->input
)))
1149 /* We found an appropriate halt state. */
1150 match_last
= re_string_cur_idx (&mctx
->input
);
1153 /* We found a match, do not modify match_first below. */
1154 p_match_first
= NULL
;
1155 if (!fl_longest_match
)
1162 *p_match_first
+= next_start_idx
;
1167 /* Check NODE match the current context. */
1169 static int check_halt_node_context (dfa
, node
, context
)
1170 const re_dfa_t
*dfa
;
1172 unsigned int context
;
1174 re_token_type_t type
= dfa
->nodes
[node
].type
;
1175 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1176 if (type
!= END_OF_RE
)
1180 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1185 /* Check the halt state STATE match the current context.
1186 Return 0 if not match, if the node, STATE has, is a halt node and
1187 match the context, return the node. */
1190 check_halt_state_context (mctx
, state
, idx
)
1191 const re_match_context_t
*mctx
;
1192 const re_dfastate_t
*state
;
1196 unsigned int context
;
1198 assert (state
->halt
);
1200 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1201 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1202 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1203 return state
->nodes
.elems
[i
];
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208 corresponding to the DFA).
1209 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1213 proceed_next_node (mctx
, nregs
, regs
, pidx
, node
, eps_via_nodes
, fs
)
1214 const re_match_context_t
*mctx
;
1216 int nregs
, *pidx
, node
;
1217 re_node_set
*eps_via_nodes
;
1218 struct re_fail_stack_t
*fs
;
1220 re_dfa_t
*const dfa
= mctx
->dfa
;
1221 int i
, err
, dest_node
;
1223 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1225 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1226 int ndest
, dest_nodes
[2];
1227 err
= re_node_set_insert (eps_via_nodes
, node
);
1228 if (BE (err
< 0, 0))
1230 /* Pick up valid destinations. */
1231 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1233 int candidate
= dfa
->edests
[node
].elems
[i
];
1234 if (!re_node_set_contains (cur_nodes
, candidate
))
1236 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1237 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1241 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1242 /* In order to avoid infinite loop like "(a*)*". */
1243 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1244 return dest_nodes
[1];
1246 && push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
,
1249 return dest_nodes
[0];
1254 re_token_type_t type
= dfa
->nodes
[node
].type
;
1256 #ifdef RE_ENABLE_I18N
1257 if (ACCEPT_MB_NODE (type
))
1258 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1260 #endif /* RE_ENABLE_I18N */
1261 if (type
== OP_BACK_REF
)
1263 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1264 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1267 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1271 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1272 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1280 err
= re_node_set_insert (eps_via_nodes
, node
);
1281 if (BE (err
< 0, 0))
1283 dest_node
= dfa
->edests
[node
].elems
[0];
1284 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1291 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1293 dest_node
= dfa
->nexts
[node
];
1294 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1295 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1296 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1299 re_node_set_empty (eps_via_nodes
);
1306 static reg_errcode_t
1307 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1308 struct re_fail_stack_t
*fs
;
1309 int str_idx
, *dests
, nregs
;
1311 re_node_set
*eps_via_nodes
;
1314 int num
= fs
->num
++;
1315 if (fs
->num
== fs
->alloc
)
1317 struct re_fail_stack_ent_t
*new_array
;
1318 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1320 if (new_array
== NULL
)
1323 fs
->stack
= new_array
;
1325 fs
->stack
[num
].idx
= str_idx
;
1326 fs
->stack
[num
].node
= dests
[1];
1327 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1328 if (fs
->stack
[num
].regs
== NULL
)
1330 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1331 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1336 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1337 struct re_fail_stack_t
*fs
;
1340 re_node_set
*eps_via_nodes
;
1342 int num
= --fs
->num
;
1344 *pidx
= fs
->stack
[num
].idx
;
1345 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1346 re_node_set_free (eps_via_nodes
);
1347 re_free (fs
->stack
[num
].regs
);
1348 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1349 return fs
->stack
[num
].node
;
1352 /* Set the positions where the subexpressions are starts/ends to registers
1354 Note: We assume that pmatch[0] is already set, and
1355 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1357 static reg_errcode_t
1358 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1359 const regex_t
*preg
;
1360 const re_match_context_t
*mctx
;
1365 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1366 int idx
, cur_node
, real_nmatch
;
1367 re_node_set eps_via_nodes
;
1368 struct re_fail_stack_t
*fs
;
1369 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1370 regmatch_t
*prev_idx_match
;
1373 assert (nmatch
> 1);
1374 assert (mctx
->state_log
!= NULL
);
1379 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1380 if (fs
->stack
== NULL
)
1386 cur_node
= dfa
->init_node
;
1387 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1388 re_node_set_init_empty (&eps_via_nodes
);
1390 prev_idx_match
= (regmatch_t
*) alloca (sizeof (regmatch_t
) * real_nmatch
);
1391 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * real_nmatch
);
1393 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1395 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, real_nmatch
);
1397 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1402 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1403 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1405 if (reg_idx
== nmatch
)
1407 re_node_set_free (&eps_via_nodes
);
1408 return free_fail_stack_return (fs
);
1410 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1415 re_node_set_free (&eps_via_nodes
);
1420 /* Proceed to next node. */
1421 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1422 &eps_via_nodes
, fs
);
1424 if (BE (cur_node
< 0, 0))
1426 if (BE (cur_node
== -2, 0))
1428 re_node_set_free (&eps_via_nodes
);
1429 free_fail_stack_return (fs
);
1433 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1437 re_node_set_free (&eps_via_nodes
);
1442 re_node_set_free (&eps_via_nodes
);
1443 return free_fail_stack_return (fs
);
1446 static reg_errcode_t
1447 free_fail_stack_return (fs
)
1448 struct re_fail_stack_t
*fs
;
1453 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1455 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1456 re_free (fs
->stack
[fs_idx
].regs
);
1458 re_free (fs
->stack
);
1464 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, cur_idx
, nmatch
)
1466 regmatch_t
*pmatch
, *prev_idx_match
;
1467 int cur_node
, cur_idx
, nmatch
;
1469 int type
= dfa
->nodes
[cur_node
].type
;
1470 if (type
== OP_OPEN_SUBEXP
)
1472 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1474 /* We are at the first node of this sub expression. */
1475 if (reg_num
< nmatch
)
1477 pmatch
[reg_num
].rm_so
= cur_idx
;
1478 pmatch
[reg_num
].rm_eo
= -1;
1481 else if (type
== OP_CLOSE_SUBEXP
)
1483 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1484 if (reg_num
< nmatch
)
1486 /* We are at the last node of this sub expression. */
1487 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1489 pmatch
[reg_num
].rm_eo
= cur_idx
;
1490 /* This is a non-empty match or we are not inside an optional
1491 subexpression. Accept this right away. */
1492 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1496 if (dfa
->nodes
[cur_node
].opt_subexp
1497 && prev_idx_match
[reg_num
].rm_so
!= -1)
1498 /* We transited through an empty match for an optional
1499 subexpression, like (a?)*, and this is not the subexp's
1500 first match. Copy back the old content of the registers
1501 so that matches of an inner subexpression are undone as
1502 well, like in ((a?))*. */
1503 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1505 /* We completed a subexpression, but it may be part of
1506 an optional one, so do not update PREV_IDX_MATCH. */
1507 pmatch
[reg_num
].rm_eo
= cur_idx
;
1513 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1514 and sift the nodes in each states according to the following rules.
1515 Updated state_log will be wrote to STATE_LOG.
1517 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1518 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1519 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1520 the LAST_NODE, we throw away the node `a'.
1521 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1522 string `s' and transit to `b':
1523 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1525 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1526 thrown away, we throw away the node `a'.
1527 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1528 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1530 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1531 we throw away the node `a'. */
1533 #define STATE_NODE_CONTAINS(state,node) \
1534 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1536 static reg_errcode_t
1537 sift_states_backward (mctx
, sctx
)
1538 re_match_context_t
*mctx
;
1539 re_sift_context_t
*sctx
;
1543 int str_idx
= sctx
->last_str_idx
;
1544 re_node_set cur_dest
;
1547 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1550 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1551 transit to the last_node and the last_node itself. */
1552 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1553 if (BE (err
!= REG_NOERROR
, 0))
1555 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1556 if (BE (err
!= REG_NOERROR
, 0))
1559 /* Then check each states in the state_log. */
1562 /* Update counters. */
1563 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1564 if (null_cnt
> mctx
->max_mb_elem_len
)
1566 memset (sctx
->sifted_states
, '\0',
1567 sizeof (re_dfastate_t
*) * str_idx
);
1568 re_node_set_free (&cur_dest
);
1571 re_node_set_empty (&cur_dest
);
1574 if (mctx
->state_log
[str_idx
])
1576 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1577 if (BE (err
!= REG_NOERROR
, 0))
1581 /* Add all the nodes which satisfy the following conditions:
1582 - It can epsilon transit to a node in CUR_DEST.
1584 And update state_log. */
1585 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1586 if (BE (err
!= REG_NOERROR
, 0))
1591 re_node_set_free (&cur_dest
);
1595 static reg_errcode_t
1596 build_sifted_states (mctx
, sctx
, str_idx
, cur_dest
)
1597 re_match_context_t
*mctx
;
1598 re_sift_context_t
*sctx
;
1600 re_node_set
*cur_dest
;
1602 re_dfa_t
*const dfa
= mctx
->dfa
;
1603 re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1606 /* Then build the next sifted state.
1607 We build the next sifted state on `cur_dest', and update
1608 `sifted_states[str_idx]' with `cur_dest'.
1610 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1611 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1612 for (i
= 0; i
< cur_src
->nelem
; i
++)
1614 int prev_node
= cur_src
->elems
[i
];
1616 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1619 if (IS_EPSILON_NODE (type
))
1621 #ifdef RE_ENABLE_I18N
1622 /* If the node may accept `multi byte'. */
1623 if (ACCEPT_MB_NODE (type
))
1624 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1625 str_idx
, sctx
->last_str_idx
);
1626 #endif /* RE_ENABLE_I18N */
1628 /* We don't check backreferences here.
1629 See update_cur_sifted_state(). */
1631 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1632 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1633 dfa
->nexts
[prev_node
]))
1639 if (sctx
->limits
.nelem
)
1641 int to_idx
= str_idx
+ naccepted
;
1642 if (check_dst_limits (mctx
, &sctx
->limits
,
1643 dfa
->nexts
[prev_node
], to_idx
,
1644 prev_node
, str_idx
))
1647 ret
= re_node_set_insert (cur_dest
, prev_node
);
1648 if (BE (ret
== -1, 0))
1655 /* Helper functions. */
1657 static reg_errcode_t
1658 clean_state_log_if_needed (mctx
, next_state_log_idx
)
1659 re_match_context_t
*mctx
;
1660 int next_state_log_idx
;
1662 int top
= mctx
->state_log_top
;
1664 if (next_state_log_idx
>= mctx
->input
.bufs_len
1665 || (next_state_log_idx
>= mctx
->input
.valid_len
1666 && mctx
->input
.valid_len
< mctx
->input
.len
))
1669 err
= extend_buffers (mctx
);
1670 if (BE (err
!= REG_NOERROR
, 0))
1674 if (top
< next_state_log_idx
)
1676 memset (mctx
->state_log
+ top
+ 1, '\0',
1677 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1678 mctx
->state_log_top
= next_state_log_idx
;
1683 static reg_errcode_t
1684 merge_state_array (dfa
, dst
, src
, num
)
1686 re_dfastate_t
**dst
;
1687 re_dfastate_t
**src
;
1692 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1694 if (dst
[st_idx
] == NULL
)
1695 dst
[st_idx
] = src
[st_idx
];
1696 else if (src
[st_idx
] != NULL
)
1698 re_node_set merged_set
;
1699 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1700 &src
[st_idx
]->nodes
);
1701 if (BE (err
!= REG_NOERROR
, 0))
1703 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1704 re_node_set_free (&merged_set
);
1705 if (BE (err
!= REG_NOERROR
, 0))
1712 static reg_errcode_t
1713 update_cur_sifted_state (mctx
, sctx
, str_idx
, dest_nodes
)
1714 re_match_context_t
*mctx
;
1715 re_sift_context_t
*sctx
;
1717 re_node_set
*dest_nodes
;
1719 re_dfa_t
*const dfa
= mctx
->dfa
;
1721 const re_node_set
*candidates
;
1722 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1723 : &mctx
->state_log
[str_idx
]->nodes
);
1725 if (dest_nodes
->nelem
== 0)
1726 sctx
->sifted_states
[str_idx
] = NULL
;
1731 /* At first, add the nodes which can epsilon transit to a node in
1733 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1734 if (BE (err
!= REG_NOERROR
, 0))
1737 /* Then, check the limitations in the current sift_context. */
1738 if (sctx
->limits
.nelem
)
1740 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1741 mctx
->bkref_ents
, str_idx
);
1742 if (BE (err
!= REG_NOERROR
, 0))
1747 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1748 if (BE (err
!= REG_NOERROR
, 0))
1752 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1754 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1755 if (BE (err
!= REG_NOERROR
, 0))
1761 static reg_errcode_t
1762 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1764 re_node_set
*dest_nodes
;
1765 const re_node_set
*candidates
;
1769 re_node_set src_copy
;
1771 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1772 if (BE (err
!= REG_NOERROR
, 0))
1774 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1776 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1778 + src_copy
.elems
[src_idx
]);
1779 if (BE (err
!= REG_NOERROR
, 0))
1781 re_node_set_free (&src_copy
);
1785 re_node_set_free (&src_copy
);
1789 static reg_errcode_t
1790 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1793 re_node_set
*dest_nodes
;
1794 const re_node_set
*candidates
;
1798 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1799 re_node_set except_nodes
;
1800 re_node_set_init_empty (&except_nodes
);
1801 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1803 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1804 if (cur_node
== node
)
1806 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1808 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1809 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1810 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1811 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1812 && re_node_set_contains (dest_nodes
, edst1
))
1814 && !re_node_set_contains (inv_eclosure
, edst2
)
1815 && re_node_set_contains (dest_nodes
, edst2
)))
1817 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1818 dfa
->inveclosures
+ cur_node
);
1819 if (BE (err
!= REG_NOERROR
, 0))
1821 re_node_set_free (&except_nodes
);
1827 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1829 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1830 if (!re_node_set_contains (&except_nodes
, cur_node
))
1832 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1833 re_node_set_remove_at (dest_nodes
, idx
);
1836 re_node_set_free (&except_nodes
);
1841 check_dst_limits (mctx
, limits
, dst_node
, dst_idx
, src_node
, src_idx
)
1842 re_match_context_t
*mctx
;
1843 re_node_set
*limits
;
1844 int dst_node
, dst_idx
, src_node
, src_idx
;
1846 re_dfa_t
*const dfa
= mctx
->dfa
;
1847 int lim_idx
, src_pos
, dst_pos
;
1849 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1850 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1851 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1854 struct re_backref_cache_entry
*ent
;
1855 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1856 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1858 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1859 subexp_idx
, dst_node
, dst_idx
,
1861 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1862 subexp_idx
, src_node
, src_idx
,
1866 <src> <dst> ( <subexp> )
1867 ( <subexp> ) <src> <dst>
1868 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1869 if (src_pos
== dst_pos
)
1870 continue; /* This is unrelated limitation. */
1878 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
, from_node
, bkref_idx
)
1879 re_match_context_t
*mctx
;
1880 int boundaries
, subexp_idx
, from_node
, bkref_idx
;
1882 re_dfa_t
*const dfa
= mctx
->dfa
;
1883 re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1886 /* Else, we are on the boundary: examine the nodes on the epsilon
1888 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1890 int node
= eclosures
->elems
[node_idx
];
1891 switch (dfa
->nodes
[node
].type
)
1895 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1900 if (ent
->node
!= node
)
1903 if (subexp_idx
<= 8 * sizeof (ent
->eps_reachable_subexps_map
)
1904 && (ent
->eps_reachable_subexps_map
1905 & (1 << (subexp_idx
- 1))) == 0)
1908 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1909 OP_CLOSE_SUBEXP cases below. But, if the
1910 destination node is the same node as the source
1911 node, don't recurse because it would cause an
1912 infinite loop: a regex that exhibits this behavior
1914 dst
= dfa
->edests
[node
].elems
[0];
1915 if (dst
== from_node
)
1919 else /* if (boundaries & 2) */
1923 cpos
= check_dst_limits_calc_pos_1 (mctx
, boundaries
,
1924 subexp_idx
, dst
, bkref_idx
);
1926 if (cpos
== -1 /* && (boundaries & 1) */)
1929 if (cpos
== 0 && (boundaries
& 2))
1932 ent
->eps_reachable_subexps_map
&= ~(1 << (subexp_idx
- 1));
1934 while (ent
++->more
);
1938 case OP_OPEN_SUBEXP
:
1939 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1943 case OP_CLOSE_SUBEXP
:
1944 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1953 return (boundaries
& 2) ? 1 : 0;
1957 check_dst_limits_calc_pos (mctx
, limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
)
1958 re_match_context_t
*mctx
;
1959 int limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
;
1961 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1964 /* If we are outside the range of the subexpression, return -1 or 1. */
1965 if (str_idx
< lim
->subexp_from
)
1968 if (lim
->subexp_to
< str_idx
)
1971 /* If we are within the subexpression, return 0. */
1972 boundaries
= (str_idx
== lim
->subexp_from
);
1973 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
1974 if (boundaries
== 0)
1977 /* Else, examine epsilon closure. */
1978 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1979 from_node
, bkref_idx
);
1982 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1983 which are against limitations from DEST_NODES. */
1985 static reg_errcode_t
1986 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1988 re_node_set
*dest_nodes
;
1989 const re_node_set
*candidates
;
1990 re_node_set
*limits
;
1991 struct re_backref_cache_entry
*bkref_ents
;
1995 int node_idx
, lim_idx
;
1997 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2000 struct re_backref_cache_entry
*ent
;
2001 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2003 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2004 continue; /* This is unrelated limitation. */
2006 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
2007 if (ent
->subexp_to
== str_idx
)
2011 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2013 int node
= dest_nodes
->elems
[node_idx
];
2014 re_token_type_t type
= dfa
->nodes
[node
].type
;
2015 if (type
== OP_OPEN_SUBEXP
2016 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2018 else if (type
== OP_CLOSE_SUBEXP
2019 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2023 /* Check the limitation of the open subexpression. */
2024 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2027 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2029 if (BE (err
!= REG_NOERROR
, 0))
2033 /* Check the limitation of the close subexpression. */
2035 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2037 int node
= dest_nodes
->elems
[node_idx
];
2038 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2040 && !re_node_set_contains (dfa
->eclosures
+ node
,
2043 /* It is against this limitation.
2044 Remove it form the current sifted state. */
2045 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2047 if (BE (err
!= REG_NOERROR
, 0))
2053 else /* (ent->subexp_to != str_idx) */
2055 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2057 int node
= dest_nodes
->elems
[node_idx
];
2058 re_token_type_t type
= dfa
->nodes
[node
].type
;
2059 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2061 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2063 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
2064 || (type
== OP_OPEN_SUBEXP
))
2066 /* It is against this limitation.
2067 Remove it form the current sifted state. */
2068 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2070 if (BE (err
!= REG_NOERROR
, 0))
2080 static reg_errcode_t
2081 sift_states_bkref (mctx
, sctx
, str_idx
, candidates
)
2082 re_match_context_t
*mctx
;
2083 re_sift_context_t
*sctx
;
2085 const re_node_set
*candidates
;
2087 re_dfa_t
*const dfa
= mctx
->dfa
;
2090 re_sift_context_t local_sctx
;
2091 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2093 if (first_idx
== -1)
2096 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2098 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2101 re_token_type_t type
;
2102 struct re_backref_cache_entry
*entry
;
2103 node
= candidates
->elems
[node_idx
];
2104 type
= dfa
->nodes
[node
].type
;
2105 /* Avoid infinite loop for the REs like "()\1+". */
2106 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2108 if (type
!= OP_BACK_REF
)
2111 entry
= mctx
->bkref_ents
+ first_idx
;
2112 enabled_idx
= first_idx
;
2115 int subexp_len
, to_idx
, dst_node
;
2116 re_dfastate_t
*cur_state
;
2118 if (entry
->node
!= node
)
2120 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2121 to_idx
= str_idx
+ subexp_len
;
2122 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2123 : dfa
->edests
[node
].elems
[0]);
2125 if (to_idx
> sctx
->last_str_idx
2126 || sctx
->sifted_states
[to_idx
] == NULL
2127 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2128 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2129 str_idx
, dst_node
, to_idx
))
2132 if (local_sctx
.sifted_states
== NULL
)
2135 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2136 if (BE (err
!= REG_NOERROR
, 0))
2139 local_sctx
.last_node
= node
;
2140 local_sctx
.last_str_idx
= str_idx
;
2141 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2142 if (BE (err
< 0, 0))
2147 cur_state
= local_sctx
.sifted_states
[str_idx
];
2148 err
= sift_states_backward (mctx
, &local_sctx
);
2149 if (BE (err
!= REG_NOERROR
, 0))
2151 if (sctx
->limited_states
!= NULL
)
2153 err
= merge_state_array (dfa
, sctx
->limited_states
,
2154 local_sctx
.sifted_states
,
2156 if (BE (err
!= REG_NOERROR
, 0))
2159 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2160 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2162 /* mctx->bkref_ents may have changed, reload the pointer. */
2163 entry
= mctx
->bkref_ents
+ enabled_idx
;
2165 while (enabled_idx
++, entry
++->more
);
2169 if (local_sctx
.sifted_states
!= NULL
)
2171 re_node_set_free (&local_sctx
.limits
);
2178 #ifdef RE_ENABLE_I18N
2180 sift_states_iter_mb (mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2181 const re_match_context_t
*mctx
;
2182 re_sift_context_t
*sctx
;
2183 int node_idx
, str_idx
, max_str_idx
;
2185 re_dfa_t
*const dfa
= mctx
->dfa
;
2187 /* Check the node can accept `multi byte'. */
2188 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2189 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2190 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2191 dfa
->nexts
[node_idx
]))
2192 /* The node can't accept the `multi byte', or the
2193 destination was already thrown away, then the node
2194 could't accept the current input `multi byte'. */
2196 /* Otherwise, it is sure that the node could accept
2197 `naccepted' bytes input. */
2200 #endif /* RE_ENABLE_I18N */
2203 /* Functions for state transition. */
2205 /* Return the next state to which the current state STATE will transit by
2206 accepting the current input byte, and update STATE_LOG if necessary.
2207 If STATE can accept a multibyte char/collating element/back reference
2208 update the destination of STATE_LOG. */
2210 static re_dfastate_t
*
2211 transit_state (err
, mctx
, state
)
2213 re_match_context_t
*mctx
;
2214 re_dfastate_t
*state
;
2216 re_dfa_t
*const dfa
= mctx
->dfa
;
2217 re_dfastate_t
**trtable
;
2220 #ifdef RE_ENABLE_I18N
2221 /* If the current state can accept multibyte. */
2222 if (BE (state
->accept_mb
, 0))
2224 *err
= transit_state_mb (mctx
, state
);
2225 if (BE (*err
!= REG_NOERROR
, 0))
2228 #endif /* RE_ENABLE_I18N */
2230 /* Then decide the next state with the single byte. */
2233 /* Use transition table */
2234 ch
= re_string_fetch_byte (&mctx
->input
);
2235 trtable
= state
->trtable
;
2236 if (trtable
== NULL
)
2238 trtable
= build_trtable (dfa
, state
);
2239 if (trtable
== NULL
)
2245 if (BE (state
->word_trtable
, 0))
2247 unsigned int context
;
2249 = re_string_context_at (&mctx
->input
,
2250 re_string_cur_idx (&mctx
->input
) - 1,
2252 if (IS_WORD_CONTEXT (context
))
2253 return trtable
[ch
+ SBC_MAX
];
2262 /* don't use transition table */
2263 return transit_state_sb (err
, mctx
, state
);
2267 /* Update the state_log if we need */
2269 merge_state_with_log (err
, mctx
, next_state
)
2271 re_match_context_t
*mctx
;
2272 re_dfastate_t
*next_state
;
2274 re_dfa_t
*const dfa
= mctx
->dfa
;
2275 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2277 if (cur_idx
> mctx
->state_log_top
)
2279 mctx
->state_log
[cur_idx
] = next_state
;
2280 mctx
->state_log_top
= cur_idx
;
2282 else if (mctx
->state_log
[cur_idx
] == 0)
2284 mctx
->state_log
[cur_idx
] = next_state
;
2288 re_dfastate_t
*pstate
;
2289 unsigned int context
;
2290 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2291 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2292 the destination of a multibyte char/collating element/
2293 back reference. Then the next state is the union set of
2294 these destinations and the results of the transition table. */
2295 pstate
= mctx
->state_log
[cur_idx
];
2296 log_nodes
= pstate
->entrance_nodes
;
2297 if (next_state
!= NULL
)
2299 table_nodes
= next_state
->entrance_nodes
;
2300 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2302 if (BE (*err
!= REG_NOERROR
, 0))
2306 next_nodes
= *log_nodes
;
2307 /* Note: We already add the nodes of the initial state,
2308 then we don't need to add them here. */
2310 context
= re_string_context_at (&mctx
->input
,
2311 re_string_cur_idx (&mctx
->input
) - 1,
2313 next_state
= mctx
->state_log
[cur_idx
]
2314 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2315 /* We don't need to check errors here, since the return value of
2316 this function is next_state and ERR is already set. */
2318 if (table_nodes
!= NULL
)
2319 re_node_set_free (&next_nodes
);
2322 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2324 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2325 later. We must check them here, since the back references in the
2326 next state might use them. */
2327 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2329 if (BE (*err
!= REG_NOERROR
, 0))
2332 /* If the next state has back references. */
2333 if (next_state
->has_backref
)
2335 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2336 if (BE (*err
!= REG_NOERROR
, 0))
2338 next_state
= mctx
->state_log
[cur_idx
];
2345 /* Skip bytes in the input that correspond to part of a
2346 multi-byte match, then look in the log for a state
2347 from which to restart matching. */
2349 find_recover_state (err
, mctx
)
2351 re_match_context_t
*mctx
;
2353 re_dfastate_t
*cur_state
= NULL
;
2356 int max
= mctx
->state_log_top
;
2357 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2361 if (++cur_str_idx
> max
)
2363 re_string_skip_bytes (&mctx
->input
, 1);
2365 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2367 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2369 while (err
== REG_NOERROR
&& cur_state
== NULL
);
2373 /* Helper functions for transit_state. */
2375 /* From the node set CUR_NODES, pick up the nodes whose types are
2376 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2377 expression. And register them to use them later for evaluating the
2378 correspoding back references. */
2380 static reg_errcode_t
2381 check_subexp_matching_top (mctx
, cur_nodes
, str_idx
)
2382 re_match_context_t
*mctx
;
2383 re_node_set
*cur_nodes
;
2386 re_dfa_t
*const dfa
= mctx
->dfa
;
2390 /* TODO: This isn't efficient.
2391 Because there might be more than one nodes whose types are
2392 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2395 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2397 int node
= cur_nodes
->elems
[node_idx
];
2398 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2399 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2400 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2402 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2403 if (BE (err
!= REG_NOERROR
, 0))
2411 /* Return the next state to which the current state STATE will transit by
2412 accepting the current input byte. */
2414 static re_dfastate_t
*
2415 transit_state_sb (err
, mctx
, state
)
2417 re_match_context_t
*mctx
;
2418 re_dfastate_t
*state
;
2420 re_dfa_t
*const dfa
= mctx
->dfa
;
2421 re_node_set next_nodes
;
2422 re_dfastate_t
*next_state
;
2423 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2424 unsigned int context
;
2426 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2427 if (BE (*err
!= REG_NOERROR
, 0))
2429 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2431 int cur_node
= state
->nodes
.elems
[node_cnt
];
2432 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2434 *err
= re_node_set_merge (&next_nodes
,
2435 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2436 if (BE (*err
!= REG_NOERROR
, 0))
2438 re_node_set_free (&next_nodes
);
2443 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2444 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2445 /* We don't need to check errors here, since the return value of
2446 this function is next_state and ERR is already set. */
2448 re_node_set_free (&next_nodes
);
2449 re_string_skip_bytes (&mctx
->input
, 1);
2454 #ifdef RE_ENABLE_I18N
2455 static reg_errcode_t
2456 transit_state_mb (mctx
, pstate
)
2457 re_match_context_t
*mctx
;
2458 re_dfastate_t
*pstate
;
2460 re_dfa_t
*const dfa
= mctx
->dfa
;
2464 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2466 re_node_set dest_nodes
, *new_nodes
;
2467 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2468 int naccepted
= 0, dest_idx
;
2469 unsigned int context
;
2470 re_dfastate_t
*dest_state
;
2472 if (dfa
->nodes
[cur_node_idx
].constraint
)
2474 context
= re_string_context_at (&mctx
->input
,
2475 re_string_cur_idx (&mctx
->input
),
2477 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2482 /* How many bytes the node can accept? */
2483 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2484 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2485 re_string_cur_idx (&mctx
->input
));
2489 /* The node can accepts `naccepted' bytes. */
2490 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2491 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2492 : mctx
->max_mb_elem_len
);
2493 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2494 if (BE (err
!= REG_NOERROR
, 0))
2497 assert (dfa
->nexts
[cur_node_idx
] != -1);
2499 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2500 then we use pstate->nodes.elems[i] instead. */
2501 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2503 dest_state
= mctx
->state_log
[dest_idx
];
2504 if (dest_state
== NULL
)
2505 dest_nodes
= *new_nodes
;
2508 err
= re_node_set_init_union (&dest_nodes
,
2509 dest_state
->entrance_nodes
, new_nodes
);
2510 if (BE (err
!= REG_NOERROR
, 0))
2513 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1, mctx
->eflags
);
2514 mctx
->state_log
[dest_idx
]
2515 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2516 if (dest_state
!= NULL
)
2517 re_node_set_free (&dest_nodes
);
2518 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2523 #endif /* RE_ENABLE_I18N */
2525 static reg_errcode_t
2526 transit_state_bkref (mctx
, nodes
)
2527 re_match_context_t
*mctx
;
2528 const re_node_set
*nodes
;
2530 re_dfa_t
*const dfa
= mctx
->dfa
;
2533 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2535 for (i
= 0; i
< nodes
->nelem
; ++i
)
2537 int dest_str_idx
, prev_nelem
, bkc_idx
;
2538 int node_idx
= nodes
->elems
[i
];
2539 unsigned int context
;
2540 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2541 re_node_set
*new_dest_nodes
;
2543 /* Check whether `node' is a backreference or not. */
2544 if (node
->type
!= OP_BACK_REF
)
2547 if (node
->constraint
)
2549 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2551 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2555 /* `node' is a backreference.
2556 Check the substring which the substring matched. */
2557 bkc_idx
= mctx
->nbkref_ents
;
2558 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2559 if (BE (err
!= REG_NOERROR
, 0))
2562 /* And add the epsilon closures (which is `new_dest_nodes') of
2563 the backreference to appropriate state_log. */
2565 assert (dfa
->nexts
[node_idx
] != -1);
2567 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2570 re_dfastate_t
*dest_state
;
2571 struct re_backref_cache_entry
*bkref_ent
;
2572 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2573 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2575 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2576 new_dest_nodes
= (subexp_len
== 0
2577 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2578 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2579 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2580 - bkref_ent
->subexp_from
);
2581 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2583 dest_state
= mctx
->state_log
[dest_str_idx
];
2584 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2585 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2586 /* Add `new_dest_node' to state_log. */
2587 if (dest_state
== NULL
)
2589 mctx
->state_log
[dest_str_idx
]
2590 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2592 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2593 && err
!= REG_NOERROR
, 0))
2598 re_node_set dest_nodes
;
2599 err
= re_node_set_init_union (&dest_nodes
,
2600 dest_state
->entrance_nodes
,
2602 if (BE (err
!= REG_NOERROR
, 0))
2604 re_node_set_free (&dest_nodes
);
2607 mctx
->state_log
[dest_str_idx
]
2608 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2609 re_node_set_free (&dest_nodes
);
2610 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2611 && err
!= REG_NOERROR
, 0))
2614 /* We need to check recursively if the backreference can epsilon
2617 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2619 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2621 if (BE (err
!= REG_NOERROR
, 0))
2623 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2624 if (BE (err
!= REG_NOERROR
, 0))
2634 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2635 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2636 Note that we might collect inappropriate candidates here.
2637 However, the cost of checking them strictly here is too high, then we
2638 delay these checking for prune_impossible_nodes(). */
2640 static reg_errcode_t
2641 get_subexp (mctx
, bkref_node
, bkref_str_idx
)
2642 re_match_context_t
*mctx
;
2643 int bkref_node
, bkref_str_idx
;
2645 re_dfa_t
*const dfa
= mctx
->dfa
;
2646 int subexp_num
, sub_top_idx
;
2647 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2648 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2649 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2650 if (cache_idx
!= -1)
2652 const struct re_backref_cache_entry
*entry
= mctx
->bkref_ents
+ cache_idx
;
2654 if (entry
->node
== bkref_node
)
2655 return REG_NOERROR
; /* We already checked it. */
2656 while (entry
++->more
);
2659 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2661 /* For each sub expression */
2662 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2665 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2666 re_sub_match_last_t
*sub_last
;
2667 int sub_last_idx
, sl_str
, bkref_str_off
;
2669 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2670 continue; /* It isn't related. */
2672 sl_str
= sub_top
->str_idx
;
2673 bkref_str_off
= bkref_str_idx
;
2674 /* At first, check the last node of sub expressions we already
2676 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2679 sub_last
= sub_top
->lasts
[sub_last_idx
];
2680 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2681 /* The matched string by the sub expression match with the substring
2682 at the back reference? */
2683 if (sl_str_diff
> 0)
2685 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2687 /* Not enough chars for a successful match. */
2688 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2691 err
= clean_state_log_if_needed (mctx
,
2694 if (BE (err
!= REG_NOERROR
, 0))
2696 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2698 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2699 break; /* We don't need to search this sub expression any more. */
2701 bkref_str_off
+= sl_str_diff
;
2702 sl_str
+= sl_str_diff
;
2703 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2706 /* Reload buf, since the preceding call might have reallocated
2708 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2710 if (err
== REG_NOMATCH
)
2712 if (BE (err
!= REG_NOERROR
, 0))
2716 if (sub_last_idx
< sub_top
->nlasts
)
2718 if (sub_last_idx
> 0)
2720 /* Then, search for the other last nodes of the sub expression. */
2721 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2723 int cls_node
, sl_str_off
;
2724 const re_node_set
*nodes
;
2725 sl_str_off
= sl_str
- sub_top
->str_idx
;
2726 /* The matched string by the sub expression match with the substring
2727 at the back reference? */
2730 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2732 /* If we are at the end of the input, we cannot match. */
2733 if (bkref_str_off
>= mctx
->input
.len
)
2736 err
= extend_buffers (mctx
);
2737 if (BE (err
!= REG_NOERROR
, 0))
2740 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2742 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2743 break; /* We don't need to search this sub expression
2746 if (mctx
->state_log
[sl_str
] == NULL
)
2748 /* Does this state have a ')' of the sub expression? */
2749 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2750 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2753 if (sub_top
->path
== NULL
)
2755 sub_top
->path
= calloc (sizeof (state_array_t
),
2756 sl_str
- sub_top
->str_idx
+ 1);
2757 if (sub_top
->path
== NULL
)
2760 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2761 in the current context? */
2762 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2763 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2764 if (err
== REG_NOMATCH
)
2766 if (BE (err
!= REG_NOERROR
, 0))
2768 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2769 if (BE (sub_last
== NULL
, 0))
2771 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2773 if (err
== REG_NOMATCH
)
2780 /* Helper functions for get_subexp(). */
2782 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2783 If it can arrive, register the sub expression expressed with SUB_TOP
2786 static reg_errcode_t
2787 get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2788 re_match_context_t
*mctx
;
2789 const re_sub_match_top_t
*sub_top
;
2790 re_sub_match_last_t
*sub_last
;
2791 int bkref_node
, bkref_str
;
2795 /* Can the subexpression arrive the back reference? */
2796 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2797 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2798 if (err
!= REG_NOERROR
)
2800 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2802 if (BE (err
!= REG_NOERROR
, 0))
2804 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2805 return clean_state_log_if_needed (mctx
, to_idx
);
2808 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2809 Search '(' if FL_OPEN, or search ')' otherwise.
2810 TODO: This function isn't efficient...
2811 Because there might be more than one nodes whose types are
2812 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2817 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2818 const re_dfa_t
*dfa
;
2819 const re_node_set
*nodes
;
2820 int subexp_idx
, type
;
2823 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2825 int cls_node
= nodes
->elems
[cls_idx
];
2826 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2827 if (node
->type
== type
2828 && node
->opr
.idx
== subexp_idx
)
2834 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2835 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2837 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2839 static reg_errcode_t
2840 check_arrival (mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2842 re_match_context_t
*mctx
;
2843 state_array_t
*path
;
2844 int top_node
, top_str
, last_node
, last_str
, type
;
2846 re_dfa_t
*const dfa
= mctx
->dfa
;
2848 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2849 re_dfastate_t
*cur_state
= NULL
;
2850 re_node_set
*cur_nodes
, next_nodes
;
2851 re_dfastate_t
**backup_state_log
;
2852 unsigned int context
;
2854 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2855 /* Extend the buffer if we need. */
2856 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2858 re_dfastate_t
**new_array
;
2859 int old_alloc
= path
->alloc
;
2860 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2861 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2862 if (new_array
== NULL
)
2864 path
->alloc
= old_alloc
;
2867 path
->array
= new_array
;
2868 memset (new_array
+ old_alloc
, '\0',
2869 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2872 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2874 /* Temporary modify MCTX. */
2875 backup_state_log
= mctx
->state_log
;
2876 backup_cur_idx
= mctx
->input
.cur_idx
;
2877 mctx
->state_log
= path
->array
;
2878 mctx
->input
.cur_idx
= str_idx
;
2880 /* Setup initial node set. */
2881 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2882 if (str_idx
== top_str
)
2884 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2885 if (BE (err
!= REG_NOERROR
, 0))
2887 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2888 if (BE (err
!= REG_NOERROR
, 0))
2890 re_node_set_free (&next_nodes
);
2896 cur_state
= mctx
->state_log
[str_idx
];
2897 if (cur_state
&& cur_state
->has_backref
)
2899 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2900 if (BE ( err
!= REG_NOERROR
, 0))
2904 re_node_set_init_empty (&next_nodes
);
2906 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2908 if (next_nodes
.nelem
)
2910 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2912 if (BE ( err
!= REG_NOERROR
, 0))
2914 re_node_set_free (&next_nodes
);
2918 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2919 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2921 re_node_set_free (&next_nodes
);
2924 mctx
->state_log
[str_idx
] = cur_state
;
2927 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2929 re_node_set_empty (&next_nodes
);
2930 if (mctx
->state_log
[str_idx
+ 1])
2932 err
= re_node_set_merge (&next_nodes
,
2933 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2934 if (BE (err
!= REG_NOERROR
, 0))
2936 re_node_set_free (&next_nodes
);
2942 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2943 &cur_state
->nodes
, &next_nodes
);
2944 if (BE (err
!= REG_NOERROR
, 0))
2946 re_node_set_free (&next_nodes
);
2951 if (next_nodes
.nelem
)
2953 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2954 if (BE (err
!= REG_NOERROR
, 0))
2956 re_node_set_free (&next_nodes
);
2959 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2961 if (BE ( err
!= REG_NOERROR
, 0))
2963 re_node_set_free (&next_nodes
);
2967 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2968 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2969 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2971 re_node_set_free (&next_nodes
);
2974 mctx
->state_log
[str_idx
] = cur_state
;
2975 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2977 re_node_set_free (&next_nodes
);
2978 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2979 : &mctx
->state_log
[last_str
]->nodes
);
2980 path
->next_idx
= str_idx
;
2983 mctx
->state_log
= backup_state_log
;
2984 mctx
->input
.cur_idx
= backup_cur_idx
;
2986 /* Then check the current node set has the node LAST_NODE. */
2987 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2993 /* Helper functions for check_arrival. */
2995 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2997 TODO: This function is similar to the functions transit_state*(),
2998 however this function has many additional works.
2999 Can't we unify them? */
3001 static reg_errcode_t
3002 check_arrival_add_next_nodes (mctx
, str_idx
, cur_nodes
, next_nodes
)
3003 re_match_context_t
*mctx
;
3005 re_node_set
*cur_nodes
, *next_nodes
;
3007 re_dfa_t
*const dfa
= mctx
->dfa
;
3011 re_node_set union_set
;
3012 re_node_set_init_empty (&union_set
);
3013 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3016 int cur_node
= cur_nodes
->elems
[cur_idx
];
3017 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3018 if (IS_EPSILON_NODE (type
))
3020 #ifdef RE_ENABLE_I18N
3021 /* If the node may accept `multi byte'. */
3022 if (ACCEPT_MB_NODE (type
))
3024 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3028 re_dfastate_t
*dest_state
;
3029 int next_node
= dfa
->nexts
[cur_node
];
3030 int next_idx
= str_idx
+ naccepted
;
3031 dest_state
= mctx
->state_log
[next_idx
];
3032 re_node_set_empty (&union_set
);
3035 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3036 if (BE (err
!= REG_NOERROR
, 0))
3038 re_node_set_free (&union_set
);
3042 result
= re_node_set_insert (&union_set
, next_node
);
3043 if (BE (result
< 0, 0))
3045 re_node_set_free (&union_set
);
3048 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3050 if (BE (mctx
->state_log
[next_idx
] == NULL
3051 && err
!= REG_NOERROR
, 0))
3053 re_node_set_free (&union_set
);
3058 #endif /* RE_ENABLE_I18N */
3060 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3062 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3063 if (BE (result
< 0, 0))
3065 re_node_set_free (&union_set
);
3070 re_node_set_free (&union_set
);
3074 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3075 CUR_NODES, however exclude the nodes which are:
3076 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3077 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3080 static reg_errcode_t
3081 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
3083 re_node_set
*cur_nodes
;
3084 int ex_subexp
, type
;
3087 int idx
, outside_node
;
3088 re_node_set new_nodes
;
3090 assert (cur_nodes
->nelem
);
3092 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3093 if (BE (err
!= REG_NOERROR
, 0))
3095 /* Create a new node set NEW_NODES with the nodes which are epsilon
3096 closures of the node in CUR_NODES. */
3098 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3100 int cur_node
= cur_nodes
->elems
[idx
];
3101 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3102 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3103 if (outside_node
== -1)
3105 /* There are no problematic nodes, just merge them. */
3106 err
= re_node_set_merge (&new_nodes
, eclosure
);
3107 if (BE (err
!= REG_NOERROR
, 0))
3109 re_node_set_free (&new_nodes
);
3115 /* There are problematic nodes, re-calculate incrementally. */
3116 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3118 if (BE (err
!= REG_NOERROR
, 0))
3120 re_node_set_free (&new_nodes
);
3125 re_node_set_free (cur_nodes
);
3126 *cur_nodes
= new_nodes
;
3130 /* Helper function for check_arrival_expand_ecl.
3131 Check incrementally the epsilon closure of TARGET, and if it isn't
3132 problematic append it to DST_NODES. */
3134 static reg_errcode_t
3135 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
3137 int target
, ex_subexp
, type
;
3138 re_node_set
*dst_nodes
;
3141 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3145 if (dfa
->nodes
[cur_node
].type
== type
3146 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3148 if (type
== OP_CLOSE_SUBEXP
)
3150 err
= re_node_set_insert (dst_nodes
, cur_node
);
3151 if (BE (err
== -1, 0))
3156 err
= re_node_set_insert (dst_nodes
, cur_node
);
3157 if (BE (err
== -1, 0))
3159 if (dfa
->edests
[cur_node
].nelem
== 0)
3161 if (dfa
->edests
[cur_node
].nelem
== 2)
3163 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3164 dfa
->edests
[cur_node
].elems
[1],
3166 if (BE (err
!= REG_NOERROR
, 0))
3169 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3175 /* For all the back references in the current state, calculate the
3176 destination of the back references by the appropriate entry
3177 in MCTX->BKREF_ENTS. */
3179 static reg_errcode_t
3180 expand_bkref_cache (mctx
, cur_nodes
, cur_str
, subexp_num
,
3182 re_match_context_t
*mctx
;
3183 int cur_str
, subexp_num
, type
;
3184 re_node_set
*cur_nodes
;
3186 re_dfa_t
*const dfa
= mctx
->dfa
;
3188 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3189 struct re_backref_cache_entry
*ent
;
3191 if (cache_idx_start
== -1)
3195 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3198 int to_idx
, next_node
;
3200 /* Is this entry ENT is appropriate? */
3201 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3204 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3205 /* Calculate the destination of the back reference, and append it
3206 to MCTX->STATE_LOG. */
3207 if (to_idx
== cur_str
)
3209 /* The backreference did epsilon transit, we must re-check all the
3210 node in the current state. */
3211 re_node_set new_dests
;
3212 reg_errcode_t err2
, err3
;
3213 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3214 if (re_node_set_contains (cur_nodes
, next_node
))
3216 err
= re_node_set_init_1 (&new_dests
, next_node
);
3217 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3218 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3219 re_node_set_free (&new_dests
);
3220 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3221 || err3
!= REG_NOERROR
, 0))
3223 err
= (err
!= REG_NOERROR
? err
3224 : (err2
!= REG_NOERROR
? err2
: err3
));
3227 /* TODO: It is still inefficient... */
3232 re_node_set union_set
;
3233 next_node
= dfa
->nexts
[ent
->node
];
3234 if (mctx
->state_log
[to_idx
])
3237 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3240 err
= re_node_set_init_copy (&union_set
,
3241 &mctx
->state_log
[to_idx
]->nodes
);
3242 ret
= re_node_set_insert (&union_set
, next_node
);
3243 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3245 re_node_set_free (&union_set
);
3246 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3252 err
= re_node_set_init_1 (&union_set
, next_node
);
3253 if (BE (err
!= REG_NOERROR
, 0))
3256 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3257 re_node_set_free (&union_set
);
3258 if (BE (mctx
->state_log
[to_idx
] == NULL
3259 && err
!= REG_NOERROR
, 0))
3263 while (ent
++->more
);
3267 /* Build transition table for the state.
3268 Return the new table if succeeded, otherwise return NULL. */
3270 static re_dfastate_t
**
3271 build_trtable (dfa
, state
)
3273 re_dfastate_t
*state
;
3277 unsigned int elem
, mask
;
3278 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3279 int ndests
; /* Number of the destination states from `state'. */
3280 re_dfastate_t
**trtable
;
3281 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3282 re_node_set follows
, *dests_node
;
3286 /* We build DFA states which corresponds to the destination nodes
3287 from `state'. `dests_node[i]' represents the nodes which i-th
3288 destination state contains, and `dests_ch[i]' represents the
3289 characters which i-th destination state accepts. */
3291 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3292 dests_node
= (re_node_set
*)
3293 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3297 dests_node
= (re_node_set
*)
3298 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3299 if (BE (dests_node
== NULL
, 0))
3301 dests_node_malloced
= 1;
3303 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3305 /* Initialize transiton table. */
3306 state
->word_trtable
= 0;
3308 /* At first, group all nodes belonging to `state' into several
3310 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3311 if (BE (ndests
<= 0, 0))
3313 if (dests_node_malloced
)
3315 /* Return NULL in case of an error, trtable otherwise. */
3318 state
->trtable
= (re_dfastate_t
**)
3319 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);;
3320 return state
->trtable
;
3325 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3326 if (BE (err
!= REG_NOERROR
, 0))
3330 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3331 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3332 dest_states
= (re_dfastate_t
**)
3333 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3337 dest_states
= (re_dfastate_t
**)
3338 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3339 if (BE (dest_states
== NULL
, 0))
3342 if (dest_states_malloced
)
3344 re_node_set_free (&follows
);
3345 for (i
= 0; i
< ndests
; ++i
)
3346 re_node_set_free (dests_node
+ i
);
3347 if (dests_node_malloced
)
3351 dest_states_malloced
= 1;
3353 dest_states_word
= dest_states
+ ndests
;
3354 dest_states_nl
= dest_states_word
+ ndests
;
3355 bitset_empty (acceptable
);
3357 /* Then build the states for all destinations. */
3358 for (i
= 0; i
< ndests
; ++i
)
3361 re_node_set_empty (&follows
);
3362 /* Merge the follows of this destination states. */
3363 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3365 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3366 if (next_node
!= -1)
3368 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3369 if (BE (err
!= REG_NOERROR
, 0))
3373 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3374 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3376 /* If the new state has context constraint,
3377 build appropriate states for these contexts. */
3378 if (dest_states
[i
]->has_constraint
)
3380 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3382 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3385 if (dest_states
[i
] != dest_states_word
[i
]
3386 && dfa
->mb_cur_max
> 1)
3387 state
->word_trtable
= 1;
3389 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3391 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3396 dest_states_word
[i
] = dest_states
[i
];
3397 dest_states_nl
[i
] = dest_states
[i
];
3399 bitset_merge (acceptable
, dests_ch
[i
]);
3402 if (!BE (state
->word_trtable
, 0))
3404 /* We don't care about whether the following character is a word
3405 character, or we are in a single-byte character set so we can
3406 discern by looking at the character code: allocate a
3407 256-entry transition table. */
3408 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3409 if (BE (trtable
== NULL
, 0))
3412 /* For all characters ch...: */
3413 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3414 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3416 mask
<<= 1, elem
>>= 1, ++ch
)
3417 if (BE (elem
& 1, 0))
3419 /* There must be exactly one destination which accepts
3420 character ch. See group_nodes_into_DFAstates. */
3421 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3424 /* j-th destination accepts the word character ch. */
3425 if (dfa
->word_char
[i
] & mask
)
3426 trtable
[ch
] = dest_states_word
[j
];
3428 trtable
[ch
] = dest_states
[j
];
3433 /* We care about whether the following character is a word
3434 character, and we are in a multi-byte character set: discern
3435 by looking at the character code: build two 256-entry
3436 transition tables, one starting at trtable[0] and one
3437 starting at trtable[SBC_MAX]. */
3438 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*),
3440 if (BE (trtable
== NULL
, 0))
3443 /* For all characters ch...: */
3444 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3445 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3447 mask
<<= 1, elem
>>= 1, ++ch
)
3448 if (BE (elem
& 1, 0))
3450 /* There must be exactly one destination which accepts
3451 character ch. See group_nodes_into_DFAstates. */
3452 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3455 /* j-th destination accepts the word character ch. */
3456 trtable
[ch
] = dest_states
[j
];
3457 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3462 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3464 /* The current state accepts newline character. */
3465 for (j
= 0; j
< ndests
; ++j
)
3466 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3468 /* k-th destination accepts newline character. */
3469 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3470 if (state
->word_trtable
)
3471 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3472 /* There must be only one destination which accepts
3473 newline. See group_nodes_into_DFAstates. */
3478 if (dest_states_malloced
)
3481 re_node_set_free (&follows
);
3482 for (i
= 0; i
< ndests
; ++i
)
3483 re_node_set_free (dests_node
+ i
);
3485 if (dests_node_malloced
)
3488 state
->trtable
= trtable
;
3492 /* Group all nodes belonging to STATE into several destinations.
3493 Then for all destinations, set the nodes belonging to the destination
3494 to DESTS_NODE[i] and set the characters accepted by the destination
3495 to DEST_CH[i]. This function return the number of destinations. */
3498 group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
)
3500 const re_dfastate_t
*state
;
3501 re_node_set
*dests_node
;
3507 int ndests
; /* Number of the destinations from `state'. */
3508 bitset accepts
; /* Characters a node can accept. */
3509 const re_node_set
*cur_nodes
= &state
->nodes
;
3510 bitset_empty (accepts
);
3513 /* For all the nodes belonging to `state', */
3514 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3516 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3517 re_token_type_t type
= node
->type
;
3518 unsigned int constraint
= node
->constraint
;
3520 /* Enumerate all single byte character this node can accept. */
3521 if (type
== CHARACTER
)
3522 bitset_set (accepts
, node
->opr
.c
);
3523 else if (type
== SIMPLE_BRACKET
)
3525 bitset_merge (accepts
, node
->opr
.sbcset
);
3527 else if (type
== OP_PERIOD
)
3529 #ifdef RE_ENABLE_I18N
3530 if (dfa
->mb_cur_max
> 1)
3531 bitset_merge (accepts
, dfa
->sb_char
);
3534 bitset_set_all (accepts
);
3535 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3536 bitset_clear (accepts
, '\n');
3537 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3538 bitset_clear (accepts
, '\0');
3540 #ifdef RE_ENABLE_I18N
3541 else if (type
== OP_UTF8_PERIOD
)
3543 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3544 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3545 bitset_clear (accepts
, '\n');
3546 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3547 bitset_clear (accepts
, '\0');
3553 /* Check the `accepts' and sift the characters which are not
3554 match it the context. */
3557 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3559 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3560 bitset_empty (accepts
);
3561 if (accepts_newline
)
3562 bitset_set (accepts
, NEWLINE_CHAR
);
3566 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3568 bitset_empty (accepts
);
3572 if (constraint
& NEXT_WORD_CONSTRAINT
)
3574 unsigned int any_set
= 0;
3575 if (type
== CHARACTER
&& !node
->word_char
)
3577 bitset_empty (accepts
);
3580 #ifdef RE_ENABLE_I18N
3581 if (dfa
->mb_cur_max
> 1)
3582 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3583 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3586 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3587 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3591 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3593 unsigned int any_set
= 0;
3594 if (type
== CHARACTER
&& node
->word_char
)
3596 bitset_empty (accepts
);
3599 #ifdef RE_ENABLE_I18N
3600 if (dfa
->mb_cur_max
> 1)
3601 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3602 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3605 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3606 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3612 /* Then divide `accepts' into DFA states, or create a new
3613 state. Above, we make sure that accepts is not empty. */
3614 for (j
= 0; j
< ndests
; ++j
)
3616 bitset intersec
; /* Intersection sets, see below. */
3618 /* Flags, see below. */
3619 int has_intersec
, not_subset
, not_consumed
;
3621 /* Optimization, skip if this state doesn't accept the character. */
3622 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3625 /* Enumerate the intersection set of this state and `accepts'. */
3627 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3628 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3629 /* And skip if the intersection set is empty. */
3633 /* Then check if this state is a subset of `accepts'. */
3634 not_subset
= not_consumed
= 0;
3635 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3637 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3638 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3641 /* If this state isn't a subset of `accepts', create a
3642 new group state, which has the `remains'. */
3645 bitset_copy (dests_ch
[ndests
], remains
);
3646 bitset_copy (dests_ch
[j
], intersec
);
3647 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3648 if (BE (err
!= REG_NOERROR
, 0))
3653 /* Put the position in the current group. */
3654 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3655 if (BE (result
< 0, 0))
3658 /* If all characters are consumed, go to next node. */
3662 /* Some characters remain, create a new group. */
3665 bitset_copy (dests_ch
[ndests
], accepts
);
3666 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3667 if (BE (err
!= REG_NOERROR
, 0))
3670 bitset_empty (accepts
);
3675 for (j
= 0; j
< ndests
; ++j
)
3676 re_node_set_free (dests_node
+ j
);
3680 #ifdef RE_ENABLE_I18N
3681 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3682 Return the number of the bytes the node accepts.
3683 STR_IDX is the current index of the input string.
3685 This function handles the nodes which can accept one character, or
3686 one collating element like '.', '[a-z]', opposite to the other nodes
3687 can only accept one byte. */
3690 check_node_accept_bytes (dfa
, node_idx
, input
, str_idx
)
3692 int node_idx
, str_idx
;
3693 const re_string_t
*input
;
3695 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3696 int char_len
, elem_len
;
3699 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3701 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3702 if (BE (c
< 0xc2, 1))
3705 if (str_idx
+ 2 > input
->len
)
3708 d
= re_string_byte_at (input
, str_idx
+ 1);
3710 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3714 if (c
== 0xe0 && d
< 0xa0)
3720 if (c
== 0xf0 && d
< 0x90)
3726 if (c
== 0xf8 && d
< 0x88)
3732 if (c
== 0xfc && d
< 0x84)
3738 if (str_idx
+ char_len
> input
->len
)
3741 for (i
= 1; i
< char_len
; ++i
)
3743 d
= re_string_byte_at (input
, str_idx
+ i
);
3744 if (d
< 0x80 || d
> 0xbf)
3750 char_len
= re_string_char_size_at (input
, str_idx
);
3751 if (node
->type
== OP_PERIOD
)
3755 /* FIXME: I don't think this if is needed, as both '\n'
3756 and '\0' are char_len == 1. */
3757 /* '.' accepts any one character except the following two cases. */
3758 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3759 re_string_byte_at (input
, str_idx
) == '\n') ||
3760 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3761 re_string_byte_at (input
, str_idx
) == '\0'))
3766 elem_len
= re_string_elem_size_at (input
, str_idx
);
3767 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3770 if (node
->type
== COMPLEX_BRACKET
)
3772 const re_charset_t
*cset
= node
->opr
.mbcset
;
3774 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3780 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3781 ? re_string_wchar_at (input
, str_idx
) : 0);
3783 /* match with multibyte character? */
3784 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3785 if (wc
== cset
->mbchars
[i
])
3787 match_len
= char_len
;
3788 goto check_node_accept_bytes_match
;
3790 /* match with character_class? */
3791 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3793 wctype_t wt
= cset
->char_classes
[i
];
3794 if (__iswctype (wc
, wt
))
3796 match_len
= char_len
;
3797 goto check_node_accept_bytes_match
;
3802 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3805 unsigned int in_collseq
= 0;
3806 const int32_t *table
, *indirect
;
3807 const unsigned char *weights
, *extra
;
3808 const char *collseqwc
;
3810 /* This #include defines a local function! */
3811 # include <locale/weight.h>
3813 /* match with collating_symbol? */
3814 if (cset
->ncoll_syms
)
3815 extra
= (const unsigned char *)
3816 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3817 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3819 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3820 /* Compare the length of input collating element and
3821 the length of current collating element. */
3822 if (*coll_sym
!= elem_len
)
3824 /* Compare each bytes. */
3825 for (j
= 0; j
< *coll_sym
; j
++)
3826 if (pin
[j
] != coll_sym
[1 + j
])
3830 /* Match if every bytes is equal. */
3832 goto check_node_accept_bytes_match
;
3838 if (elem_len
<= char_len
)
3840 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3841 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3844 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3846 /* match with range expression? */
3847 for (i
= 0; i
< cset
->nranges
; ++i
)
3848 if (cset
->range_starts
[i
] <= in_collseq
3849 && in_collseq
<= cset
->range_ends
[i
])
3851 match_len
= elem_len
;
3852 goto check_node_accept_bytes_match
;
3855 /* match with equivalence_class? */
3856 if (cset
->nequiv_classes
)
3858 const unsigned char *cp
= pin
;
3859 table
= (const int32_t *)
3860 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3861 weights
= (const unsigned char *)
3862 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3863 extra
= (const unsigned char *)
3864 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3865 indirect
= (const int32_t *)
3866 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3867 idx
= findidx (&cp
);
3869 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3871 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3872 size_t weight_len
= weights
[idx
];
3873 if (weight_len
== weights
[equiv_class_idx
])
3876 while (cnt
<= weight_len
3877 && (weights
[equiv_class_idx
+ 1 + cnt
]
3878 == weights
[idx
+ 1 + cnt
]))
3880 if (cnt
> weight_len
)
3882 match_len
= elem_len
;
3883 goto check_node_accept_bytes_match
;
3892 /* match with range expression? */
3894 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3896 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3899 for (i
= 0; i
< cset
->nranges
; ++i
)
3901 cmp_buf
[0] = cset
->range_starts
[i
];
3902 cmp_buf
[4] = cset
->range_ends
[i
];
3903 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3904 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3906 match_len
= char_len
;
3907 goto check_node_accept_bytes_match
;
3911 check_node_accept_bytes_match
:
3912 if (!cset
->non_match
)
3919 return (elem_len
> char_len
) ? elem_len
: char_len
;
3927 find_collation_sequence_value (mbs
, mbs_len
)
3928 const unsigned char *mbs
;
3931 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3936 /* No valid character. Match it as a single byte character. */
3937 const unsigned char *collseq
= (const unsigned char *)
3938 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3939 return collseq
[mbs
[0]];
3946 const unsigned char *extra
= (const unsigned char *)
3947 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3948 int32_t extrasize
= (const unsigned char *)
3949 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3951 for (idx
= 0; idx
< extrasize
;)
3953 int mbs_cnt
, found
= 0;
3954 int32_t elem_mbs_len
;
3955 /* Skip the name of collating element name. */
3956 idx
= idx
+ extra
[idx
] + 1;
3957 elem_mbs_len
= extra
[idx
++];
3958 if (mbs_len
== elem_mbs_len
)
3960 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3961 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3963 if (mbs_cnt
== elem_mbs_len
)
3964 /* Found the entry. */
3967 /* Skip the byte sequence of the collating element. */
3968 idx
+= elem_mbs_len
;
3969 /* Adjust for the alignment. */
3970 idx
= (idx
+ 3) & ~3;
3971 /* Skip the collation sequence value. */
3972 idx
+= sizeof (uint32_t);
3973 /* Skip the wide char sequence of the collating element. */
3974 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3975 /* If we found the entry, return the sequence value. */
3977 return *(uint32_t *) (extra
+ idx
);
3978 /* Skip the collation sequence value. */
3979 idx
+= sizeof (uint32_t);
3985 #endif /* RE_ENABLE_I18N */
3987 /* Check whether the node accepts the byte which is IDX-th
3988 byte of the INPUT. */
3991 check_node_accept (mctx
, node
, idx
)
3992 const re_match_context_t
*mctx
;
3993 const re_token_t
*node
;
3996 re_dfa_t
*const dfa
= mctx
->dfa
;
3998 if (node
->constraint
)
4000 /* The node has constraints. Check whether the current context
4001 satisfies the constraints. */
4002 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4004 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4007 ch
= re_string_byte_at (&mctx
->input
, idx
);
4011 return node
->opr
.c
== ch
;
4012 case SIMPLE_BRACKET
:
4013 return bitset_contain (node
->opr
.sbcset
, ch
);
4014 #ifdef RE_ENABLE_I18N
4015 case OP_UTF8_PERIOD
:
4021 return !((ch
== '\n' && !(dfa
->syntax
& RE_DOT_NEWLINE
))
4022 || (ch
== '\0' && (dfa
->syntax
& RE_DOT_NOT_NULL
)));
4028 /* Extend the buffers, if the buffers have run out. */
4030 static reg_errcode_t
4031 extend_buffers (mctx
)
4032 re_match_context_t
*mctx
;
4035 re_string_t
*pstr
= &mctx
->input
;
4037 /* Double the lengthes of the buffers. */
4038 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4039 if (BE (ret
!= REG_NOERROR
, 0))
4042 if (mctx
->state_log
!= NULL
)
4044 /* And double the length of state_log. */
4045 /* XXX We have no indication of the size of this buffer. If this
4046 allocation fail we have no indication that the state_log array
4047 does not have the right size. */
4048 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4049 pstr
->bufs_len
+ 1);
4050 if (BE (new_array
== NULL
, 0))
4052 mctx
->state_log
= new_array
;
4055 /* Then reconstruct the buffers. */
4058 #ifdef RE_ENABLE_I18N
4059 if (pstr
->mb_cur_max
> 1)
4061 ret
= build_wcs_upper_buffer (pstr
);
4062 if (BE (ret
!= REG_NOERROR
, 0))
4066 #endif /* RE_ENABLE_I18N */
4067 build_upper_buffer (pstr
);
4071 #ifdef RE_ENABLE_I18N
4072 if (pstr
->mb_cur_max
> 1)
4073 build_wcs_buffer (pstr
);
4075 #endif /* RE_ENABLE_I18N */
4077 if (pstr
->trans
!= NULL
)
4078 re_string_translate_buffer (pstr
);
4085 /* Functions for matching context. */
4087 /* Initialize MCTX. */
4089 static reg_errcode_t
4090 match_ctx_init (mctx
, eflags
, n
)
4091 re_match_context_t
*mctx
;
4094 mctx
->eflags
= eflags
;
4095 mctx
->match_last
= -1;
4098 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4099 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4100 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4103 /* Already zero-ed by the caller.
4105 mctx->bkref_ents = NULL;
4106 mctx->nbkref_ents = 0;
4107 mctx->nsub_tops = 0; */
4108 mctx
->abkref_ents
= n
;
4109 mctx
->max_mb_elem_len
= 1;
4110 mctx
->asub_tops
= n
;
4114 /* Clean the entries which depend on the current input in MCTX.
4115 This function must be invoked when the matcher changes the start index
4116 of the input, or changes the input string. */
4119 match_ctx_clean (mctx
)
4120 re_match_context_t
*mctx
;
4123 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4126 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4127 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4129 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4130 re_free (last
->path
.array
);
4133 re_free (top
->lasts
);
4136 re_free (top
->path
->array
);
4137 re_free (top
->path
);
4142 mctx
->nsub_tops
= 0;
4143 mctx
->nbkref_ents
= 0;
4146 /* Free all the memory associated with MCTX. */
4149 match_ctx_free (mctx
)
4150 re_match_context_t
*mctx
;
4152 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4153 match_ctx_clean (mctx
);
4154 re_free (mctx
->sub_tops
);
4155 re_free (mctx
->bkref_ents
);
4158 /* Add a new backreference entry to MCTX.
4159 Note that we assume that caller never call this function with duplicate
4160 entry, and call with STR_IDX which isn't smaller than any existing entry.
4163 static reg_errcode_t
4164 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4165 re_match_context_t
*mctx
;
4166 int node
, str_idx
, from
, to
;
4168 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4170 struct re_backref_cache_entry
* new_entry
;
4171 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4172 mctx
->abkref_ents
* 2);
4173 if (BE (new_entry
== NULL
, 0))
4175 re_free (mctx
->bkref_ents
);
4178 mctx
->bkref_ents
= new_entry
;
4179 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4180 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4181 mctx
->abkref_ents
*= 2;
4183 if (mctx
->nbkref_ents
> 0
4184 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4185 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4187 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4188 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4189 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4190 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4192 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4193 If bit N is clear, means that this entry won't epsilon-transition to
4194 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4195 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4198 A backreference does not epsilon-transition unless it is empty, so set
4199 to all zeros if FROM != TO. */
4200 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4201 = (from
== to
? ~0 : 0);
4203 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4204 if (mctx
->max_mb_elem_len
< to
- from
)
4205 mctx
->max_mb_elem_len
= to
- from
;
4209 /* Search for the first entry which has the same str_idx, or -1 if none is
4210 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4213 search_cur_bkref_entry (mctx
, str_idx
)
4214 re_match_context_t
*mctx
;
4217 int left
, right
, mid
, last
;
4218 last
= right
= mctx
->nbkref_ents
;
4219 for (left
= 0; left
< right
;)
4221 mid
= (left
+ right
) / 2;
4222 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4227 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4233 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4236 static reg_errcode_t
4237 match_ctx_add_subtop (mctx
, node
, str_idx
)
4238 re_match_context_t
*mctx
;
4242 assert (mctx
->sub_tops
!= NULL
);
4243 assert (mctx
->asub_tops
> 0);
4245 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4247 int new_asub_tops
= mctx
->asub_tops
* 2;
4248 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4249 re_sub_match_top_t
*,
4251 if (BE (new_array
== NULL
, 0))
4253 mctx
->sub_tops
= new_array
;
4254 mctx
->asub_tops
= new_asub_tops
;
4256 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4257 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4259 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4260 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4264 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4265 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4267 static re_sub_match_last_t
*
4268 match_ctx_add_sublast (subtop
, node
, str_idx
)
4269 re_sub_match_top_t
*subtop
;
4272 re_sub_match_last_t
*new_entry
;
4273 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4275 int new_alasts
= 2 * subtop
->alasts
+ 1;
4276 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4277 re_sub_match_last_t
*,
4279 if (BE (new_array
== NULL
, 0))
4281 subtop
->lasts
= new_array
;
4282 subtop
->alasts
= new_alasts
;
4284 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4285 if (BE (new_entry
!= NULL
, 1))
4287 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4288 new_entry
->node
= node
;
4289 new_entry
->str_idx
= str_idx
;
4296 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
)
4297 re_sift_context_t
*sctx
;
4298 re_dfastate_t
**sifted_sts
, **limited_sts
;
4299 int last_node
, last_str_idx
;
4301 sctx
->sifted_states
= sifted_sts
;
4302 sctx
->limited_states
= limited_sts
;
4303 sctx
->last_node
= last_node
;
4304 sctx
->last_str_idx
= last_str_idx
;
4305 re_node_set_init_empty (&sctx
->limits
);