1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static void match_ctx_free_subtops (re_match_context_t
*mctx
)
27 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
28 int str_idx
, int from
, int to
)
30 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
)
32 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
33 int str_idx
) internal_function
;
34 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
35 int node
, int str_idx
)
37 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
38 re_dfastate_t
**limited_sts
, int last_node
,
41 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
42 const char *string
, int length
,
43 int start
, int range
, int stop
,
44 size_t nmatch
, regmatch_t pmatch
[],
45 int eflags
) internal_function
;
46 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
47 const char *string1
, int length1
,
48 const char *string2
, int length2
,
49 int start
, int range
, struct re_registers
*regs
,
50 int stop
, int ret_len
) internal_function
;
51 static int re_search_stub (struct re_pattern_buffer
*bufp
,
52 const char *string
, int length
, int start
,
53 int range
, int stop
, struct re_registers
*regs
,
54 int ret_len
) internal_function
;
55 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
56 int nregs
, int regs_allocated
) internal_function
;
57 static inline re_dfastate_t
*acquire_init_state_context
58 (reg_errcode_t
*err
, const re_match_context_t
*mctx
, int idx
)
59 __attribute ((always_inline
)) internal_function
;
60 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
62 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
65 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
66 unsigned int context
) internal_function
;
67 static int check_halt_state_context (const re_match_context_t
*mctx
,
68 const re_dfastate_t
*state
, int idx
)
70 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
,
71 regmatch_t
*prev_idx_match
, int cur_node
,
72 int cur_idx
, int nmatch
) internal_function
;
73 static int proceed_next_node (const re_match_context_t
*mctx
,
74 int nregs
, regmatch_t
*regs
,
75 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
76 struct re_fail_stack_t
*fs
) internal_function
;
77 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
78 int str_idx
, int *dests
, int nregs
,
80 re_node_set
*eps_via_nodes
) internal_function
;
81 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
82 regmatch_t
*regs
, re_node_set
*eps_via_nodes
) internal_function
;
83 static reg_errcode_t
set_regs (const regex_t
*preg
,
84 const re_match_context_t
*mctx
,
85 size_t nmatch
, regmatch_t
*pmatch
,
86 int fl_backtrack
) internal_function
;
87 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
) internal_function
;
90 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
,
92 int node_idx
, int str_idx
, int max_str_idx
) internal_function
;
93 #endif /* RE_ENABLE_I18N */
94 static reg_errcode_t
sift_states_backward (re_match_context_t
*mctx
,
95 re_sift_context_t
*sctx
) internal_function
;
96 static reg_errcode_t
build_sifted_states (re_match_context_t
*mctx
,
97 re_sift_context_t
*sctx
, int str_idx
,
98 re_node_set
*cur_dest
) internal_function
;
99 static reg_errcode_t
update_cur_sifted_state (re_match_context_t
*mctx
,
100 re_sift_context_t
*sctx
,
102 re_node_set
*dest_nodes
) internal_function
;
103 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
104 re_node_set
*dest_nodes
,
105 const re_node_set
*candidates
) internal_function
;
106 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
107 re_node_set
*dest_nodes
,
108 const re_node_set
*and_nodes
) internal_function
;
109 static int check_dst_limits (re_match_context_t
*mctx
, re_node_set
*limits
,
110 int dst_node
, int dst_idx
, int src_node
,
111 int src_idx
) internal_function
;
112 static int check_dst_limits_calc_pos_1 (re_match_context_t
*mctx
,
113 int boundaries
, int subexp_idx
,
114 int from_node
, int bkref_idx
) internal_function
;
115 static int check_dst_limits_calc_pos (re_match_context_t
*mctx
,
116 int limit
, int subexp_idx
,
117 int node
, int str_idx
,
118 int bkref_idx
) internal_function
;
119 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
120 re_node_set
*dest_nodes
,
121 const re_node_set
*candidates
,
123 struct re_backref_cache_entry
*bkref_ents
,
124 int str_idx
) internal_function
;
125 static reg_errcode_t
sift_states_bkref (re_match_context_t
*mctx
,
126 re_sift_context_t
*sctx
,
127 int str_idx
, const re_node_set
*candidates
) internal_function
;
128 static reg_errcode_t
clean_state_log_if_needed (re_match_context_t
*mctx
,
129 int next_state_log_idx
) internal_function
;
130 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
131 re_dfastate_t
**src
, int num
) internal_function
;
132 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
133 re_match_context_t
*mctx
) internal_function
;
134 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
135 re_match_context_t
*mctx
,
136 re_dfastate_t
*state
) internal_function
;
137 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
138 re_match_context_t
*mctx
,
139 re_dfastate_t
*next_state
) internal_function
;
140 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
141 re_node_set
*cur_nodes
,
142 int str_idx
) internal_function
;
144 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
145 re_match_context_t
*mctx
,
146 re_dfastate_t
*pstate
) internal_function
;
148 #ifdef RE_ENABLE_I18N
149 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
150 re_dfastate_t
*pstate
) internal_function
;
151 #endif /* RE_ENABLE_I18N */
152 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
153 const re_node_set
*nodes
) internal_function
;
154 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
155 int bkref_node
, int bkref_str_idx
) internal_function
;
156 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
157 const re_sub_match_top_t
*sub_top
,
158 re_sub_match_last_t
*sub_last
,
159 int bkref_node
, int bkref_str
) internal_function
;
160 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
161 int subexp_idx
, int type
) internal_function
;
162 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
163 state_array_t
*path
, int top_node
,
164 int top_str
, int last_node
, int last_str
,
165 int type
) internal_function
;
166 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
168 re_node_set
*cur_nodes
,
169 re_node_set
*next_nodes
) internal_function
;
170 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
171 re_node_set
*cur_nodes
,
172 int ex_subexp
, int type
) internal_function
;
173 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
174 re_node_set
*dst_nodes
,
175 int target
, int ex_subexp
,
176 int type
) internal_function
;
177 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
178 re_node_set
*cur_nodes
, int cur_str
,
179 int subexp_num
, int type
) internal_function
;
180 static re_dfastate_t
**build_trtable (re_dfa_t
*dfa
,
181 re_dfastate_t
*state
) internal_function
;
182 #ifdef RE_ENABLE_I18N
183 static int check_node_accept_bytes (re_dfa_t
*dfa
, int node_idx
,
184 const re_string_t
*input
, int idx
) internal_function
;
186 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
187 size_t name_len
) internal_function
;
189 #endif /* RE_ENABLE_I18N */
190 static int group_nodes_into_DFAstates (re_dfa_t
*dfa
,
191 const re_dfastate_t
*state
,
192 re_node_set
*states_node
,
193 bitset
*states_ch
) internal_function
;
194 static int check_node_accept (const re_match_context_t
*mctx
,
195 const re_token_t
*node
, int idx
) internal_function
;
196 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
) internal_function
;
198 /* Entry point for POSIX code. */
200 /* regexec searches for a given pattern, specified by PREG, in the
203 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
204 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
205 least NMATCH elements, and we set them to the offsets of the
206 corresponding matched substrings.
208 EFLAGS specifies `execution flags' which affect matching: if
209 REG_NOTBOL is set, then ^ does not match at the beginning of the
210 string; if REG_NOTEOL is set, then $ does not match at the end.
212 We return 0 if we find a match and REG_NOMATCH if not. */
215 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
216 const regex_t
*__restrict preg
;
217 const char *__restrict string
;
225 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
228 if (eflags
& REG_STARTEND
)
230 start
= pmatch
[0].rm_so
;
231 length
= pmatch
[0].rm_eo
;
236 length
= strlen (string
);
239 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
240 length
, 0, NULL
, eflags
);
242 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
243 length
, nmatch
, pmatch
, eflags
);
244 return err
!= REG_NOERROR
;
248 # include <shlib-compat.h>
249 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
251 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
252 __typeof__ (__regexec
) __compat_regexec
;
255 attribute_compat_text_section
256 __compat_regexec (const regex_t
*__restrict preg
,
257 const char *__restrict string
, size_t nmatch
,
258 regmatch_t pmatch
[], int eflags
)
260 return regexec (preg
, string
, nmatch
, pmatch
,
261 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
263 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
267 /* Entry points for GNU code. */
269 /* re_match, re_search, re_match_2, re_search_2
271 The former two functions operate on STRING with length LENGTH,
272 while the later two operate on concatenation of STRING1 and STRING2
273 with lengths LENGTH1 and LENGTH2, respectively.
275 re_match() matches the compiled pattern in BUFP against the string,
276 starting at index START.
278 re_search() first tries matching at index START, then it tries to match
279 starting from index START + 1, and so on. The last start position tried
280 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
283 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
284 the first STOP characters of the concatenation of the strings should be
287 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
288 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
289 computed relative to the concatenation, not relative to the individual
292 On success, re_match* functions return the length of the match, re_search*
293 return the position of the start of the match. Return value -1 means no
294 match was found and -2 indicates an internal error. */
297 re_match (bufp
, string
, length
, start
, regs
)
298 struct re_pattern_buffer
*bufp
;
301 struct re_registers
*regs
;
303 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
306 weak_alias (__re_match
, re_match
)
310 re_search (bufp
, string
, length
, start
, range
, regs
)
311 struct re_pattern_buffer
*bufp
;
313 int length
, start
, range
;
314 struct re_registers
*regs
;
316 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
319 weak_alias (__re_search
, re_search
)
323 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
324 struct re_pattern_buffer
*bufp
;
325 const char *string1
, *string2
;
326 int length1
, length2
, start
, stop
;
327 struct re_registers
*regs
;
329 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
330 start
, 0, regs
, stop
, 1);
333 weak_alias (__re_match_2
, re_match_2
)
337 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
338 struct re_pattern_buffer
*bufp
;
339 const char *string1
, *string2
;
340 int length1
, length2
, start
, range
, stop
;
341 struct re_registers
*regs
;
343 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
344 start
, range
, regs
, stop
, 0);
347 weak_alias (__re_search_2
, re_search_2
)
351 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
353 struct re_pattern_buffer
*bufp
;
354 const char *string1
, *string2
;
355 int length1
, length2
, start
, range
, stop
, ret_len
;
356 struct re_registers
*regs
;
360 int len
= length1
+ length2
;
363 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
366 /* Concatenate the strings. */
370 char *s
= re_malloc (char, len
);
372 if (BE (s
== NULL
, 0))
374 memcpy (s
, string1
, length1
);
375 memcpy (s
+ length1
, string2
, length2
);
384 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
387 re_free ((char *) str
);
391 /* The parameters have the same meaning as those of re_search.
392 Additional parameters:
393 If RET_LEN is nonzero the length of the match is returned (re_match style);
394 otherwise the position of the match is returned. */
397 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
398 struct re_pattern_buffer
*bufp
;
400 int length
, start
, range
, stop
, ret_len
;
401 struct re_registers
*regs
;
403 reg_errcode_t result
;
408 /* Check for out-of-range. */
409 if (BE (start
< 0 || start
> length
, 0))
411 if (BE (start
+ range
> length
, 0))
412 range
= length
- start
;
413 else if (BE (start
+ range
< 0, 0))
416 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
417 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
419 /* Compile fastmap if we haven't yet. */
420 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
421 re_compile_fastmap (bufp
);
423 if (BE (bufp
->no_sub
, 0))
426 /* We need at least 1 register. */
429 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
430 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
432 nregs
= regs
->num_regs
;
433 if (BE (nregs
< 1, 0))
435 /* Nothing can be copied to regs. */
441 nregs
= bufp
->re_nsub
+ 1;
442 pmatch
= re_malloc (regmatch_t
, nregs
);
443 if (BE (pmatch
== NULL
, 0))
446 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
447 nregs
, pmatch
, eflags
);
451 /* I hope we needn't fill ther regs with -1's when no match was found. */
452 if (result
!= REG_NOERROR
)
454 else if (regs
!= NULL
)
456 /* If caller wants register contents data back, copy them. */
457 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
458 bufp
->regs_allocated
);
459 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
463 if (BE (rval
== 0, 1))
467 assert (pmatch
[0].rm_so
== start
);
468 rval
= pmatch
[0].rm_eo
- start
;
471 rval
= pmatch
[0].rm_so
;
478 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
479 struct re_registers
*regs
;
481 int nregs
, regs_allocated
;
483 int rval
= REGS_REALLOCATE
;
485 int need_regs
= nregs
+ 1;
486 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
489 /* Have the register data arrays been allocated? */
490 if (regs_allocated
== REGS_UNALLOCATED
)
491 { /* No. So allocate them with malloc. */
492 regs
->start
= re_malloc (regoff_t
, need_regs
);
493 regs
->end
= re_malloc (regoff_t
, need_regs
);
494 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
495 return REGS_UNALLOCATED
;
496 regs
->num_regs
= need_regs
;
498 else if (regs_allocated
== REGS_REALLOCATE
)
499 { /* Yes. If we need more elements than were already
500 allocated, reallocate them. If we need fewer, just
502 if (BE (need_regs
> regs
->num_regs
, 0))
504 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
505 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
506 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
507 return REGS_UNALLOCATED
;
508 regs
->start
= new_start
;
510 regs
->num_regs
= need_regs
;
515 assert (regs_allocated
== REGS_FIXED
);
516 /* This function may not be called with REGS_FIXED and nregs too big. */
517 assert (regs
->num_regs
>= nregs
);
522 for (i
= 0; i
< nregs
; ++i
)
524 regs
->start
[i
] = pmatch
[i
].rm_so
;
525 regs
->end
[i
] = pmatch
[i
].rm_eo
;
527 for ( ; i
< regs
->num_regs
; ++i
)
528 regs
->start
[i
] = regs
->end
[i
] = -1;
533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
534 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
535 this memory for recording register information. STARTS and ENDS
536 must be allocated using the malloc library routine, and must each
537 be at least NUM_REGS * sizeof (regoff_t) bytes long.
539 If NUM_REGS == 0, then subsequent matches should allocate their own
542 Unless this function is called, the first search or match using
543 PATTERN_BUFFER will allocate its own register data, without
544 freeing the old data. */
547 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
548 struct re_pattern_buffer
*bufp
;
549 struct re_registers
*regs
;
551 regoff_t
*starts
, *ends
;
555 bufp
->regs_allocated
= REGS_REALLOCATE
;
556 regs
->num_regs
= num_regs
;
557 regs
->start
= starts
;
562 bufp
->regs_allocated
= REGS_UNALLOCATED
;
564 regs
->start
= regs
->end
= (regoff_t
*) 0;
568 weak_alias (__re_set_registers
, re_set_registers
)
571 /* Entry points compatible with 4.2 BSD regex library. We don't define
572 them unless specifically requested. */
574 #if defined _REGEX_RE_COMP || defined _LIBC
582 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
584 #endif /* _REGEX_RE_COMP */
586 /* Internal entry point. */
588 /* Searches for a compiled pattern PREG in the string STRING, whose
589 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
590 mingings with regexec. START, and RANGE have the same meanings
592 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
593 otherwise return the error code.
594 Note: We assume front end functions already check ranges.
595 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
598 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
602 int length
, start
, range
, stop
, eflags
;
607 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
608 int left_lim
, right_lim
, incr
;
609 int fl_longest_match
, match_first
, match_last
= -1;
610 int fast_translate
, sb
;
611 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
612 re_match_context_t mctx
= { .dfa
= dfa
};
614 re_match_context_t mctx
;
616 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
617 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
620 memset (&mctx
, '\0', sizeof (re_match_context_t
));
624 /* Check if the DFA haven't been compiled. */
625 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
626 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
627 || dfa
->init_state_begbuf
== NULL
, 0))
631 /* We assume front-end functions already check them. */
632 assert (start
+ range
>= 0 && start
+ range
<= length
);
635 /* If initial states with non-begbuf contexts have no elements,
636 the regex must be anchored. If preg->newline_anchor is set,
637 we'll never use init_state_nl, so do not check it. */
638 if (dfa
->init_state
->nodes
.nelem
== 0
639 && dfa
->init_state_word
->nodes
.nelem
== 0
640 && (dfa
->init_state_nl
->nodes
.nelem
== 0
641 || !preg
->newline_anchor
))
643 if (start
!= 0 && start
+ range
!= 0)
648 /* We must check the longest matching, if nmatch > 0. */
649 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
651 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
652 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
653 if (BE (err
!= REG_NOERROR
, 0))
655 mctx
.input
.stop
= stop
;
656 mctx
.input
.raw_stop
= stop
;
657 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
659 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
660 if (BE (err
!= REG_NOERROR
, 0))
663 /* We will log all the DFA states through which the dfa pass,
664 if nmatch > 1, or this dfa has "multibyte node", which is a
665 back-reference or a node which can accept multibyte character or
666 multi character collating element. */
667 if (nmatch
> 1 || dfa
->has_mb_node
)
669 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
670 if (BE (mctx
.state_log
== NULL
, 0))
677 mctx
.state_log
= NULL
;
680 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
681 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
683 /* Check incrementally whether of not the input string match. */
684 incr
= (range
< 0) ? -1 : 1;
685 left_lim
= (range
< 0) ? start
+ range
: start
;
686 right_lim
= (range
< 0) ? start
: start
+ range
;
687 sb
= dfa
->mb_cur_max
== 1;
688 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
692 /* At first get the current byte from input string. */
695 if (BE (fast_translate
, 1))
697 unsigned RE_TRANSLATE_TYPE t
698 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
699 if (BE (range
>= 0, 1))
701 if (BE (t
!= NULL
, 0))
703 while (BE (match_first
< right_lim
, 1)
704 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
709 while (BE (match_first
< right_lim
, 1)
710 && !fastmap
[(unsigned char) string
[match_first
]])
713 if (BE (match_first
== right_lim
, 0))
715 int ch
= match_first
>= length
716 ? 0 : (unsigned char) string
[match_first
];
717 if (!fastmap
[t
? t
[ch
] : ch
])
723 while (match_first
>= left_lim
)
725 int ch
= match_first
>= length
726 ? 0 : (unsigned char) string
[match_first
];
727 if (fastmap
[t
? t
[ch
] : ch
])
731 if (match_first
< left_lim
)
741 /* In this case, we can't determine easily the current byte,
742 since it might be a component byte of a multibyte
743 character. Then we use the constructed buffer
745 /* If MATCH_FIRST is out of the valid range, reconstruct the
747 if (mctx
.input
.raw_mbs_idx
+ mctx
.input
.valid_raw_len
749 || match_first
< mctx
.input
.raw_mbs_idx
)
751 err
= re_string_reconstruct (&mctx
.input
, match_first
,
753 if (BE (err
!= REG_NOERROR
, 0))
756 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
757 Note that MATCH_FIRST must not be smaller than 0. */
758 ch
= ((match_first
>= length
) ? 0
759 : re_string_byte_at (&mctx
.input
,
761 - mctx
.input
.raw_mbs_idx
));
766 while (match_first
>= left_lim
&& match_first
<= right_lim
);
772 /* Reconstruct the buffers so that the matcher can assume that
773 the matching starts from the beginning of the buffer. */
774 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
775 if (BE (err
!= REG_NOERROR
, 0))
777 #ifdef RE_ENABLE_I18N
778 /* Eliminate it when it is a component of a multibyte character
779 and isn't the head of a multibyte character. */
780 if (sb
|| re_string_first_byte (&mctx
.input
, 0))
783 /* It seems to be appropriate one, then use the matcher. */
784 /* We assume that the matching starts from 0. */
785 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
786 match_last
= check_matching (&mctx
, fl_longest_match
,
787 range
>= 0 ? &match_first
: NULL
);
788 if (match_last
!= -1)
790 if (BE (match_last
== -2, 0))
797 mctx
.match_last
= match_last
;
798 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
800 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
801 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
804 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
807 err
= prune_impossible_nodes (&mctx
);
808 if (err
== REG_NOERROR
)
810 if (BE (err
!= REG_NOMATCH
, 0))
815 break; /* We found a match. */
818 match_ctx_clean (&mctx
);
820 /* Update counter. */
822 if (match_first
< left_lim
|| right_lim
< match_first
)
826 /* Set pmatch[] if we need. */
827 if (match_last
!= -1 && nmatch
> 0)
831 /* Initialize registers. */
832 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
833 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
835 /* Set the points where matching start/end. */
837 pmatch
[0].rm_eo
= mctx
.match_last
;
839 if (!preg
->no_sub
&& nmatch
> 1)
841 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
842 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
843 if (BE (err
!= REG_NOERROR
, 0))
847 /* At last, add the offset to the each registers, since we slided
848 the buffers so that we could assume that the matching starts
850 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
851 if (pmatch
[reg_idx
].rm_so
!= -1)
853 #ifdef RE_ENABLE_I18N
854 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
856 if (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
)
857 pmatch
[reg_idx
].rm_so
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
859 pmatch
[reg_idx
].rm_so
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
];
860 if (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
)
861 pmatch
[reg_idx
].rm_eo
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
863 pmatch
[reg_idx
].rm_eo
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
];
866 assert (mctx
.input
.offsets_needed
== 0);
868 pmatch
[reg_idx
].rm_so
+= match_first
;
869 pmatch
[reg_idx
].rm_eo
+= match_first
;
872 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
874 re_free (mctx
.state_log
);
876 match_ctx_free (&mctx
);
877 re_string_destruct (&mctx
.input
);
882 prune_impossible_nodes (mctx
)
883 re_match_context_t
*mctx
;
885 re_dfa_t
*const dfa
= mctx
->dfa
;
886 int halt_node
, match_last
;
888 re_dfastate_t
**sifted_states
;
889 re_dfastate_t
**lim_states
= NULL
;
890 re_sift_context_t sctx
;
892 assert (mctx
->state_log
!= NULL
);
894 match_last
= mctx
->match_last
;
895 halt_node
= mctx
->last_node
;
896 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
897 if (BE (sifted_states
== NULL
, 0))
904 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
905 if (BE (lim_states
== NULL
, 0))
912 memset (lim_states
, '\0',
913 sizeof (re_dfastate_t
*) * (match_last
+ 1));
914 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
916 ret
= sift_states_backward (mctx
, &sctx
);
917 re_node_set_free (&sctx
.limits
);
918 if (BE (ret
!= REG_NOERROR
, 0))
920 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
930 } while (mctx
->state_log
[match_last
] == NULL
931 || !mctx
->state_log
[match_last
]->halt
);
932 halt_node
= check_halt_state_context (mctx
,
933 mctx
->state_log
[match_last
],
936 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
938 re_free (lim_states
);
940 if (BE (ret
!= REG_NOERROR
, 0))
945 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
946 ret
= sift_states_backward (mctx
, &sctx
);
947 re_node_set_free (&sctx
.limits
);
948 if (BE (ret
!= REG_NOERROR
, 0))
951 re_free (mctx
->state_log
);
952 mctx
->state_log
= sifted_states
;
953 sifted_states
= NULL
;
954 mctx
->last_node
= halt_node
;
955 mctx
->match_last
= match_last
;
958 re_free (sifted_states
);
959 re_free (lim_states
);
963 /* Acquire an initial state and return it.
964 We must select appropriate initial state depending on the context,
965 since initial states may have constraints like "\<", "^", etc.. */
967 static inline re_dfastate_t
*
968 acquire_init_state_context (err
, mctx
, idx
)
970 const re_match_context_t
*mctx
;
973 re_dfa_t
*const dfa
= mctx
->dfa
;
974 if (dfa
->init_state
->has_constraint
)
976 unsigned int context
;
977 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
978 if (IS_WORD_CONTEXT (context
))
979 return dfa
->init_state_word
;
980 else if (IS_ORDINARY_CONTEXT (context
))
981 return dfa
->init_state
;
982 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
983 return dfa
->init_state_begbuf
;
984 else if (IS_NEWLINE_CONTEXT (context
))
985 return dfa
->init_state_nl
;
986 else if (IS_BEGBUF_CONTEXT (context
))
988 /* It is relatively rare case, then calculate on demand. */
989 return re_acquire_state_context (err
, dfa
,
990 dfa
->init_state
->entrance_nodes
,
994 /* Must not happen? */
995 return dfa
->init_state
;
998 return dfa
->init_state
;
1001 /* Check whether the regular expression match input string INPUT or not,
1002 and return the index where the matching end, return -1 if not match,
1003 or return -2 in case of an error.
1004 FL_LONGEST_MATCH means we want the POSIX longest matching.
1005 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1006 next place where we may want to try matching.
1007 Note that the matcher assume that the maching starts from the current
1008 index of the buffer. */
1011 check_matching (mctx
, fl_longest_match
, p_match_first
)
1012 re_match_context_t
*mctx
;
1013 int fl_longest_match
;
1016 re_dfa_t
*const dfa
= mctx
->dfa
;
1019 int match_last
= -1;
1020 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1021 re_dfastate_t
*cur_state
;
1022 int at_init_state
= p_match_first
!= NULL
;
1023 int next_start_idx
= cur_str_idx
;
1026 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1027 /* An initial state must not be NULL (invalid). */
1028 if (BE (cur_state
== NULL
, 0))
1030 assert (err
== REG_ESPACE
);
1034 if (mctx
->state_log
!= NULL
)
1036 mctx
->state_log
[cur_str_idx
] = cur_state
;
1038 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1039 later. E.g. Processing back references. */
1040 if (BE (dfa
->nbackref
, 0))
1043 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1044 if (BE (err
!= REG_NOERROR
, 0))
1047 if (cur_state
->has_backref
)
1049 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1050 if (BE (err
!= REG_NOERROR
, 0))
1056 /* If the RE accepts NULL string. */
1057 if (BE (cur_state
->halt
, 0))
1059 if (!cur_state
->has_constraint
1060 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1062 if (!fl_longest_match
)
1066 match_last
= cur_str_idx
;
1072 while (!re_string_eoi (&mctx
->input
))
1074 re_dfastate_t
*old_state
= cur_state
;
1075 cur_state
= transit_state (&err
, mctx
, cur_state
);
1076 if (mctx
->state_log
!= NULL
)
1077 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1079 if (cur_state
== NULL
)
1081 /* Reached the invalid state or an error. Try to recover a valid
1082 state using the state log, if available and if we have not
1083 already found a valid (even if not the longest) match. */
1084 if (BE (err
!= REG_NOERROR
, 0))
1087 if (mctx
->state_log
== NULL
1088 || (match
&& !fl_longest_match
)
1089 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1095 if (old_state
== cur_state
)
1096 next_start_idx
= re_string_cur_idx (&mctx
->input
);
1101 if (cur_state
->halt
)
1103 /* Reached a halt state.
1104 Check the halt state can satisfy the current context. */
1105 if (!cur_state
->has_constraint
1106 || check_halt_state_context (mctx
, cur_state
,
1107 re_string_cur_idx (&mctx
->input
)))
1109 /* We found an appropriate halt state. */
1110 match_last
= re_string_cur_idx (&mctx
->input
);
1112 if (!fl_longest_match
)
1118 if (match_last
== -1 && p_match_first
)
1119 *p_match_first
+= next_start_idx
;
1124 /* Check NODE match the current context. */
1126 static int check_halt_node_context (dfa
, node
, context
)
1127 const re_dfa_t
*dfa
;
1129 unsigned int context
;
1131 re_token_type_t type
= dfa
->nodes
[node
].type
;
1132 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1133 if (type
!= END_OF_RE
)
1137 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1142 /* Check the halt state STATE match the current context.
1143 Return 0 if not match, if the node, STATE has, is a halt node and
1144 match the context, return the node. */
1147 check_halt_state_context (mctx
, state
, idx
)
1148 const re_match_context_t
*mctx
;
1149 const re_dfastate_t
*state
;
1153 unsigned int context
;
1155 assert (state
->halt
);
1157 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1158 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1159 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1160 return state
->nodes
.elems
[i
];
1164 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1165 corresponding to the DFA).
1166 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1170 proceed_next_node (mctx
, nregs
, regs
, pidx
, node
, eps_via_nodes
, fs
)
1171 const re_match_context_t
*mctx
;
1173 int nregs
, *pidx
, node
;
1174 re_node_set
*eps_via_nodes
;
1175 struct re_fail_stack_t
*fs
;
1177 re_dfa_t
*const dfa
= mctx
->dfa
;
1178 int i
, err
, dest_node
;
1180 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1182 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1183 int ndest
, dest_nodes
[2];
1184 err
= re_node_set_insert (eps_via_nodes
, node
);
1185 if (BE (err
< 0, 0))
1187 /* Pick up valid destinations. */
1188 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1190 int candidate
= dfa
->edests
[node
].elems
[i
];
1191 if (!re_node_set_contains (cur_nodes
, candidate
))
1193 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1194 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1198 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1199 /* In order to avoid infinite loop like "(a*)*". */
1200 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1201 return dest_nodes
[1];
1203 && push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
,
1206 return dest_nodes
[0];
1211 re_token_type_t type
= dfa
->nodes
[node
].type
;
1213 #ifdef RE_ENABLE_I18N
1214 if (ACCEPT_MB_NODE (type
))
1215 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1217 #endif /* RE_ENABLE_I18N */
1218 if (type
== OP_BACK_REF
)
1220 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1221 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1224 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1228 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1229 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1237 err
= re_node_set_insert (eps_via_nodes
, node
);
1238 if (BE (err
< 0, 0))
1240 dest_node
= dfa
->edests
[node
].elems
[0];
1241 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1248 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1250 dest_node
= dfa
->nexts
[node
];
1251 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1252 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1253 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1256 re_node_set_empty (eps_via_nodes
);
1263 static reg_errcode_t
1264 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1265 struct re_fail_stack_t
*fs
;
1266 int str_idx
, *dests
, nregs
;
1268 re_node_set
*eps_via_nodes
;
1271 int num
= fs
->num
++;
1272 if (fs
->num
== fs
->alloc
)
1274 struct re_fail_stack_ent_t
*new_array
;
1275 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1277 if (new_array
== NULL
)
1280 fs
->stack
= new_array
;
1282 fs
->stack
[num
].idx
= str_idx
;
1283 fs
->stack
[num
].node
= dests
[1];
1284 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1285 if (fs
->stack
[num
].regs
== NULL
)
1287 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1288 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1293 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1294 struct re_fail_stack_t
*fs
;
1297 re_node_set
*eps_via_nodes
;
1299 int num
= --fs
->num
;
1301 *pidx
= fs
->stack
[num
].idx
;
1302 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1303 re_node_set_free (eps_via_nodes
);
1304 re_free (fs
->stack
[num
].regs
);
1305 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1306 return fs
->stack
[num
].node
;
1309 /* Set the positions where the subexpressions are starts/ends to registers
1311 Note: We assume that pmatch[0] is already set, and
1312 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1314 static reg_errcode_t
1315 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1316 const regex_t
*preg
;
1317 const re_match_context_t
*mctx
;
1322 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1323 int idx
, cur_node
, real_nmatch
;
1324 re_node_set eps_via_nodes
;
1325 struct re_fail_stack_t
*fs
;
1326 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1327 regmatch_t
*prev_idx_match
;
1330 assert (nmatch
> 1);
1331 assert (mctx
->state_log
!= NULL
);
1336 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1337 if (fs
->stack
== NULL
)
1343 cur_node
= dfa
->init_node
;
1344 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1345 re_node_set_init_empty (&eps_via_nodes
);
1347 prev_idx_match
= (regmatch_t
*) alloca (sizeof (regmatch_t
) * real_nmatch
);
1348 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * real_nmatch
);
1350 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1352 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, real_nmatch
);
1354 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1359 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1360 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1362 if (reg_idx
== nmatch
)
1364 re_node_set_free (&eps_via_nodes
);
1365 return free_fail_stack_return (fs
);
1367 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1372 re_node_set_free (&eps_via_nodes
);
1377 /* Proceed to next node. */
1378 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1379 &eps_via_nodes
, fs
);
1381 if (BE (cur_node
< 0, 0))
1383 if (BE (cur_node
== -2, 0))
1385 re_node_set_free (&eps_via_nodes
);
1386 free_fail_stack_return (fs
);
1390 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1394 re_node_set_free (&eps_via_nodes
);
1399 re_node_set_free (&eps_via_nodes
);
1400 return free_fail_stack_return (fs
);
1403 static reg_errcode_t
1404 free_fail_stack_return (fs
)
1405 struct re_fail_stack_t
*fs
;
1410 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1412 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1413 re_free (fs
->stack
[fs_idx
].regs
);
1415 re_free (fs
->stack
);
1421 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, cur_idx
, nmatch
)
1423 regmatch_t
*pmatch
, *prev_idx_match
;
1424 int cur_node
, cur_idx
, nmatch
;
1426 int type
= dfa
->nodes
[cur_node
].type
;
1427 if (type
== OP_OPEN_SUBEXP
)
1429 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1431 /* We are at the first node of this sub expression. */
1432 if (reg_num
< nmatch
)
1434 pmatch
[reg_num
].rm_so
= cur_idx
;
1435 pmatch
[reg_num
].rm_eo
= -1;
1438 else if (type
== OP_CLOSE_SUBEXP
)
1440 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1441 if (reg_num
< nmatch
)
1443 /* We are at the last node of this sub expression. */
1444 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1446 pmatch
[reg_num
].rm_eo
= cur_idx
;
1447 /* This is a non-empty match or we are not inside an optional
1448 subexpression. Accept this right away. */
1449 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1453 if (dfa
->nodes
[cur_node
].opt_subexp
1454 && prev_idx_match
[reg_num
].rm_so
!= -1)
1455 /* We transited through an empty match for an optional
1456 subexpression, like (a?)*, and this is not the subexp's
1457 first match. Copy back the old content of the registers
1458 so that matches of an inner subexpression are undone as
1459 well, like in ((a?))*. */
1460 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1462 /* We completed a subexpression, but it may be part of
1463 an optional one, so do not update PREV_IDX_MATCH. */
1464 pmatch
[reg_num
].rm_eo
= cur_idx
;
1470 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1471 and sift the nodes in each states according to the following rules.
1472 Updated state_log will be wrote to STATE_LOG.
1474 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1475 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1476 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1477 the LAST_NODE, we throw away the node `a'.
1478 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1479 string `s' and transit to `b':
1480 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1482 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1483 thrown away, we throw away the node `a'.
1484 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1485 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1487 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1488 we throw away the node `a'. */
1490 #define STATE_NODE_CONTAINS(state,node) \
1491 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1493 static reg_errcode_t
1494 sift_states_backward (mctx
, sctx
)
1495 re_match_context_t
*mctx
;
1496 re_sift_context_t
*sctx
;
1500 int str_idx
= sctx
->last_str_idx
;
1501 re_node_set cur_dest
;
1504 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1507 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1508 transit to the last_node and the last_node itself. */
1509 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1510 if (BE (err
!= REG_NOERROR
, 0))
1512 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1513 if (BE (err
!= REG_NOERROR
, 0))
1516 /* Then check each states in the state_log. */
1519 /* Update counters. */
1520 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1521 if (null_cnt
> mctx
->max_mb_elem_len
)
1523 memset (sctx
->sifted_states
, '\0',
1524 sizeof (re_dfastate_t
*) * str_idx
);
1525 re_node_set_free (&cur_dest
);
1528 re_node_set_empty (&cur_dest
);
1531 if (mctx
->state_log
[str_idx
])
1533 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1534 if (BE (err
!= REG_NOERROR
, 0))
1538 /* Add all the nodes which satisfy the following conditions:
1539 - It can epsilon transit to a node in CUR_DEST.
1541 And update state_log. */
1542 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1543 if (BE (err
!= REG_NOERROR
, 0))
1548 re_node_set_free (&cur_dest
);
1552 static reg_errcode_t
1553 build_sifted_states (mctx
, sctx
, str_idx
, cur_dest
)
1554 re_match_context_t
*mctx
;
1555 re_sift_context_t
*sctx
;
1557 re_node_set
*cur_dest
;
1559 re_dfa_t
*const dfa
= mctx
->dfa
;
1560 re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1563 /* Then build the next sifted state.
1564 We build the next sifted state on `cur_dest', and update
1565 `sifted_states[str_idx]' with `cur_dest'.
1567 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1568 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1569 for (i
= 0; i
< cur_src
->nelem
; i
++)
1571 int prev_node
= cur_src
->elems
[i
];
1573 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1576 if (IS_EPSILON_NODE (type
))
1578 #ifdef RE_ENABLE_I18N
1579 /* If the node may accept `multi byte'. */
1580 if (ACCEPT_MB_NODE (type
))
1581 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1582 str_idx
, sctx
->last_str_idx
);
1583 #endif /* RE_ENABLE_I18N */
1585 /* We don't check backreferences here.
1586 See update_cur_sifted_state(). */
1588 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1589 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1590 dfa
->nexts
[prev_node
]))
1596 if (sctx
->limits
.nelem
)
1598 int to_idx
= str_idx
+ naccepted
;
1599 if (check_dst_limits (mctx
, &sctx
->limits
,
1600 dfa
->nexts
[prev_node
], to_idx
,
1601 prev_node
, str_idx
))
1604 ret
= re_node_set_insert (cur_dest
, prev_node
);
1605 if (BE (ret
== -1, 0))
1612 /* Helper functions. */
1614 static reg_errcode_t
1615 clean_state_log_if_needed (mctx
, next_state_log_idx
)
1616 re_match_context_t
*mctx
;
1617 int next_state_log_idx
;
1619 int top
= mctx
->state_log_top
;
1621 if (next_state_log_idx
>= mctx
->input
.bufs_len
1622 || (next_state_log_idx
>= mctx
->input
.valid_len
1623 && mctx
->input
.valid_len
< mctx
->input
.len
))
1626 err
= extend_buffers (mctx
);
1627 if (BE (err
!= REG_NOERROR
, 0))
1631 if (top
< next_state_log_idx
)
1633 memset (mctx
->state_log
+ top
+ 1, '\0',
1634 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1635 mctx
->state_log_top
= next_state_log_idx
;
1640 static reg_errcode_t
1641 merge_state_array (dfa
, dst
, src
, num
)
1643 re_dfastate_t
**dst
;
1644 re_dfastate_t
**src
;
1649 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1651 if (dst
[st_idx
] == NULL
)
1652 dst
[st_idx
] = src
[st_idx
];
1653 else if (src
[st_idx
] != NULL
)
1655 re_node_set merged_set
;
1656 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1657 &src
[st_idx
]->nodes
);
1658 if (BE (err
!= REG_NOERROR
, 0))
1660 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1661 re_node_set_free (&merged_set
);
1662 if (BE (err
!= REG_NOERROR
, 0))
1669 static reg_errcode_t
1670 update_cur_sifted_state (mctx
, sctx
, str_idx
, dest_nodes
)
1671 re_match_context_t
*mctx
;
1672 re_sift_context_t
*sctx
;
1674 re_node_set
*dest_nodes
;
1676 re_dfa_t
*const dfa
= mctx
->dfa
;
1678 const re_node_set
*candidates
;
1679 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1680 : &mctx
->state_log
[str_idx
]->nodes
);
1682 if (dest_nodes
->nelem
== 0)
1683 sctx
->sifted_states
[str_idx
] = NULL
;
1688 /* At first, add the nodes which can epsilon transit to a node in
1690 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1691 if (BE (err
!= REG_NOERROR
, 0))
1694 /* Then, check the limitations in the current sift_context. */
1695 if (sctx
->limits
.nelem
)
1697 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1698 mctx
->bkref_ents
, str_idx
);
1699 if (BE (err
!= REG_NOERROR
, 0))
1704 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1705 if (BE (err
!= REG_NOERROR
, 0))
1709 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1711 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1712 if (BE (err
!= REG_NOERROR
, 0))
1718 static reg_errcode_t
1719 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1721 re_node_set
*dest_nodes
;
1722 const re_node_set
*candidates
;
1726 re_node_set src_copy
;
1728 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1729 if (BE (err
!= REG_NOERROR
, 0))
1731 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1733 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1735 + src_copy
.elems
[src_idx
]);
1736 if (BE (err
!= REG_NOERROR
, 0))
1738 re_node_set_free (&src_copy
);
1742 re_node_set_free (&src_copy
);
1746 static reg_errcode_t
1747 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1750 re_node_set
*dest_nodes
;
1751 const re_node_set
*candidates
;
1755 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1756 re_node_set except_nodes
;
1757 re_node_set_init_empty (&except_nodes
);
1758 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1760 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1761 if (cur_node
== node
)
1763 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1765 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1766 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1767 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1768 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1769 && re_node_set_contains (dest_nodes
, edst1
))
1771 && !re_node_set_contains (inv_eclosure
, edst2
)
1772 && re_node_set_contains (dest_nodes
, edst2
)))
1774 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1775 dfa
->inveclosures
+ cur_node
);
1776 if (BE (err
!= REG_NOERROR
, 0))
1778 re_node_set_free (&except_nodes
);
1784 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1786 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1787 if (!re_node_set_contains (&except_nodes
, cur_node
))
1789 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1790 re_node_set_remove_at (dest_nodes
, idx
);
1793 re_node_set_free (&except_nodes
);
1798 check_dst_limits (mctx
, limits
, dst_node
, dst_idx
, src_node
, src_idx
)
1799 re_match_context_t
*mctx
;
1800 re_node_set
*limits
;
1801 int dst_node
, dst_idx
, src_node
, src_idx
;
1803 re_dfa_t
*const dfa
= mctx
->dfa
;
1804 int lim_idx
, src_pos
, dst_pos
;
1806 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1807 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1808 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1811 struct re_backref_cache_entry
*ent
;
1812 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1813 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1815 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1816 subexp_idx
, dst_node
, dst_idx
,
1818 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1819 subexp_idx
, src_node
, src_idx
,
1823 <src> <dst> ( <subexp> )
1824 ( <subexp> ) <src> <dst>
1825 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1826 if (src_pos
== dst_pos
)
1827 continue; /* This is unrelated limitation. */
1835 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
, from_node
, bkref_idx
)
1836 re_match_context_t
*mctx
;
1837 int boundaries
, subexp_idx
, from_node
, bkref_idx
;
1839 re_dfa_t
*const dfa
= mctx
->dfa
;
1840 re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1843 /* Else, we are on the boundary: examine the nodes on the epsilon
1845 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1847 int node
= eclosures
->elems
[node_idx
];
1848 switch (dfa
->nodes
[node
].type
)
1852 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1857 if (ent
->node
!= node
|| ent
->subexp_from
!= ent
->subexp_to
)
1860 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1861 OP_CLOSE_SUBEXP cases below. But, if the
1862 destination node is the same node as the source
1863 node, don't recurse because it would cause an
1864 infinite loop: a regex that exhibits this behavior
1866 dst
= dfa
->edests
[node
].elems
[0];
1867 if (dst
== from_node
)
1871 else /* if (boundaries & 2) */
1875 cpos
= check_dst_limits_calc_pos_1 (mctx
, boundaries
,
1876 subexp_idx
, dst
, bkref_idx
);
1878 if (cpos
== -1 && (boundaries
& 1))
1881 if (cpos
== 0 /* && (boundaries & 2) */)
1884 while (ent
++->more
);
1888 case OP_OPEN_SUBEXP
:
1889 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1893 case OP_CLOSE_SUBEXP
:
1894 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1903 return (boundaries
& 2) ? 1 : 0;
1907 check_dst_limits_calc_pos (mctx
, limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
)
1908 re_match_context_t
*mctx
;
1909 int limit
, subexp_idx
, from_node
, str_idx
, bkref_idx
;
1911 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1914 /* If we are outside the range of the subexpression, return -1 or 1. */
1915 if (str_idx
< lim
->subexp_from
)
1918 if (lim
->subexp_to
< str_idx
)
1921 /* If we are within the subexpression, return 0. */
1922 boundaries
= (str_idx
== lim
->subexp_from
);
1923 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
1924 if (boundaries
== 0)
1927 /* Else, examine epsilon closure. */
1928 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1929 from_node
, bkref_idx
);
1932 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1933 which are against limitations from DEST_NODES. */
1935 static reg_errcode_t
1936 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1938 re_node_set
*dest_nodes
;
1939 const re_node_set
*candidates
;
1940 re_node_set
*limits
;
1941 struct re_backref_cache_entry
*bkref_ents
;
1945 int node_idx
, lim_idx
;
1947 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1950 struct re_backref_cache_entry
*ent
;
1951 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1953 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1954 continue; /* This is unrelated limitation. */
1956 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1957 if (ent
->subexp_to
== str_idx
)
1961 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1963 int node
= dest_nodes
->elems
[node_idx
];
1964 re_token_type_t type
= dfa
->nodes
[node
].type
;
1965 if (type
== OP_OPEN_SUBEXP
1966 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1968 else if (type
== OP_CLOSE_SUBEXP
1969 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1973 /* Check the limitation of the open subexpression. */
1974 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1977 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
1979 if (BE (err
!= REG_NOERROR
, 0))
1983 /* Check the limitation of the close subexpression. */
1985 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1987 int node
= dest_nodes
->elems
[node_idx
];
1988 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
1990 && !re_node_set_contains (dfa
->eclosures
+ node
,
1993 /* It is against this limitation.
1994 Remove it form the current sifted state. */
1995 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1997 if (BE (err
!= REG_NOERROR
, 0))
2003 else /* (ent->subexp_to != str_idx) */
2005 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2007 int node
= dest_nodes
->elems
[node_idx
];
2008 re_token_type_t type
= dfa
->nodes
[node
].type
;
2009 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2011 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2013 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
2014 || (type
== OP_OPEN_SUBEXP
))
2016 /* It is against this limitation.
2017 Remove it form the current sifted state. */
2018 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2020 if (BE (err
!= REG_NOERROR
, 0))
2030 static reg_errcode_t
2031 sift_states_bkref (mctx
, sctx
, str_idx
, candidates
)
2032 re_match_context_t
*mctx
;
2033 re_sift_context_t
*sctx
;
2035 const re_node_set
*candidates
;
2037 re_dfa_t
*const dfa
= mctx
->dfa
;
2040 re_sift_context_t local_sctx
;
2041 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2043 if (first_idx
== -1)
2046 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2048 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2051 re_token_type_t type
;
2052 struct re_backref_cache_entry
*entry
;
2053 node
= candidates
->elems
[node_idx
];
2054 type
= dfa
->nodes
[node
].type
;
2055 /* Avoid infinite loop for the REs like "()\1+". */
2056 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2058 if (type
!= OP_BACK_REF
)
2061 entry
= mctx
->bkref_ents
+ first_idx
;
2062 enabled_idx
= first_idx
;
2065 int subexp_len
, to_idx
, dst_node
;
2066 re_dfastate_t
*cur_state
;
2068 if (entry
->node
!= node
)
2070 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2071 to_idx
= str_idx
+ subexp_len
;
2072 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2073 : dfa
->edests
[node
].elems
[0]);
2075 if (to_idx
> sctx
->last_str_idx
2076 || sctx
->sifted_states
[to_idx
] == NULL
2077 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2078 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2079 str_idx
, dst_node
, to_idx
))
2082 if (local_sctx
.sifted_states
== NULL
)
2085 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2086 if (BE (err
!= REG_NOERROR
, 0))
2089 local_sctx
.last_node
= node
;
2090 local_sctx
.last_str_idx
= str_idx
;
2091 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2092 if (BE (err
< 0, 0))
2097 cur_state
= local_sctx
.sifted_states
[str_idx
];
2098 err
= sift_states_backward (mctx
, &local_sctx
);
2099 if (BE (err
!= REG_NOERROR
, 0))
2101 if (sctx
->limited_states
!= NULL
)
2103 err
= merge_state_array (dfa
, sctx
->limited_states
,
2104 local_sctx
.sifted_states
,
2106 if (BE (err
!= REG_NOERROR
, 0))
2109 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2110 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2112 /* mctx->bkref_ents may have changed, reload the pointer. */
2113 entry
= mctx
->bkref_ents
+ enabled_idx
;
2115 while (enabled_idx
++, entry
++->more
);
2119 if (local_sctx
.sifted_states
!= NULL
)
2121 re_node_set_free (&local_sctx
.limits
);
2128 #ifdef RE_ENABLE_I18N
2130 sift_states_iter_mb (mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2131 const re_match_context_t
*mctx
;
2132 re_sift_context_t
*sctx
;
2133 int node_idx
, str_idx
, max_str_idx
;
2135 re_dfa_t
*const dfa
= mctx
->dfa
;
2137 /* Check the node can accept `multi byte'. */
2138 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2139 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2140 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2141 dfa
->nexts
[node_idx
]))
2142 /* The node can't accept the `multi byte', or the
2143 destination was already thrown away, then the node
2144 could't accept the current input `multi byte'. */
2146 /* Otherwise, it is sure that the node could accept
2147 `naccepted' bytes input. */
2150 #endif /* RE_ENABLE_I18N */
2153 /* Functions for state transition. */
2155 /* Return the next state to which the current state STATE will transit by
2156 accepting the current input byte, and update STATE_LOG if necessary.
2157 If STATE can accept a multibyte char/collating element/back reference
2158 update the destination of STATE_LOG. */
2160 static re_dfastate_t
*
2161 transit_state (err
, mctx
, state
)
2163 re_match_context_t
*mctx
;
2164 re_dfastate_t
*state
;
2166 re_dfa_t
*const dfa
= mctx
->dfa
;
2167 re_dfastate_t
**trtable
;
2170 if (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.bufs_len
2171 || (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.valid_len
2172 && mctx
->input
.valid_len
< mctx
->input
.len
))
2174 *err
= extend_buffers (mctx
);
2175 if (BE (*err
!= REG_NOERROR
, 0))
2179 #ifdef RE_ENABLE_I18N
2180 /* If the current state can accept multibyte. */
2181 if (state
->accept_mb
)
2183 *err
= transit_state_mb (mctx
, state
);
2184 if (BE (*err
!= REG_NOERROR
, 0))
2187 #endif /* RE_ENABLE_I18N */
2189 /* Then decide the next state with the single byte. */
2192 /* Use transition table */
2193 ch
= re_string_fetch_byte (&mctx
->input
);
2194 trtable
= state
->trtable
;
2195 if (trtable
== NULL
)
2197 trtable
= build_trtable (dfa
, state
);
2198 if (trtable
== NULL
)
2204 if (BE (state
->word_trtable
, 0))
2206 unsigned int context
;
2208 = re_string_context_at (&mctx
->input
,
2209 re_string_cur_idx (&mctx
->input
) - 1,
2211 if (IS_WORD_CONTEXT (context
))
2212 return trtable
[ch
+ SBC_MAX
];
2221 /* don't use transition table */
2222 return transit_state_sb (err
, mctx
, state
);
2226 /* Update the state_log if we need */
2228 merge_state_with_log (err
, mctx
, next_state
)
2230 re_match_context_t
*mctx
;
2231 re_dfastate_t
*next_state
;
2233 re_dfa_t
*const dfa
= mctx
->dfa
;
2234 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2236 if (cur_idx
> mctx
->state_log_top
)
2238 mctx
->state_log
[cur_idx
] = next_state
;
2239 mctx
->state_log_top
= cur_idx
;
2241 else if (mctx
->state_log
[cur_idx
] == 0)
2243 mctx
->state_log
[cur_idx
] = next_state
;
2247 re_dfastate_t
*pstate
;
2248 unsigned int context
;
2249 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2250 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2251 the destination of a multibyte char/collating element/
2252 back reference. Then the next state is the union set of
2253 these destinations and the results of the transition table. */
2254 pstate
= mctx
->state_log
[cur_idx
];
2255 log_nodes
= pstate
->entrance_nodes
;
2256 if (next_state
!= NULL
)
2258 table_nodes
= next_state
->entrance_nodes
;
2259 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2261 if (BE (*err
!= REG_NOERROR
, 0))
2265 next_nodes
= *log_nodes
;
2266 /* Note: We already add the nodes of the initial state,
2267 then we don't need to add them here. */
2269 context
= re_string_context_at (&mctx
->input
,
2270 re_string_cur_idx (&mctx
->input
) - 1,
2272 next_state
= mctx
->state_log
[cur_idx
]
2273 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2274 /* We don't need to check errors here, since the return value of
2275 this function is next_state and ERR is already set. */
2277 if (table_nodes
!= NULL
)
2278 re_node_set_free (&next_nodes
);
2281 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2283 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2284 later. We must check them here, since the back references in the
2285 next state might use them. */
2286 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2288 if (BE (*err
!= REG_NOERROR
, 0))
2291 /* If the next state has back references. */
2292 if (next_state
->has_backref
)
2294 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2295 if (BE (*err
!= REG_NOERROR
, 0))
2297 next_state
= mctx
->state_log
[cur_idx
];
2304 /* Skip bytes in the input that correspond to part of a
2305 multi-byte match, then look in the log for a state
2306 from which to restart matching. */
2308 find_recover_state (err
, mctx
)
2310 re_match_context_t
*mctx
;
2312 re_dfastate_t
*cur_state
= NULL
;
2315 int max
= mctx
->state_log_top
;
2316 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2320 if (++cur_str_idx
> max
)
2322 re_string_skip_bytes (&mctx
->input
, 1);
2324 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2326 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2328 while (err
== REG_NOERROR
&& cur_state
== NULL
);
2332 /* Helper functions for transit_state. */
2334 /* From the node set CUR_NODES, pick up the nodes whose types are
2335 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2336 expression. And register them to use them later for evaluating the
2337 correspoding back references. */
2339 static reg_errcode_t
2340 check_subexp_matching_top (mctx
, cur_nodes
, str_idx
)
2341 re_match_context_t
*mctx
;
2342 re_node_set
*cur_nodes
;
2345 re_dfa_t
*const dfa
= mctx
->dfa
;
2349 /* TODO: This isn't efficient.
2350 Because there might be more than one nodes whose types are
2351 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2354 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2356 int node
= cur_nodes
->elems
[node_idx
];
2357 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2358 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2359 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2361 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2362 if (BE (err
!= REG_NOERROR
, 0))
2370 /* Return the next state to which the current state STATE will transit by
2371 accepting the current input byte. */
2373 static re_dfastate_t
*
2374 transit_state_sb (err
, mctx
, state
)
2376 re_match_context_t
*mctx
;
2377 re_dfastate_t
*state
;
2379 re_dfa_t
*const dfa
= mctx
->dfa
;
2380 re_node_set next_nodes
;
2381 re_dfastate_t
*next_state
;
2382 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2383 unsigned int context
;
2385 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2386 if (BE (*err
!= REG_NOERROR
, 0))
2388 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2390 int cur_node
= state
->nodes
.elems
[node_cnt
];
2391 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2393 *err
= re_node_set_merge (&next_nodes
,
2394 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2395 if (BE (*err
!= REG_NOERROR
, 0))
2397 re_node_set_free (&next_nodes
);
2402 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2403 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2404 /* We don't need to check errors here, since the return value of
2405 this function is next_state and ERR is already set. */
2407 re_node_set_free (&next_nodes
);
2408 re_string_skip_bytes (&mctx
->input
, 1);
2413 #ifdef RE_ENABLE_I18N
2414 static reg_errcode_t
2415 transit_state_mb (mctx
, pstate
)
2416 re_match_context_t
*mctx
;
2417 re_dfastate_t
*pstate
;
2419 re_dfa_t
*const dfa
= mctx
->dfa
;
2423 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2425 re_node_set dest_nodes
, *new_nodes
;
2426 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2427 int naccepted
= 0, dest_idx
;
2428 unsigned int context
;
2429 re_dfastate_t
*dest_state
;
2431 if (dfa
->nodes
[cur_node_idx
].constraint
)
2433 context
= re_string_context_at (&mctx
->input
,
2434 re_string_cur_idx (&mctx
->input
),
2436 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2441 /* How many bytes the node can accept? */
2442 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2443 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2444 re_string_cur_idx (&mctx
->input
));
2448 /* The node can accepts `naccepted' bytes. */
2449 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2450 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2451 : mctx
->max_mb_elem_len
);
2452 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2453 if (BE (err
!= REG_NOERROR
, 0))
2456 assert (dfa
->nexts
[cur_node_idx
] != -1);
2458 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2459 then we use pstate->nodes.elems[i] instead. */
2460 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2462 dest_state
= mctx
->state_log
[dest_idx
];
2463 if (dest_state
== NULL
)
2464 dest_nodes
= *new_nodes
;
2467 err
= re_node_set_init_union (&dest_nodes
,
2468 dest_state
->entrance_nodes
, new_nodes
);
2469 if (BE (err
!= REG_NOERROR
, 0))
2472 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1, mctx
->eflags
);
2473 mctx
->state_log
[dest_idx
]
2474 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2475 if (dest_state
!= NULL
)
2476 re_node_set_free (&dest_nodes
);
2477 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2482 #endif /* RE_ENABLE_I18N */
2484 static reg_errcode_t
2485 transit_state_bkref (mctx
, nodes
)
2486 re_match_context_t
*mctx
;
2487 const re_node_set
*nodes
;
2489 re_dfa_t
*const dfa
= mctx
->dfa
;
2492 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2494 for (i
= 0; i
< nodes
->nelem
; ++i
)
2496 int dest_str_idx
, prev_nelem
, bkc_idx
;
2497 int node_idx
= nodes
->elems
[i
];
2498 unsigned int context
;
2499 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2500 re_node_set
*new_dest_nodes
;
2502 /* Check whether `node' is a backreference or not. */
2503 if (node
->type
!= OP_BACK_REF
)
2506 if (node
->constraint
)
2508 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2510 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2514 /* `node' is a backreference.
2515 Check the substring which the substring matched. */
2516 bkc_idx
= mctx
->nbkref_ents
;
2517 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2518 if (BE (err
!= REG_NOERROR
, 0))
2521 /* And add the epsilon closures (which is `new_dest_nodes') of
2522 the backreference to appropriate state_log. */
2524 assert (dfa
->nexts
[node_idx
] != -1);
2526 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2529 re_dfastate_t
*dest_state
;
2530 struct re_backref_cache_entry
*bkref_ent
;
2531 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2532 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2534 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2535 new_dest_nodes
= (subexp_len
== 0
2536 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2537 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2538 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2539 - bkref_ent
->subexp_from
);
2540 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2542 dest_state
= mctx
->state_log
[dest_str_idx
];
2543 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2544 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2545 /* Add `new_dest_node' to state_log. */
2546 if (dest_state
== NULL
)
2548 mctx
->state_log
[dest_str_idx
]
2549 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2551 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2552 && err
!= REG_NOERROR
, 0))
2557 re_node_set dest_nodes
;
2558 err
= re_node_set_init_union (&dest_nodes
,
2559 dest_state
->entrance_nodes
,
2561 if (BE (err
!= REG_NOERROR
, 0))
2563 re_node_set_free (&dest_nodes
);
2566 mctx
->state_log
[dest_str_idx
]
2567 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2568 re_node_set_free (&dest_nodes
);
2569 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2570 && err
!= REG_NOERROR
, 0))
2573 /* We need to check recursively if the backreference can epsilon
2576 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2578 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2580 if (BE (err
!= REG_NOERROR
, 0))
2582 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2583 if (BE (err
!= REG_NOERROR
, 0))
2593 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2594 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2595 Note that we might collect inappropriate candidates here.
2596 However, the cost of checking them strictly here is too high, then we
2597 delay these checking for prune_impossible_nodes(). */
2599 static reg_errcode_t
2600 get_subexp (mctx
, bkref_node
, bkref_str_idx
)
2601 re_match_context_t
*mctx
;
2602 int bkref_node
, bkref_str_idx
;
2604 re_dfa_t
*const dfa
= mctx
->dfa
;
2605 int subexp_num
, sub_top_idx
;
2606 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2607 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2608 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2609 if (cache_idx
!= -1)
2611 const struct re_backref_cache_entry
*entry
= mctx
->bkref_ents
+ cache_idx
;
2613 if (entry
->node
== bkref_node
)
2614 return REG_NOERROR
; /* We already checked it. */
2615 while (entry
++->more
);
2618 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2620 /* For each sub expression */
2621 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2624 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2625 re_sub_match_last_t
*sub_last
;
2626 int sub_last_idx
, sl_str
, bkref_str_off
;
2628 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2629 continue; /* It isn't related. */
2631 sl_str
= sub_top
->str_idx
;
2632 bkref_str_off
= bkref_str_idx
;
2633 /* At first, check the last node of sub expressions we already
2635 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2638 sub_last
= sub_top
->lasts
[sub_last_idx
];
2639 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2640 /* The matched string by the sub expression match with the substring
2641 at the back reference? */
2642 if (sl_str_diff
> 0)
2644 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2646 /* Not enough chars for a successful match. */
2647 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2650 err
= clean_state_log_if_needed (mctx
,
2653 if (BE (err
!= REG_NOERROR
, 0))
2655 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2657 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2658 break; /* We don't need to search this sub expression any more. */
2660 bkref_str_off
+= sl_str_diff
;
2661 sl_str
+= sl_str_diff
;
2662 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2665 /* Reload buf, since the preceding call might have reallocated
2667 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2669 if (err
== REG_NOMATCH
)
2671 if (BE (err
!= REG_NOERROR
, 0))
2675 if (sub_last_idx
< sub_top
->nlasts
)
2677 if (sub_last_idx
> 0)
2679 /* Then, search for the other last nodes of the sub expression. */
2680 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2682 int cls_node
, sl_str_off
;
2683 const re_node_set
*nodes
;
2684 sl_str_off
= sl_str
- sub_top
->str_idx
;
2685 /* The matched string by the sub expression match with the substring
2686 at the back reference? */
2689 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2691 /* If we are at the end of the input, we cannot match. */
2692 if (bkref_str_off
>= mctx
->input
.len
)
2695 err
= extend_buffers (mctx
);
2696 if (BE (err
!= REG_NOERROR
, 0))
2699 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2701 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2702 break; /* We don't need to search this sub expression
2705 if (mctx
->state_log
[sl_str
] == NULL
)
2707 /* Does this state have a ')' of the sub expression? */
2708 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2709 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2712 if (sub_top
->path
== NULL
)
2714 sub_top
->path
= calloc (sizeof (state_array_t
),
2715 sl_str
- sub_top
->str_idx
+ 1);
2716 if (sub_top
->path
== NULL
)
2719 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2720 in the current context? */
2721 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2722 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2723 if (err
== REG_NOMATCH
)
2725 if (BE (err
!= REG_NOERROR
, 0))
2727 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2728 if (BE (sub_last
== NULL
, 0))
2730 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2732 if (err
== REG_NOMATCH
)
2739 /* Helper functions for get_subexp(). */
2741 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2742 If it can arrive, register the sub expression expressed with SUB_TOP
2745 static reg_errcode_t
2746 get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2747 re_match_context_t
*mctx
;
2748 const re_sub_match_top_t
*sub_top
;
2749 re_sub_match_last_t
*sub_last
;
2750 int bkref_node
, bkref_str
;
2754 /* Can the subexpression arrive the back reference? */
2755 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2756 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2757 if (err
!= REG_NOERROR
)
2759 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2761 if (BE (err
!= REG_NOERROR
, 0))
2763 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2764 return clean_state_log_if_needed (mctx
, to_idx
);
2767 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2768 Search '(' if FL_OPEN, or search ')' otherwise.
2769 TODO: This function isn't efficient...
2770 Because there might be more than one nodes whose types are
2771 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2776 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2777 const re_dfa_t
*dfa
;
2778 const re_node_set
*nodes
;
2779 int subexp_idx
, type
;
2782 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2784 int cls_node
= nodes
->elems
[cls_idx
];
2785 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2786 if (node
->type
== type
2787 && node
->opr
.idx
== subexp_idx
)
2793 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2794 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2796 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2798 static reg_errcode_t
2799 check_arrival (mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2801 re_match_context_t
*mctx
;
2802 state_array_t
*path
;
2803 int top_node
, top_str
, last_node
, last_str
, type
;
2805 re_dfa_t
*const dfa
= mctx
->dfa
;
2807 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2808 re_dfastate_t
*cur_state
= NULL
;
2809 re_node_set
*cur_nodes
, next_nodes
;
2810 re_dfastate_t
**backup_state_log
;
2811 unsigned int context
;
2813 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2814 /* Extend the buffer if we need. */
2815 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2817 re_dfastate_t
**new_array
;
2818 int old_alloc
= path
->alloc
;
2819 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2820 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2821 if (new_array
== NULL
)
2823 path
->alloc
= old_alloc
;
2826 path
->array
= new_array
;
2827 memset (new_array
+ old_alloc
, '\0',
2828 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2831 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2833 /* Temporary modify MCTX. */
2834 backup_state_log
= mctx
->state_log
;
2835 backup_cur_idx
= mctx
->input
.cur_idx
;
2836 mctx
->state_log
= path
->array
;
2837 mctx
->input
.cur_idx
= str_idx
;
2839 /* Setup initial node set. */
2840 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2841 if (str_idx
== top_str
)
2843 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2844 if (BE (err
!= REG_NOERROR
, 0))
2846 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2847 if (BE (err
!= REG_NOERROR
, 0))
2849 re_node_set_free (&next_nodes
);
2855 cur_state
= mctx
->state_log
[str_idx
];
2856 if (cur_state
&& cur_state
->has_backref
)
2858 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2859 if (BE ( err
!= REG_NOERROR
, 0))
2863 re_node_set_init_empty (&next_nodes
);
2865 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2867 if (next_nodes
.nelem
)
2869 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2871 if (BE ( err
!= REG_NOERROR
, 0))
2873 re_node_set_free (&next_nodes
);
2877 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2878 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2880 re_node_set_free (&next_nodes
);
2883 mctx
->state_log
[str_idx
] = cur_state
;
2886 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2888 re_node_set_empty (&next_nodes
);
2889 if (mctx
->state_log
[str_idx
+ 1])
2891 err
= re_node_set_merge (&next_nodes
,
2892 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2893 if (BE (err
!= REG_NOERROR
, 0))
2895 re_node_set_free (&next_nodes
);
2901 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2902 &cur_state
->nodes
, &next_nodes
);
2903 if (BE (err
!= REG_NOERROR
, 0))
2905 re_node_set_free (&next_nodes
);
2910 if (next_nodes
.nelem
)
2912 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2913 if (BE (err
!= REG_NOERROR
, 0))
2915 re_node_set_free (&next_nodes
);
2918 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2920 if (BE ( err
!= REG_NOERROR
, 0))
2922 re_node_set_free (&next_nodes
);
2926 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2927 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2928 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2930 re_node_set_free (&next_nodes
);
2933 mctx
->state_log
[str_idx
] = cur_state
;
2934 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2936 re_node_set_free (&next_nodes
);
2937 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2938 : &mctx
->state_log
[last_str
]->nodes
);
2939 path
->next_idx
= str_idx
;
2942 mctx
->state_log
= backup_state_log
;
2943 mctx
->input
.cur_idx
= backup_cur_idx
;
2945 /* Then check the current node set has the node LAST_NODE. */
2946 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2952 /* Helper functions for check_arrival. */
2954 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2956 TODO: This function is similar to the functions transit_state*(),
2957 however this function has many additional works.
2958 Can't we unify them? */
2960 static reg_errcode_t
2961 check_arrival_add_next_nodes (mctx
, str_idx
, cur_nodes
, next_nodes
)
2962 re_match_context_t
*mctx
;
2964 re_node_set
*cur_nodes
, *next_nodes
;
2966 re_dfa_t
*const dfa
= mctx
->dfa
;
2970 re_node_set union_set
;
2971 re_node_set_init_empty (&union_set
);
2972 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2975 int cur_node
= cur_nodes
->elems
[cur_idx
];
2976 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
2977 if (IS_EPSILON_NODE (type
))
2979 #ifdef RE_ENABLE_I18N
2980 /* If the node may accept `multi byte'. */
2981 if (ACCEPT_MB_NODE (type
))
2983 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
2987 re_dfastate_t
*dest_state
;
2988 int next_node
= dfa
->nexts
[cur_node
];
2989 int next_idx
= str_idx
+ naccepted
;
2990 dest_state
= mctx
->state_log
[next_idx
];
2991 re_node_set_empty (&union_set
);
2994 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
2995 if (BE (err
!= REG_NOERROR
, 0))
2997 re_node_set_free (&union_set
);
3001 result
= re_node_set_insert (&union_set
, next_node
);
3002 if (BE (result
< 0, 0))
3004 re_node_set_free (&union_set
);
3007 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3009 if (BE (mctx
->state_log
[next_idx
] == NULL
3010 && err
!= REG_NOERROR
, 0))
3012 re_node_set_free (&union_set
);
3017 #endif /* RE_ENABLE_I18N */
3019 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3021 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3022 if (BE (result
< 0, 0))
3024 re_node_set_free (&union_set
);
3029 re_node_set_free (&union_set
);
3033 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3034 CUR_NODES, however exclude the nodes which are:
3035 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3036 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3039 static reg_errcode_t
3040 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
3042 re_node_set
*cur_nodes
;
3043 int ex_subexp
, type
;
3046 int idx
, outside_node
;
3047 re_node_set new_nodes
;
3049 assert (cur_nodes
->nelem
);
3051 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3052 if (BE (err
!= REG_NOERROR
, 0))
3054 /* Create a new node set NEW_NODES with the nodes which are epsilon
3055 closures of the node in CUR_NODES. */
3057 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3059 int cur_node
= cur_nodes
->elems
[idx
];
3060 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3061 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3062 if (outside_node
== -1)
3064 /* There are no problematic nodes, just merge them. */
3065 err
= re_node_set_merge (&new_nodes
, eclosure
);
3066 if (BE (err
!= REG_NOERROR
, 0))
3068 re_node_set_free (&new_nodes
);
3074 /* There are problematic nodes, re-calculate incrementally. */
3075 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3077 if (BE (err
!= REG_NOERROR
, 0))
3079 re_node_set_free (&new_nodes
);
3084 re_node_set_free (cur_nodes
);
3085 *cur_nodes
= new_nodes
;
3089 /* Helper function for check_arrival_expand_ecl.
3090 Check incrementally the epsilon closure of TARGET, and if it isn't
3091 problematic append it to DST_NODES. */
3093 static reg_errcode_t
3094 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
3096 int target
, ex_subexp
, type
;
3097 re_node_set
*dst_nodes
;
3100 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3104 if (dfa
->nodes
[cur_node
].type
== type
3105 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3107 if (type
== OP_CLOSE_SUBEXP
)
3109 err
= re_node_set_insert (dst_nodes
, cur_node
);
3110 if (BE (err
== -1, 0))
3115 err
= re_node_set_insert (dst_nodes
, cur_node
);
3116 if (BE (err
== -1, 0))
3118 if (dfa
->edests
[cur_node
].nelem
== 0)
3120 if (dfa
->edests
[cur_node
].nelem
== 2)
3122 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3123 dfa
->edests
[cur_node
].elems
[1],
3125 if (BE (err
!= REG_NOERROR
, 0))
3128 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3134 /* For all the back references in the current state, calculate the
3135 destination of the back references by the appropriate entry
3136 in MCTX->BKREF_ENTS. */
3138 static reg_errcode_t
3139 expand_bkref_cache (mctx
, cur_nodes
, cur_str
, subexp_num
,
3141 re_match_context_t
*mctx
;
3142 int cur_str
, subexp_num
, type
;
3143 re_node_set
*cur_nodes
;
3145 re_dfa_t
*const dfa
= mctx
->dfa
;
3147 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3148 struct re_backref_cache_entry
*ent
;
3150 if (cache_idx_start
== -1)
3154 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3157 int to_idx
, next_node
;
3159 /* Is this entry ENT is appropriate? */
3160 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3163 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3164 /* Calculate the destination of the back reference, and append it
3165 to MCTX->STATE_LOG. */
3166 if (to_idx
== cur_str
)
3168 /* The backreference did epsilon transit, we must re-check all the
3169 node in the current state. */
3170 re_node_set new_dests
;
3171 reg_errcode_t err2
, err3
;
3172 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3173 if (re_node_set_contains (cur_nodes
, next_node
))
3175 err
= re_node_set_init_1 (&new_dests
, next_node
);
3176 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3177 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3178 re_node_set_free (&new_dests
);
3179 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3180 || err3
!= REG_NOERROR
, 0))
3182 err
= (err
!= REG_NOERROR
? err
3183 : (err2
!= REG_NOERROR
? err2
: err3
));
3186 /* TODO: It is still inefficient... */
3191 re_node_set union_set
;
3192 next_node
= dfa
->nexts
[ent
->node
];
3193 if (mctx
->state_log
[to_idx
])
3196 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3199 err
= re_node_set_init_copy (&union_set
,
3200 &mctx
->state_log
[to_idx
]->nodes
);
3201 ret
= re_node_set_insert (&union_set
, next_node
);
3202 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3204 re_node_set_free (&union_set
);
3205 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3211 err
= re_node_set_init_1 (&union_set
, next_node
);
3212 if (BE (err
!= REG_NOERROR
, 0))
3215 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3216 re_node_set_free (&union_set
);
3217 if (BE (mctx
->state_log
[to_idx
] == NULL
3218 && err
!= REG_NOERROR
, 0))
3222 while (ent
++->more
);
3226 /* Build transition table for the state.
3227 Return the new table if succeeded, otherwise return NULL. */
3229 static re_dfastate_t
**
3230 build_trtable (dfa
, state
)
3232 re_dfastate_t
*state
;
3236 unsigned int elem
, mask
;
3237 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3238 int ndests
; /* Number of the destination states from `state'. */
3239 re_dfastate_t
**trtable
;
3240 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3241 re_node_set follows
, *dests_node
;
3245 /* We build DFA states which corresponds to the destination nodes
3246 from `state'. `dests_node[i]' represents the nodes which i-th
3247 destination state contains, and `dests_ch[i]' represents the
3248 characters which i-th destination state accepts. */
3250 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3251 dests_node
= (re_node_set
*)
3252 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3256 dests_node
= (re_node_set
*)
3257 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3258 if (BE (dests_node
== NULL
, 0))
3260 dests_node_malloced
= 1;
3262 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3264 /* Initialize transiton table. */
3265 state
->word_trtable
= 0;
3267 /* At first, group all nodes belonging to `state' into several
3269 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3270 if (BE (ndests
<= 0, 0))
3272 if (dests_node_malloced
)
3274 /* Return NULL in case of an error, trtable otherwise. */
3277 state
->trtable
= (re_dfastate_t
**)
3278 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);;
3279 return state
->trtable
;
3284 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3285 if (BE (err
!= REG_NOERROR
, 0))
3289 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3290 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3291 dest_states
= (re_dfastate_t
**)
3292 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3296 dest_states
= (re_dfastate_t
**)
3297 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3298 if (BE (dest_states
== NULL
, 0))
3301 if (dest_states_malloced
)
3303 re_node_set_free (&follows
);
3304 for (i
= 0; i
< ndests
; ++i
)
3305 re_node_set_free (dests_node
+ i
);
3306 if (dests_node_malloced
)
3310 dest_states_malloced
= 1;
3312 dest_states_word
= dest_states
+ ndests
;
3313 dest_states_nl
= dest_states_word
+ ndests
;
3314 bitset_empty (acceptable
);
3316 /* Then build the states for all destinations. */
3317 for (i
= 0; i
< ndests
; ++i
)
3320 re_node_set_empty (&follows
);
3321 /* Merge the follows of this destination states. */
3322 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3324 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3325 if (next_node
!= -1)
3327 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3328 if (BE (err
!= REG_NOERROR
, 0))
3332 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3333 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3335 /* If the new state has context constraint,
3336 build appropriate states for these contexts. */
3337 if (dest_states
[i
]->has_constraint
)
3339 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3341 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3344 if (dest_states
[i
] != dest_states_word
[i
]
3345 && dfa
->mb_cur_max
> 1)
3346 state
->word_trtable
= 1;
3348 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3350 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3355 dest_states_word
[i
] = dest_states
[i
];
3356 dest_states_nl
[i
] = dest_states
[i
];
3358 bitset_merge (acceptable
, dests_ch
[i
]);
3361 if (!BE (state
->word_trtable
, 0))
3363 /* We don't care about whether the following character is a word
3364 character, or we are in a single-byte character set so we can
3365 discern by looking at the character code: allocate a
3366 256-entry transition table. */
3367 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3368 if (BE (trtable
== NULL
, 0))
3371 /* For all characters ch...: */
3372 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3373 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3375 mask
<<= 1, elem
>>= 1, ++ch
)
3376 if (BE (elem
& 1, 0))
3378 /* There must be exactly one destination which accepts
3379 character ch. See group_nodes_into_DFAstates. */
3380 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3383 /* j-th destination accepts the word character ch. */
3384 if (dfa
->word_char
[i
] & mask
)
3385 trtable
[ch
] = dest_states_word
[j
];
3387 trtable
[ch
] = dest_states
[j
];
3392 /* We care about whether the following character is a word
3393 character, and we are in a multi-byte character set: discern
3394 by looking at the character code: build two 256-entry
3395 transition tables, one starting at trtable[0] and one
3396 starting at trtable[SBC_MAX]. */
3397 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*),
3399 if (BE (trtable
== NULL
, 0))
3402 /* For all characters ch...: */
3403 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3404 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3406 mask
<<= 1, elem
>>= 1, ++ch
)
3407 if (BE (elem
& 1, 0))
3409 /* There must be exactly one destination which accepts
3410 character ch. See group_nodes_into_DFAstates. */
3411 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3414 /* j-th destination accepts the word character ch. */
3415 trtable
[ch
] = dest_states
[j
];
3416 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3421 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3423 /* The current state accepts newline character. */
3424 for (j
= 0; j
< ndests
; ++j
)
3425 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3427 /* k-th destination accepts newline character. */
3428 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3429 if (state
->word_trtable
)
3430 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3431 /* There must be only one destination which accepts
3432 newline. See group_nodes_into_DFAstates. */
3437 if (dest_states_malloced
)
3440 re_node_set_free (&follows
);
3441 for (i
= 0; i
< ndests
; ++i
)
3442 re_node_set_free (dests_node
+ i
);
3444 if (dests_node_malloced
)
3447 state
->trtable
= trtable
;
3451 /* Group all nodes belonging to STATE into several destinations.
3452 Then for all destinations, set the nodes belonging to the destination
3453 to DESTS_NODE[i] and set the characters accepted by the destination
3454 to DEST_CH[i]. This function return the number of destinations. */
3457 group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
)
3459 const re_dfastate_t
*state
;
3460 re_node_set
*dests_node
;
3466 int ndests
; /* Number of the destinations from `state'. */
3467 bitset accepts
; /* Characters a node can accept. */
3468 const re_node_set
*cur_nodes
= &state
->nodes
;
3469 bitset_empty (accepts
);
3472 /* For all the nodes belonging to `state', */
3473 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3475 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3476 re_token_type_t type
= node
->type
;
3477 unsigned int constraint
= node
->constraint
;
3479 /* Enumerate all single byte character this node can accept. */
3480 if (type
== CHARACTER
)
3481 bitset_set (accepts
, node
->opr
.c
);
3482 else if (type
== SIMPLE_BRACKET
)
3484 bitset_merge (accepts
, node
->opr
.sbcset
);
3486 else if (type
== OP_PERIOD
)
3488 #ifdef RE_ENABLE_I18N
3489 if (dfa
->mb_cur_max
> 1)
3490 bitset_merge (accepts
, dfa
->sb_char
);
3493 bitset_set_all (accepts
);
3494 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3495 bitset_clear (accepts
, '\n');
3496 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3497 bitset_clear (accepts
, '\0');
3499 #ifdef RE_ENABLE_I18N
3500 else if (type
== OP_UTF8_PERIOD
)
3502 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3503 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3504 bitset_clear (accepts
, '\n');
3505 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3506 bitset_clear (accepts
, '\0');
3512 /* Check the `accepts' and sift the characters which are not
3513 match it the context. */
3516 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3518 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3519 bitset_empty (accepts
);
3520 if (accepts_newline
)
3521 bitset_set (accepts
, NEWLINE_CHAR
);
3525 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3527 bitset_empty (accepts
);
3531 if (constraint
& NEXT_WORD_CONSTRAINT
)
3533 unsigned int any_set
= 0;
3534 if (type
== CHARACTER
&& !node
->word_char
)
3536 bitset_empty (accepts
);
3539 #ifdef RE_ENABLE_I18N
3540 if (dfa
->mb_cur_max
> 1)
3541 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3542 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3545 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3546 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3550 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3552 unsigned int any_set
= 0;
3553 if (type
== CHARACTER
&& node
->word_char
)
3555 bitset_empty (accepts
);
3558 #ifdef RE_ENABLE_I18N
3559 if (dfa
->mb_cur_max
> 1)
3560 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3561 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3564 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3565 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3571 /* Then divide `accepts' into DFA states, or create a new
3572 state. Above, we make sure that accepts is not empty. */
3573 for (j
= 0; j
< ndests
; ++j
)
3575 bitset intersec
; /* Intersection sets, see below. */
3577 /* Flags, see below. */
3578 int has_intersec
, not_subset
, not_consumed
;
3580 /* Optimization, skip if this state doesn't accept the character. */
3581 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3584 /* Enumerate the intersection set of this state and `accepts'. */
3586 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3587 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3588 /* And skip if the intersection set is empty. */
3592 /* Then check if this state is a subset of `accepts'. */
3593 not_subset
= not_consumed
= 0;
3594 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3596 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3597 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3600 /* If this state isn't a subset of `accepts', create a
3601 new group state, which has the `remains'. */
3604 bitset_copy (dests_ch
[ndests
], remains
);
3605 bitset_copy (dests_ch
[j
], intersec
);
3606 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3607 if (BE (err
!= REG_NOERROR
, 0))
3612 /* Put the position in the current group. */
3613 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3614 if (BE (result
< 0, 0))
3617 /* If all characters are consumed, go to next node. */
3621 /* Some characters remain, create a new group. */
3624 bitset_copy (dests_ch
[ndests
], accepts
);
3625 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3626 if (BE (err
!= REG_NOERROR
, 0))
3629 bitset_empty (accepts
);
3634 for (j
= 0; j
< ndests
; ++j
)
3635 re_node_set_free (dests_node
+ j
);
3639 #ifdef RE_ENABLE_I18N
3640 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3641 Return the number of the bytes the node accepts.
3642 STR_IDX is the current index of the input string.
3644 This function handles the nodes which can accept one character, or
3645 one collating element like '.', '[a-z]', opposite to the other nodes
3646 can only accept one byte. */
3649 check_node_accept_bytes (dfa
, node_idx
, input
, str_idx
)
3651 int node_idx
, str_idx
;
3652 const re_string_t
*input
;
3654 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3655 int char_len
, elem_len
;
3658 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3660 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3661 if (BE (c
< 0xc2, 1))
3664 if (str_idx
+ 2 > input
->len
)
3667 d
= re_string_byte_at (input
, str_idx
+ 1);
3669 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3673 if (c
== 0xe0 && d
< 0xa0)
3679 if (c
== 0xf0 && d
< 0x90)
3685 if (c
== 0xf8 && d
< 0x88)
3691 if (c
== 0xfc && d
< 0x84)
3697 if (str_idx
+ char_len
> input
->len
)
3700 for (i
= 1; i
< char_len
; ++i
)
3702 d
= re_string_byte_at (input
, str_idx
+ i
);
3703 if (d
< 0x80 || d
> 0xbf)
3709 char_len
= re_string_char_size_at (input
, str_idx
);
3710 if (node
->type
== OP_PERIOD
)
3714 /* FIXME: I don't think this if is needed, as both '\n'
3715 and '\0' are char_len == 1. */
3716 /* '.' accepts any one character except the following two cases. */
3717 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3718 re_string_byte_at (input
, str_idx
) == '\n') ||
3719 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3720 re_string_byte_at (input
, str_idx
) == '\0'))
3725 elem_len
= re_string_elem_size_at (input
, str_idx
);
3726 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3729 if (node
->type
== COMPLEX_BRACKET
)
3731 const re_charset_t
*cset
= node
->opr
.mbcset
;
3733 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3739 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3740 ? re_string_wchar_at (input
, str_idx
) : 0);
3742 /* match with multibyte character? */
3743 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3744 if (wc
== cset
->mbchars
[i
])
3746 match_len
= char_len
;
3747 goto check_node_accept_bytes_match
;
3749 /* match with character_class? */
3750 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3752 wctype_t wt
= cset
->char_classes
[i
];
3753 if (__iswctype (wc
, wt
))
3755 match_len
= char_len
;
3756 goto check_node_accept_bytes_match
;
3761 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3764 unsigned int in_collseq
= 0;
3765 const int32_t *table
, *indirect
;
3766 const unsigned char *weights
, *extra
;
3767 const char *collseqwc
;
3769 /* This #include defines a local function! */
3770 # include <locale/weight.h>
3772 /* match with collating_symbol? */
3773 if (cset
->ncoll_syms
)
3774 extra
= (const unsigned char *)
3775 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3776 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3778 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3779 /* Compare the length of input collating element and
3780 the length of current collating element. */
3781 if (*coll_sym
!= elem_len
)
3783 /* Compare each bytes. */
3784 for (j
= 0; j
< *coll_sym
; j
++)
3785 if (pin
[j
] != coll_sym
[1 + j
])
3789 /* Match if every bytes is equal. */
3791 goto check_node_accept_bytes_match
;
3797 if (elem_len
<= char_len
)
3799 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3800 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3803 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3805 /* match with range expression? */
3806 for (i
= 0; i
< cset
->nranges
; ++i
)
3807 if (cset
->range_starts
[i
] <= in_collseq
3808 && in_collseq
<= cset
->range_ends
[i
])
3810 match_len
= elem_len
;
3811 goto check_node_accept_bytes_match
;
3814 /* match with equivalence_class? */
3815 if (cset
->nequiv_classes
)
3817 const unsigned char *cp
= pin
;
3818 table
= (const int32_t *)
3819 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3820 weights
= (const unsigned char *)
3821 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3822 extra
= (const unsigned char *)
3823 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3824 indirect
= (const int32_t *)
3825 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3826 idx
= findidx (&cp
);
3828 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3830 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3831 size_t weight_len
= weights
[idx
];
3832 if (weight_len
== weights
[equiv_class_idx
])
3835 while (cnt
<= weight_len
3836 && (weights
[equiv_class_idx
+ 1 + cnt
]
3837 == weights
[idx
+ 1 + cnt
]))
3839 if (cnt
> weight_len
)
3841 match_len
= elem_len
;
3842 goto check_node_accept_bytes_match
;
3851 /* match with range expression? */
3853 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3855 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3858 for (i
= 0; i
< cset
->nranges
; ++i
)
3860 cmp_buf
[0] = cset
->range_starts
[i
];
3861 cmp_buf
[4] = cset
->range_ends
[i
];
3862 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3863 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3865 match_len
= char_len
;
3866 goto check_node_accept_bytes_match
;
3870 check_node_accept_bytes_match
:
3871 if (!cset
->non_match
)
3878 return (elem_len
> char_len
) ? elem_len
: char_len
;
3886 find_collation_sequence_value (mbs
, mbs_len
)
3887 const unsigned char *mbs
;
3890 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3895 /* No valid character. Match it as a single byte character. */
3896 const unsigned char *collseq
= (const unsigned char *)
3897 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3898 return collseq
[mbs
[0]];
3905 const unsigned char *extra
= (const unsigned char *)
3906 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3907 int32_t extrasize
= (const unsigned char *)
3908 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3910 for (idx
= 0; idx
< extrasize
;)
3912 int mbs_cnt
, found
= 0;
3913 int32_t elem_mbs_len
;
3914 /* Skip the name of collating element name. */
3915 idx
= idx
+ extra
[idx
] + 1;
3916 elem_mbs_len
= extra
[idx
++];
3917 if (mbs_len
== elem_mbs_len
)
3919 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3920 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3922 if (mbs_cnt
== elem_mbs_len
)
3923 /* Found the entry. */
3926 /* Skip the byte sequence of the collating element. */
3927 idx
+= elem_mbs_len
;
3928 /* Adjust for the alignment. */
3929 idx
= (idx
+ 3) & ~3;
3930 /* Skip the collation sequence value. */
3931 idx
+= sizeof (uint32_t);
3932 /* Skip the wide char sequence of the collating element. */
3933 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3934 /* If we found the entry, return the sequence value. */
3936 return *(uint32_t *) (extra
+ idx
);
3937 /* Skip the collation sequence value. */
3938 idx
+= sizeof (uint32_t);
3944 #endif /* RE_ENABLE_I18N */
3946 /* Check whether the node accepts the byte which is IDX-th
3947 byte of the INPUT. */
3950 check_node_accept (mctx
, node
, idx
)
3951 const re_match_context_t
*mctx
;
3952 const re_token_t
*node
;
3955 re_dfa_t
*const dfa
= mctx
->dfa
;
3957 if (node
->constraint
)
3959 /* The node has constraints. Check whether the current context
3960 satisfies the constraints. */
3961 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
3963 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3966 ch
= re_string_byte_at (&mctx
->input
, idx
);
3970 return node
->opr
.c
== ch
;
3971 case SIMPLE_BRACKET
:
3972 return bitset_contain (node
->opr
.sbcset
, ch
);
3973 #ifdef RE_ENABLE_I18N
3974 case OP_UTF8_PERIOD
:
3980 return !((ch
== '\n' && !(dfa
->syntax
& RE_DOT_NEWLINE
))
3981 || (ch
== '\0' && (dfa
->syntax
& RE_DOT_NOT_NULL
)));
3987 /* Extend the buffers, if the buffers have run out. */
3989 static reg_errcode_t
3990 extend_buffers (mctx
)
3991 re_match_context_t
*mctx
;
3994 re_string_t
*pstr
= &mctx
->input
;
3996 /* Double the lengthes of the buffers. */
3997 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3998 if (BE (ret
!= REG_NOERROR
, 0))
4001 if (mctx
->state_log
!= NULL
)
4003 /* And double the length of state_log. */
4004 /* XXX We have no indication of the size of this buffer. If this
4005 allocation fail we have no indication that the state_log array
4006 does not have the right size. */
4007 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4008 pstr
->bufs_len
+ 1);
4009 if (BE (new_array
== NULL
, 0))
4011 mctx
->state_log
= new_array
;
4014 /* Then reconstruct the buffers. */
4017 #ifdef RE_ENABLE_I18N
4018 if (pstr
->mb_cur_max
> 1)
4020 ret
= build_wcs_upper_buffer (pstr
);
4021 if (BE (ret
!= REG_NOERROR
, 0))
4025 #endif /* RE_ENABLE_I18N */
4026 build_upper_buffer (pstr
);
4030 #ifdef RE_ENABLE_I18N
4031 if (pstr
->mb_cur_max
> 1)
4032 build_wcs_buffer (pstr
);
4034 #endif /* RE_ENABLE_I18N */
4036 if (pstr
->trans
!= NULL
)
4037 re_string_translate_buffer (pstr
);
4044 /* Functions for matching context. */
4046 /* Initialize MCTX. */
4048 static reg_errcode_t
4049 match_ctx_init (mctx
, eflags
, n
)
4050 re_match_context_t
*mctx
;
4053 mctx
->eflags
= eflags
;
4054 mctx
->match_last
= -1;
4057 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4058 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4059 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4062 /* Already zero-ed by the caller.
4064 mctx->bkref_ents = NULL;
4065 mctx->nbkref_ents = 0;
4066 mctx->nsub_tops = 0; */
4067 mctx
->abkref_ents
= n
;
4068 mctx
->max_mb_elem_len
= 1;
4069 mctx
->asub_tops
= n
;
4073 /* Clean the entries which depend on the current input in MCTX.
4074 This function must be invoked when the matcher changes the start index
4075 of the input, or changes the input string. */
4078 match_ctx_clean (mctx
)
4079 re_match_context_t
*mctx
;
4081 match_ctx_free_subtops (mctx
);
4082 mctx
->nsub_tops
= 0;
4083 mctx
->nbkref_ents
= 0;
4086 /* Free all the memory associated with MCTX. */
4089 match_ctx_free (mctx
)
4090 re_match_context_t
*mctx
;
4092 match_ctx_free_subtops (mctx
);
4093 re_free (mctx
->sub_tops
);
4094 re_free (mctx
->bkref_ents
);
4097 /* Free all the memory associated with MCTX->SUB_TOPS. */
4100 match_ctx_free_subtops (mctx
)
4101 re_match_context_t
*mctx
;
4104 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4107 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4108 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4110 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4111 re_free (last
->path
.array
);
4114 re_free (top
->lasts
);
4117 re_free (top
->path
->array
);
4118 re_free (top
->path
);
4124 /* Add a new backreference entry to MCTX.
4125 Note that we assume that caller never call this function with duplicate
4126 entry, and call with STR_IDX which isn't smaller than any existing entry.
4129 static reg_errcode_t
4130 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4131 re_match_context_t
*mctx
;
4132 int node
, str_idx
, from
, to
;
4134 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4136 struct re_backref_cache_entry
* new_entry
;
4137 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4138 mctx
->abkref_ents
* 2);
4139 if (BE (new_entry
== NULL
, 0))
4141 re_free (mctx
->bkref_ents
);
4144 mctx
->bkref_ents
= new_entry
;
4145 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4146 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4147 mctx
->abkref_ents
*= 2;
4149 if (mctx
->nbkref_ents
> 0
4150 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4151 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4153 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4154 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4155 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4156 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4157 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4158 if (mctx
->max_mb_elem_len
< to
- from
)
4159 mctx
->max_mb_elem_len
= to
- from
;
4163 /* Search for the first entry which has the same str_idx, or -1 if none is
4164 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4167 search_cur_bkref_entry (mctx
, str_idx
)
4168 re_match_context_t
*mctx
;
4171 int left
, right
, mid
, last
;
4172 last
= right
= mctx
->nbkref_ents
;
4173 for (left
= 0; left
< right
;)
4175 mid
= (left
+ right
) / 2;
4176 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4181 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4187 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4190 static reg_errcode_t
4191 match_ctx_add_subtop (mctx
, node
, str_idx
)
4192 re_match_context_t
*mctx
;
4196 assert (mctx
->sub_tops
!= NULL
);
4197 assert (mctx
->asub_tops
> 0);
4199 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4201 int new_asub_tops
= mctx
->asub_tops
* 2;
4202 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4203 re_sub_match_top_t
*,
4205 if (BE (new_array
== NULL
, 0))
4207 mctx
->sub_tops
= new_array
;
4208 mctx
->asub_tops
= new_asub_tops
;
4210 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4211 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4213 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4214 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4218 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4219 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4221 static re_sub_match_last_t
*
4222 match_ctx_add_sublast (subtop
, node
, str_idx
)
4223 re_sub_match_top_t
*subtop
;
4226 re_sub_match_last_t
*new_entry
;
4227 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4229 int new_alasts
= 2 * subtop
->alasts
+ 1;
4230 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4231 re_sub_match_last_t
*,
4233 if (BE (new_array
== NULL
, 0))
4235 subtop
->lasts
= new_array
;
4236 subtop
->alasts
= new_alasts
;
4238 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4239 if (BE (new_entry
!= NULL
, 1))
4241 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4242 new_entry
->node
= node
;
4243 new_entry
->str_idx
= str_idx
;
4250 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
)
4251 re_sift_context_t
*sctx
;
4252 re_dfastate_t
**sifted_sts
, **limited_sts
;
4253 int last_node
, last_str_idx
;
4255 sctx
->sifted_states
= sifted_sts
;
4256 sctx
->limited_states
= limited_sts
;
4257 sctx
->last_node
= last_node
;
4258 sctx
->last_str_idx
= last_str_idx
;
4259 re_node_set_init_empty (&sctx
->limits
);