1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static void match_ctx_free_subtops (re_match_context_t
*mctx
)
27 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
28 int str_idx
, int from
, int to
)
30 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
)
32 static void match_ctx_clear_flag (re_match_context_t
*mctx
) internal_function
;
33 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
34 int str_idx
) internal_function
;
35 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
36 int node
, int str_idx
)
38 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
39 re_dfastate_t
**limited_sts
, int last_node
,
40 int last_str_idx
, int check_subexp
)
42 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
43 const char *string
, int length
,
44 int start
, int range
, int stop
,
45 size_t nmatch
, regmatch_t pmatch
[],
46 int eflags
) internal_function
;
47 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
48 const char *string1
, int length1
,
49 const char *string2
, int length2
,
50 int start
, int range
, struct re_registers
*regs
,
51 int stop
, int ret_len
) internal_function
;
52 static int re_search_stub (struct re_pattern_buffer
*bufp
,
53 const char *string
, int length
, int start
,
54 int range
, int stop
, struct re_registers
*regs
,
55 int ret_len
) internal_function
;
56 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
57 int nregs
, int regs_allocated
) internal_function
;
58 static inline re_dfastate_t
*acquire_init_state_context
59 (reg_errcode_t
*err
, const re_match_context_t
*mctx
, int idx
)
60 __attribute ((always_inline
)) internal_function
;
61 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
63 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
66 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
67 unsigned int context
) internal_function
;
68 static int check_halt_state_context (const re_match_context_t
*mctx
,
69 const re_dfastate_t
*state
, int idx
)
71 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
,
72 regmatch_t
*prev_idx_match
, int cur_node
,
73 int cur_idx
, int nmatch
) internal_function
;
74 static int proceed_next_node (const re_match_context_t
*mctx
,
75 int nregs
, regmatch_t
*regs
,
76 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
77 struct re_fail_stack_t
*fs
) internal_function
;
78 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
79 int str_idx
, int *dests
, int nregs
,
81 re_node_set
*eps_via_nodes
) internal_function
;
82 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
83 regmatch_t
*regs
, re_node_set
*eps_via_nodes
) internal_function
;
84 static reg_errcode_t
set_regs (const regex_t
*preg
,
85 const re_match_context_t
*mctx
,
86 size_t nmatch
, regmatch_t
*pmatch
,
87 int fl_backtrack
) internal_function
;
88 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
) internal_function
;
91 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
92 re_sift_context_t
*sctx
,
93 int node_idx
, int str_idx
, int max_str_idx
) internal_function
;
94 #endif /* RE_ENABLE_I18N */
95 static reg_errcode_t
sift_states_backward (re_match_context_t
*mctx
,
96 re_sift_context_t
*sctx
) internal_function
;
97 static reg_errcode_t
update_cur_sifted_state (re_match_context_t
*mctx
,
98 re_sift_context_t
*sctx
,
100 re_node_set
*dest_nodes
) internal_function
;
101 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
102 re_node_set
*dest_nodes
,
103 const re_node_set
*candidates
) internal_function
;
104 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
105 re_node_set
*dest_nodes
,
106 const re_node_set
*and_nodes
) internal_function
;
107 static int check_dst_limits (re_match_context_t
*mctx
, re_node_set
*limits
,
108 int dst_node
, int dst_idx
, int src_node
,
109 int src_idx
) internal_function
;
110 static int check_dst_limits_calc_pos (re_match_context_t
*mctx
,
111 int limit
, re_node_set
*eclosures
,
112 int subexp_idx
, int node
, int str_idx
) internal_function
;
113 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
114 re_node_set
*dest_nodes
,
115 const re_node_set
*candidates
,
117 struct re_backref_cache_entry
*bkref_ents
,
118 int str_idx
) internal_function
;
119 static reg_errcode_t
sift_states_bkref (re_match_context_t
*mctx
,
120 re_sift_context_t
*sctx
,
121 int str_idx
, re_node_set
*dest_nodes
) internal_function
;
122 static reg_errcode_t
clean_state_log_if_needed (re_match_context_t
*mctx
,
123 int next_state_log_idx
) internal_function
;
124 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
125 re_dfastate_t
**src
, int num
) internal_function
;
126 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
127 re_match_context_t
*mctx
,
128 re_dfastate_t
*state
) internal_function
;
129 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
130 re_node_set
*cur_nodes
,
131 int str_idx
) internal_function
;
133 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
134 re_match_context_t
*mctx
,
135 re_dfastate_t
*pstate
) internal_function
;
137 #ifdef RE_ENABLE_I18N
138 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
139 re_dfastate_t
*pstate
) internal_function
;
140 #endif /* RE_ENABLE_I18N */
141 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
142 const re_node_set
*nodes
) internal_function
;
143 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
144 int bkref_node
, int bkref_str_idx
) internal_function
;
145 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
146 const re_sub_match_top_t
*sub_top
,
147 re_sub_match_last_t
*sub_last
,
148 int bkref_node
, int bkref_str
) internal_function
;
149 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
150 int subexp_idx
, int type
) internal_function
;
151 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
152 state_array_t
*path
, int top_node
,
153 int top_str
, int last_node
, int last_str
,
154 int type
) internal_function
;
155 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
157 re_node_set
*cur_nodes
,
158 re_node_set
*next_nodes
) internal_function
;
159 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
160 re_node_set
*cur_nodes
,
161 int ex_subexp
, int type
) internal_function
;
162 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
163 re_node_set
*dst_nodes
,
164 int target
, int ex_subexp
,
165 int type
) internal_function
;
166 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
167 re_node_set
*cur_nodes
, int cur_str
,
168 int last_str
, int subexp_num
,
169 int type
) internal_function
;
170 static re_dfastate_t
**build_trtable (re_dfa_t
*dfa
,
171 re_dfastate_t
*state
) internal_function
;
172 #ifdef RE_ENABLE_I18N
173 static int check_node_accept_bytes (re_dfa_t
*dfa
, int node_idx
,
174 const re_string_t
*input
, int idx
) internal_function
;
176 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
177 size_t name_len
) internal_function
;
179 #endif /* RE_ENABLE_I18N */
180 static int group_nodes_into_DFAstates (re_dfa_t
*dfa
,
181 const re_dfastate_t
*state
,
182 re_node_set
*states_node
,
183 bitset
*states_ch
) internal_function
;
184 static int check_node_accept (const re_match_context_t
*mctx
,
185 const re_token_t
*node
, int idx
) internal_function
;
186 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
) internal_function
;
188 /* Entry point for POSIX code. */
190 /* regexec searches for a given pattern, specified by PREG, in the
193 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
194 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
195 least NMATCH elements, and we set them to the offsets of the
196 corresponding matched substrings.
198 EFLAGS specifies `execution flags' which affect matching: if
199 REG_NOTBOL is set, then ^ does not match at the beginning of the
200 string; if REG_NOTEOL is set, then $ does not match at the end.
202 We return 0 if we find a match and REG_NOMATCH if not. */
205 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
206 const regex_t
*__restrict preg
;
207 const char *__restrict string
;
213 int length
= strlen (string
);
215 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
218 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
220 return err
!= REG_NOERROR
;
223 weak_alias (__regexec
, regexec
)
226 /* Entry points for GNU code. */
228 /* re_match, re_search, re_match_2, re_search_2
230 The former two functions operate on STRING with length LENGTH,
231 while the later two operate on concatenation of STRING1 and STRING2
232 with lengths LENGTH1 and LENGTH2, respectively.
234 re_match() matches the compiled pattern in BUFP against the string,
235 starting at index START.
237 re_search() first tries matching at index START, then it tries to match
238 starting from index START + 1, and so on. The last start position tried
239 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
242 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
243 the first STOP characters of the concatenation of the strings should be
246 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
247 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
248 computed relative to the concatenation, not relative to the individual
251 On success, re_match* functions return the length of the match, re_search*
252 return the position of the start of the match. Return value -1 means no
253 match was found and -2 indicates an internal error. */
256 re_match (bufp
, string
, length
, start
, regs
)
257 struct re_pattern_buffer
*bufp
;
260 struct re_registers
*regs
;
262 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
265 weak_alias (__re_match
, re_match
)
269 re_search (bufp
, string
, length
, start
, range
, regs
)
270 struct re_pattern_buffer
*bufp
;
272 int length
, start
, range
;
273 struct re_registers
*regs
;
275 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
278 weak_alias (__re_search
, re_search
)
282 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
283 struct re_pattern_buffer
*bufp
;
284 const char *string1
, *string2
;
285 int length1
, length2
, start
, stop
;
286 struct re_registers
*regs
;
288 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
289 start
, 0, regs
, stop
, 1);
292 weak_alias (__re_match_2
, re_match_2
)
296 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
297 struct re_pattern_buffer
*bufp
;
298 const char *string1
, *string2
;
299 int length1
, length2
, start
, range
, stop
;
300 struct re_registers
*regs
;
302 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
303 start
, range
, regs
, stop
, 0);
306 weak_alias (__re_search_2
, re_search_2
)
310 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
312 struct re_pattern_buffer
*bufp
;
313 const char *string1
, *string2
;
314 int length1
, length2
, start
, range
, stop
, ret_len
;
315 struct re_registers
*regs
;
319 int len
= length1
+ length2
;
322 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
325 /* Concatenate the strings. */
329 char *s
= re_malloc (char, len
);
331 if (BE (s
== NULL
, 0))
333 memcpy (s
, string1
, length1
);
334 memcpy (s
+ length1
, string2
, length2
);
343 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
346 re_free ((char *) str
);
350 /* The parameters have the same meaning as those of re_search.
351 Additional parameters:
352 If RET_LEN is nonzero the length of the match is returned (re_match style);
353 otherwise the position of the match is returned. */
356 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
357 struct re_pattern_buffer
*bufp
;
359 int length
, start
, range
, stop
, ret_len
;
360 struct re_registers
*regs
;
362 reg_errcode_t result
;
367 /* Check for out-of-range. */
368 if (BE (start
< 0 || start
> length
, 0))
370 if (BE (start
+ range
> length
, 0))
371 range
= length
- start
;
372 else if (BE (start
+ range
< 0, 0))
375 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
376 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
378 /* Compile fastmap if we haven't yet. */
379 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
380 re_compile_fastmap (bufp
);
382 if (BE (bufp
->no_sub
, 0))
385 /* We need at least 1 register. */
388 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
389 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
391 nregs
= regs
->num_regs
;
392 if (BE (nregs
< 1, 0))
394 /* Nothing can be copied to regs. */
400 nregs
= bufp
->re_nsub
+ 1;
401 pmatch
= re_malloc (regmatch_t
, nregs
);
402 if (BE (pmatch
== NULL
, 0))
405 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
406 nregs
, pmatch
, eflags
);
410 /* I hope we needn't fill ther regs with -1's when no match was found. */
411 if (result
!= REG_NOERROR
)
413 else if (regs
!= NULL
)
415 /* If caller wants register contents data back, copy them. */
416 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
417 bufp
->regs_allocated
);
418 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
422 if (BE (rval
== 0, 1))
426 assert (pmatch
[0].rm_so
== start
);
427 rval
= pmatch
[0].rm_eo
- start
;
430 rval
= pmatch
[0].rm_so
;
437 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
438 struct re_registers
*regs
;
440 int nregs
, regs_allocated
;
442 int rval
= REGS_REALLOCATE
;
444 int need_regs
= nregs
+ 1;
445 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
448 /* Have the register data arrays been allocated? */
449 if (regs_allocated
== REGS_UNALLOCATED
)
450 { /* No. So allocate them with malloc. */
451 regs
->start
= re_malloc (regoff_t
, need_regs
);
452 regs
->end
= re_malloc (regoff_t
, need_regs
);
453 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
454 return REGS_UNALLOCATED
;
455 regs
->num_regs
= need_regs
;
457 else if (regs_allocated
== REGS_REALLOCATE
)
458 { /* Yes. If we need more elements than were already
459 allocated, reallocate them. If we need fewer, just
461 if (BE (need_regs
> regs
->num_regs
, 0))
463 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
464 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
465 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
466 return REGS_UNALLOCATED
;
467 regs
->start
= new_start
;
469 regs
->num_regs
= need_regs
;
474 assert (regs_allocated
== REGS_FIXED
);
475 /* This function may not be called with REGS_FIXED and nregs too big. */
476 assert (regs
->num_regs
>= nregs
);
481 for (i
= 0; i
< nregs
; ++i
)
483 regs
->start
[i
] = pmatch
[i
].rm_so
;
484 regs
->end
[i
] = pmatch
[i
].rm_eo
;
486 for ( ; i
< regs
->num_regs
; ++i
)
487 regs
->start
[i
] = regs
->end
[i
] = -1;
492 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
493 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
494 this memory for recording register information. STARTS and ENDS
495 must be allocated using the malloc library routine, and must each
496 be at least NUM_REGS * sizeof (regoff_t) bytes long.
498 If NUM_REGS == 0, then subsequent matches should allocate their own
501 Unless this function is called, the first search or match using
502 PATTERN_BUFFER will allocate its own register data, without
503 freeing the old data. */
506 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
507 struct re_pattern_buffer
*bufp
;
508 struct re_registers
*regs
;
510 regoff_t
*starts
, *ends
;
514 bufp
->regs_allocated
= REGS_REALLOCATE
;
515 regs
->num_regs
= num_regs
;
516 regs
->start
= starts
;
521 bufp
->regs_allocated
= REGS_UNALLOCATED
;
523 regs
->start
= regs
->end
= (regoff_t
*) 0;
527 weak_alias (__re_set_registers
, re_set_registers
)
530 /* Entry points compatible with 4.2 BSD regex library. We don't define
531 them unless specifically requested. */
533 #if defined _REGEX_RE_COMP || defined _LIBC
541 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
543 #endif /* _REGEX_RE_COMP */
545 static re_node_set empty_set
;
547 /* Internal entry point. */
549 /* Searches for a compiled pattern PREG in the string STRING, whose
550 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
551 mingings with regexec. START, and RANGE have the same meanings
553 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
554 otherwise return the error code.
555 Note: We assume front end functions already check ranges.
556 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
559 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
563 int length
, start
, range
, stop
, eflags
;
568 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
569 int left_lim
, right_lim
, incr
;
570 int fl_longest_match
, match_first
, match_last
= -1;
571 int fast_translate
, sb
;
572 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
573 re_match_context_t mctx
= { .dfa
= dfa
};
575 re_match_context_t mctx
;
577 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
578 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
580 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
581 memset (&mctx
, '\0', sizeof (re_match_context_t
));
585 /* Check if the DFA haven't been compiled. */
586 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
587 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
588 || dfa
->init_state_begbuf
== NULL
, 0))
592 /* We assume front-end functions already check them. */
593 assert (start
+ range
>= 0 && start
+ range
<= length
);
596 /* If initial states with non-begbuf contexts have no elements,
597 the regex must be anchored. If preg->newline_anchor is set,
598 we'll never use init_state_nl, so do not check it. */
599 if (dfa
->init_state
->nodes
.nelem
== 0
600 && dfa
->init_state_word
->nodes
.nelem
== 0
601 && (dfa
->init_state_nl
->nodes
.nelem
== 0
602 || !preg
->newline_anchor
))
604 if (start
!= 0 && start
+ range
!= 0)
609 re_node_set_init_empty (&empty_set
);
611 /* We must check the longest matching, if nmatch > 0. */
612 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
614 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
615 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
616 if (BE (err
!= REG_NOERROR
, 0))
618 mctx
.input
.stop
= stop
;
619 mctx
.input
.raw_stop
= stop
;
620 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
622 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
623 if (BE (err
!= REG_NOERROR
, 0))
626 /* We will log all the DFA states through which the dfa pass,
627 if nmatch > 1, or this dfa has "multibyte node", which is a
628 back-reference or a node which can accept multibyte character or
629 multi character collating element. */
630 if (nmatch
> 1 || dfa
->has_mb_node
)
632 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
633 if (BE (mctx
.state_log
== NULL
, 0))
640 mctx
.state_log
= NULL
;
643 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
644 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
646 /* Check incrementally whether of not the input string match. */
647 incr
= (range
< 0) ? -1 : 1;
648 left_lim
= (range
< 0) ? start
+ range
: start
;
649 right_lim
= (range
< 0) ? start
: start
+ range
;
650 sb
= dfa
->mb_cur_max
== 1;
651 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
655 /* At first get the current byte from input string. */
658 if (BE (fast_translate
, 1))
660 unsigned RE_TRANSLATE_TYPE t
661 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
662 if (BE (range
>= 0, 1))
664 if (BE (t
!= NULL
, 0))
666 while (BE (match_first
< right_lim
, 1)
667 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
672 while (BE (match_first
< right_lim
, 1)
673 && !fastmap
[(unsigned char) string
[match_first
]])
676 if (BE (match_first
== right_lim
, 0))
678 int ch
= match_first
>= length
679 ? 0 : (unsigned char) string
[match_first
];
680 if (!fastmap
[t
? t
[ch
] : ch
])
686 while (match_first
>= left_lim
)
688 int ch
= match_first
>= length
689 ? 0 : (unsigned char) string
[match_first
];
690 if (fastmap
[t
? t
[ch
] : ch
])
694 if (match_first
< left_lim
)
704 /* In this case, we can't determine easily the current byte,
705 since it might be a component byte of a multibyte
706 character. Then we use the constructed buffer
708 /* If MATCH_FIRST is out of the valid range, reconstruct the
710 if (mctx
.input
.raw_mbs_idx
+ mctx
.input
.valid_raw_len
712 || match_first
< mctx
.input
.raw_mbs_idx
)
714 err
= re_string_reconstruct (&mctx
.input
, match_first
,
716 if (BE (err
!= REG_NOERROR
, 0))
719 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
720 Note that MATCH_FIRST must not be smaller than 0. */
721 ch
= ((match_first
>= length
) ? 0
722 : re_string_byte_at (&mctx
.input
,
724 - mctx
.input
.raw_mbs_idx
));
729 while (match_first
>= left_lim
&& match_first
<= right_lim
);
735 /* Reconstruct the buffers so that the matcher can assume that
736 the matching starts from the beginning of the buffer. */
737 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
738 if (BE (err
!= REG_NOERROR
, 0))
740 #ifdef RE_ENABLE_I18N
741 /* Eliminate it when it is a component of a multibyte character
742 and isn't the head of a multibyte character. */
743 if (sb
|| re_string_first_byte (&mctx
.input
, 0))
746 /* It seems to be appropriate one, then use the matcher. */
747 /* We assume that the matching starts from 0. */
748 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
749 match_last
= check_matching (&mctx
, fl_longest_match
,
750 range
>= 0 ? &match_first
: NULL
);
751 if (match_last
!= -1)
753 if (BE (match_last
== -2, 0))
760 mctx
.match_last
= match_last
;
761 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
763 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
764 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
767 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
770 err
= prune_impossible_nodes (&mctx
);
771 if (err
== REG_NOERROR
)
773 if (BE (err
!= REG_NOMATCH
, 0))
778 break; /* We found a match. */
781 match_ctx_clean (&mctx
);
783 /* Update counter. */
785 if (match_first
< left_lim
|| right_lim
< match_first
)
789 /* Set pmatch[] if we need. */
790 if (match_last
!= -1 && nmatch
> 0)
794 /* Initialize registers. */
795 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
796 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
798 /* Set the points where matching start/end. */
800 pmatch
[0].rm_eo
= mctx
.match_last
;
802 if (!preg
->no_sub
&& nmatch
> 1)
804 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
805 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
806 if (BE (err
!= REG_NOERROR
, 0))
810 /* At last, add the offset to the each registers, since we slided
811 the buffers so that we could assume that the matching starts
813 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
814 if (pmatch
[reg_idx
].rm_so
!= -1)
816 #ifdef RE_ENABLE_I18N
817 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
819 if (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
)
820 pmatch
[reg_idx
].rm_so
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
822 pmatch
[reg_idx
].rm_so
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
];
823 if (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
)
824 pmatch
[reg_idx
].rm_eo
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
826 pmatch
[reg_idx
].rm_eo
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
];
829 assert (mctx
.input
.offsets_needed
== 0);
831 pmatch
[reg_idx
].rm_so
+= match_first
;
832 pmatch
[reg_idx
].rm_eo
+= match_first
;
835 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
837 re_free (mctx
.state_log
);
839 match_ctx_free (&mctx
);
840 re_string_destruct (&mctx
.input
);
845 prune_impossible_nodes (mctx
)
846 re_match_context_t
*mctx
;
848 re_dfa_t
*const dfa
= mctx
->dfa
;
849 int halt_node
, match_last
;
851 re_dfastate_t
**sifted_states
;
852 re_dfastate_t
**lim_states
= NULL
;
853 re_sift_context_t sctx
;
855 assert (mctx
->state_log
!= NULL
);
857 match_last
= mctx
->match_last
;
858 halt_node
= mctx
->last_node
;
859 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
860 if (BE (sifted_states
== NULL
, 0))
867 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
868 if (BE (lim_states
== NULL
, 0))
875 memset (lim_states
, '\0',
876 sizeof (re_dfastate_t
*) * (match_last
+ 1));
877 match_ctx_clear_flag (mctx
);
878 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
880 ret
= sift_states_backward (mctx
, &sctx
);
881 re_node_set_free (&sctx
.limits
);
882 if (BE (ret
!= REG_NOERROR
, 0))
884 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
894 } while (mctx
->state_log
[match_last
] == NULL
895 || !mctx
->state_log
[match_last
]->halt
);
896 halt_node
= check_halt_state_context (mctx
,
897 mctx
->state_log
[match_last
],
900 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
902 re_free (lim_states
);
904 if (BE (ret
!= REG_NOERROR
, 0))
909 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
911 ret
= sift_states_backward (mctx
, &sctx
);
912 re_node_set_free (&sctx
.limits
);
913 if (BE (ret
!= REG_NOERROR
, 0))
916 re_free (mctx
->state_log
);
917 mctx
->state_log
= sifted_states
;
918 sifted_states
= NULL
;
919 mctx
->last_node
= halt_node
;
920 mctx
->match_last
= match_last
;
923 re_free (sifted_states
);
924 re_free (lim_states
);
928 /* Acquire an initial state and return it.
929 We must select appropriate initial state depending on the context,
930 since initial states may have constraints like "\<", "^", etc.. */
932 static inline re_dfastate_t
*
933 acquire_init_state_context (err
, mctx
, idx
)
935 const re_match_context_t
*mctx
;
938 re_dfa_t
*const dfa
= mctx
->dfa
;
940 if (dfa
->init_state
->has_constraint
)
942 unsigned int context
;
943 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
944 if (IS_WORD_CONTEXT (context
))
945 return dfa
->init_state_word
;
946 else if (IS_ORDINARY_CONTEXT (context
))
947 return dfa
->init_state
;
948 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
949 return dfa
->init_state_begbuf
;
950 else if (IS_NEWLINE_CONTEXT (context
))
951 return dfa
->init_state_nl
;
952 else if (IS_BEGBUF_CONTEXT (context
))
954 /* It is relatively rare case, then calculate on demand. */
955 return re_acquire_state_context (err
, dfa
,
956 dfa
->init_state
->entrance_nodes
,
960 /* Must not happen? */
961 return dfa
->init_state
;
964 return dfa
->init_state
;
967 /* Check whether the regular expression match input string INPUT or not,
968 and return the index where the matching end, return -1 if not match,
969 or return -2 in case of an error.
970 FL_LONGEST_MATCH means we want the POSIX longest matching.
971 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
972 next place where we may want to try matching.
973 Note that the matcher assume that the maching starts from the current
974 index of the buffer. */
977 check_matching (mctx
, fl_longest_match
, p_match_first
)
978 re_match_context_t
*mctx
;
979 int fl_longest_match
;
982 re_dfa_t
*const dfa
= mctx
->dfa
;
986 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
987 re_dfastate_t
*cur_state
;
988 int at_init_state
= p_match_first
!= NULL
, skipped
= 0;
990 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
991 /* An initial state must not be NULL(invalid state). */
992 if (BE (cur_state
== NULL
, 0))
994 if (mctx
->state_log
!= NULL
)
995 mctx
->state_log
[cur_str_idx
] = cur_state
;
997 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
998 later. E.g. Processing back references. */
999 if (BE (dfa
->nbackref
, 0))
1002 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1003 if (BE (err
!= REG_NOERROR
, 0))
1006 if (cur_state
->has_backref
)
1008 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1009 if (BE (err
!= REG_NOERROR
, 0))
1014 /* If the RE accepts NULL string. */
1015 if (BE (cur_state
->halt
, 0))
1017 if (!cur_state
->has_constraint
1018 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1020 if (!fl_longest_match
)
1024 match_last
= cur_str_idx
;
1030 while (!re_string_eoi (&mctx
->input
))
1032 re_dfastate_t
*old_state
= cur_state
;
1033 cur_state
= transit_state (&err
, mctx
, cur_state
);
1036 if (old_state
== cur_state
)
1042 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
1044 cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1045 if (BE (err
!= REG_NOERROR
, 0))
1047 if (!fl_longest_match
&& match
)
1051 if (mctx
->state_log
== NULL
)
1055 int max
= mctx
->state_log_top
;
1056 for (; cur_str_idx
<= max
; ++cur_str_idx
)
1057 if (mctx
->state_log
[cur_str_idx
] != NULL
)
1059 if (cur_str_idx
> max
)
1065 if (cur_state
!= NULL
&& cur_state
->halt
)
1067 /* Reached at a halt state.
1068 Check the halt state can satisfy the current context. */
1069 if (!cur_state
->has_constraint
1070 || check_halt_state_context (mctx
, cur_state
,
1071 re_string_cur_idx (&mctx
->input
)))
1073 /* We found an appropriate halt state. */
1074 match_last
= re_string_cur_idx (&mctx
->input
);
1076 if (!fl_longest_match
)
1082 if (match_last
== -1 && skipped
)
1083 *p_match_first
+= skipped
;
1088 /* Check NODE match the current context. */
1090 static int check_halt_node_context (dfa
, node
, context
)
1091 const re_dfa_t
*dfa
;
1093 unsigned int context
;
1095 re_token_type_t type
= dfa
->nodes
[node
].type
;
1096 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1097 if (type
!= END_OF_RE
)
1101 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1106 /* Check the halt state STATE match the current context.
1107 Return 0 if not match, if the node, STATE has, is a halt node and
1108 match the context, return the node. */
1111 check_halt_state_context (mctx
, state
, idx
)
1112 const re_match_context_t
*mctx
;
1113 const re_dfastate_t
*state
;
1117 unsigned int context
;
1119 assert (state
->halt
);
1121 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1122 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1123 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1124 return state
->nodes
.elems
[i
];
1128 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1129 corresponding to the DFA).
1130 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1134 proceed_next_node (mctx
, nregs
, regs
, pidx
, node
, eps_via_nodes
, fs
)
1135 const re_match_context_t
*mctx
;
1137 int nregs
, *pidx
, node
;
1138 re_node_set
*eps_via_nodes
;
1139 struct re_fail_stack_t
*fs
;
1141 re_dfa_t
*const dfa
= mctx
->dfa
;
1142 int i
, err
, dest_node
;
1144 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1146 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1147 int ndest
, dest_nodes
[2];
1148 err
= re_node_set_insert (eps_via_nodes
, node
);
1149 if (BE (err
< 0, 0))
1151 /* Pick up valid destinations. */
1152 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1154 int candidate
= dfa
->edests
[node
].elems
[i
];
1155 if (!re_node_set_contains (cur_nodes
, candidate
))
1157 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1158 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1162 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1163 /* In order to avoid infinite loop like "(a*)*". */
1164 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1165 return dest_nodes
[1];
1167 && push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
,
1170 return dest_nodes
[0];
1175 re_token_type_t type
= dfa
->nodes
[node
].type
;
1177 #ifdef RE_ENABLE_I18N
1178 if (ACCEPT_MB_NODE (type
))
1179 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1181 #endif /* RE_ENABLE_I18N */
1182 if (type
== OP_BACK_REF
)
1184 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1185 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1188 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1192 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1193 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1201 err
= re_node_set_insert (eps_via_nodes
, node
);
1202 if (BE (err
< 0, 0))
1204 dest_node
= dfa
->edests
[node
].elems
[0];
1205 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1212 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1214 dest_node
= dfa
->nexts
[node
];
1215 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1216 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1217 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1220 re_node_set_empty (eps_via_nodes
);
1227 static reg_errcode_t
1228 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1229 struct re_fail_stack_t
*fs
;
1230 int str_idx
, *dests
, nregs
;
1232 re_node_set
*eps_via_nodes
;
1235 int num
= fs
->num
++;
1236 if (fs
->num
== fs
->alloc
)
1238 struct re_fail_stack_ent_t
*new_array
;
1239 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1241 if (new_array
== NULL
)
1244 fs
->stack
= new_array
;
1246 fs
->stack
[num
].idx
= str_idx
;
1247 fs
->stack
[num
].node
= dests
[1];
1248 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1249 if (fs
->stack
[num
].regs
== NULL
)
1251 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1252 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1257 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1258 struct re_fail_stack_t
*fs
;
1261 re_node_set
*eps_via_nodes
;
1263 int num
= --fs
->num
;
1265 *pidx
= fs
->stack
[num
].idx
;
1266 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1267 re_node_set_free (eps_via_nodes
);
1268 re_free (fs
->stack
[num
].regs
);
1269 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1270 return fs
->stack
[num
].node
;
1273 /* Set the positions where the subexpressions are starts/ends to registers
1275 Note: We assume that pmatch[0] is already set, and
1276 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1278 static reg_errcode_t
1279 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1280 const regex_t
*preg
;
1281 const re_match_context_t
*mctx
;
1286 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1287 int idx
, cur_node
, real_nmatch
;
1288 re_node_set eps_via_nodes
;
1289 struct re_fail_stack_t
*fs
;
1290 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1291 regmatch_t
*prev_idx_match
;
1294 assert (nmatch
> 1);
1295 assert (mctx
->state_log
!= NULL
);
1300 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1301 if (fs
->stack
== NULL
)
1307 cur_node
= dfa
->init_node
;
1308 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1309 re_node_set_init_empty (&eps_via_nodes
);
1311 prev_idx_match
= (regmatch_t
*) alloca (sizeof (regmatch_t
) * real_nmatch
);
1312 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * real_nmatch
);
1314 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1316 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, real_nmatch
);
1318 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1323 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1324 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1326 if (reg_idx
== nmatch
)
1328 re_node_set_free (&eps_via_nodes
);
1329 return free_fail_stack_return (fs
);
1331 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1336 re_node_set_free (&eps_via_nodes
);
1341 /* Proceed to next node. */
1342 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1343 &eps_via_nodes
, fs
);
1345 if (BE (cur_node
< 0, 0))
1347 if (BE (cur_node
== -2, 0))
1349 re_node_set_free (&eps_via_nodes
);
1350 free_fail_stack_return (fs
);
1354 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1358 re_node_set_free (&eps_via_nodes
);
1363 re_node_set_free (&eps_via_nodes
);
1364 return free_fail_stack_return (fs
);
1367 static reg_errcode_t
1368 free_fail_stack_return (fs
)
1369 struct re_fail_stack_t
*fs
;
1374 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1376 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1377 re_free (fs
->stack
[fs_idx
].regs
);
1379 re_free (fs
->stack
);
1385 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, cur_idx
, nmatch
)
1387 regmatch_t
*pmatch
, *prev_idx_match
;
1388 int cur_node
, cur_idx
, nmatch
;
1390 int type
= dfa
->nodes
[cur_node
].type
;
1391 if (type
== OP_OPEN_SUBEXP
)
1393 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1395 /* We are at the first node of this sub expression. */
1396 if (reg_num
< nmatch
)
1398 pmatch
[reg_num
].rm_so
= cur_idx
;
1399 pmatch
[reg_num
].rm_eo
= -1;
1402 else if (type
== OP_CLOSE_SUBEXP
)
1404 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1405 if (reg_num
< nmatch
)
1407 /* We are at the last node of this sub expression. */
1408 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1410 pmatch
[reg_num
].rm_eo
= cur_idx
;
1411 /* This is a non-empty match or we are not inside an optional
1412 subexpression. Accept this right away. */
1413 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1417 if (dfa
->nodes
[cur_node
].opt_subexp
1418 && prev_idx_match
[reg_num
].rm_so
!= -1)
1419 /* We transited through an empty match for an optional
1420 subexpression, like (a?)*, and this is not the subexp's
1421 first match. Copy back the old content of the registers
1422 so that matches of an inner subexpression are undone as
1423 well, like in ((a?))*. */
1424 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1426 /* We completed a subexpression, but it may be part of
1427 an optional one, so do not update PREV_IDX_MATCH. */
1428 pmatch
[reg_num
].rm_eo
= cur_idx
;
1434 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1435 and sift the nodes in each states according to the following rules.
1436 Updated state_log will be wrote to STATE_LOG.
1438 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1439 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1440 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1441 the LAST_NODE, we throw away the node `a'.
1442 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1443 string `s' and transit to `b':
1444 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1446 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1447 thrown away, we throw away the node `a'.
1448 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1449 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1451 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1452 we throw away the node `a'. */
1454 #define STATE_NODE_CONTAINS(state,node) \
1455 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1457 static reg_errcode_t
1458 sift_states_backward (mctx
, sctx
)
1459 re_match_context_t
*mctx
;
1460 re_sift_context_t
*sctx
;
1462 re_dfa_t
*const dfa
= mctx
->dfa
;
1465 int str_idx
= sctx
->last_str_idx
;
1466 re_node_set cur_dest
;
1467 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1470 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1472 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1474 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1475 transit to the last_node and the last_node itself. */
1476 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1477 if (BE (err
!= REG_NOERROR
, 0))
1479 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1480 if (BE (err
!= REG_NOERROR
, 0))
1483 /* Then check each states in the state_log. */
1487 /* Update counters. */
1488 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1489 if (null_cnt
> mctx
->max_mb_elem_len
)
1491 memset (sctx
->sifted_states
, '\0',
1492 sizeof (re_dfastate_t
*) * str_idx
);
1493 re_node_set_free (&cur_dest
);
1496 re_node_set_empty (&cur_dest
);
1498 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1499 : &mctx
->state_log
[str_idx
]->nodes
);
1501 /* Then build the next sifted state.
1502 We build the next sifted state on `cur_dest', and update
1503 `sifted_states[str_idx]' with `cur_dest'.
1505 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1506 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1507 for (i
= 0; i
< cur_src
->nelem
; i
++)
1509 int prev_node
= cur_src
->elems
[i
];
1511 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1513 if (IS_EPSILON_NODE (type
))
1515 #ifdef RE_ENABLE_I18N
1516 /* If the node may accept `multi byte'. */
1517 if (ACCEPT_MB_NODE (type
))
1518 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1519 str_idx
, sctx
->last_str_idx
);
1521 #endif /* RE_ENABLE_I18N */
1522 /* We don't check backreferences here.
1523 See update_cur_sifted_state(). */
1526 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1527 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1528 dfa
->nexts
[prev_node
]))
1534 if (sctx
->limits
.nelem
)
1536 int to_idx
= str_idx
+ naccepted
;
1537 if (check_dst_limits (mctx
, &sctx
->limits
,
1538 dfa
->nexts
[prev_node
], to_idx
,
1539 prev_node
, str_idx
))
1542 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1543 if (BE (ret
== -1, 0))
1550 /* Add all the nodes which satisfy the following conditions:
1551 - It can epsilon transit to a node in CUR_DEST.
1553 And update state_log. */
1554 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1555 if (BE (err
!= REG_NOERROR
, 0))
1560 re_node_set_free (&cur_dest
);
1564 /* Helper functions. */
1566 static reg_errcode_t
1567 clean_state_log_if_needed (mctx
, next_state_log_idx
)
1568 re_match_context_t
*mctx
;
1569 int next_state_log_idx
;
1571 int top
= mctx
->state_log_top
;
1573 if (next_state_log_idx
>= mctx
->input
.bufs_len
1574 || (next_state_log_idx
>= mctx
->input
.valid_len
1575 && mctx
->input
.valid_len
< mctx
->input
.len
))
1578 err
= extend_buffers (mctx
);
1579 if (BE (err
!= REG_NOERROR
, 0))
1583 if (top
< next_state_log_idx
)
1585 memset (mctx
->state_log
+ top
+ 1, '\0',
1586 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1587 mctx
->state_log_top
= next_state_log_idx
;
1592 static reg_errcode_t
1593 merge_state_array (dfa
, dst
, src
, num
)
1595 re_dfastate_t
**dst
;
1596 re_dfastate_t
**src
;
1601 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1603 if (dst
[st_idx
] == NULL
)
1604 dst
[st_idx
] = src
[st_idx
];
1605 else if (src
[st_idx
] != NULL
)
1607 re_node_set merged_set
;
1608 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1609 &src
[st_idx
]->nodes
);
1610 if (BE (err
!= REG_NOERROR
, 0))
1612 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1613 re_node_set_free (&merged_set
);
1614 if (BE (err
!= REG_NOERROR
, 0))
1621 static reg_errcode_t
1622 update_cur_sifted_state (mctx
, sctx
, str_idx
, dest_nodes
)
1623 re_match_context_t
*mctx
;
1624 re_sift_context_t
*sctx
;
1626 re_node_set
*dest_nodes
;
1628 re_dfa_t
*const dfa
= mctx
->dfa
;
1630 const re_node_set
*candidates
;
1631 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1632 : &mctx
->state_log
[str_idx
]->nodes
);
1634 /* At first, add the nodes which can epsilon transit to a node in
1636 if (dest_nodes
->nelem
)
1638 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1639 if (BE (err
!= REG_NOERROR
, 0))
1643 /* Then, check the limitations in the current sift_context. */
1644 if (dest_nodes
->nelem
&& sctx
->limits
.nelem
)
1646 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1647 mctx
->bkref_ents
, str_idx
);
1648 if (BE (err
!= REG_NOERROR
, 0))
1652 /* Update state_log. */
1653 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1654 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1657 if ((mctx
->state_log
[str_idx
] != NULL
1658 && mctx
->state_log
[str_idx
]->has_backref
))
1660 err
= sift_states_bkref (mctx
, sctx
, str_idx
, dest_nodes
);
1661 if (BE (err
!= REG_NOERROR
, 0))
1667 static reg_errcode_t
1668 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1670 re_node_set
*dest_nodes
;
1671 const re_node_set
*candidates
;
1675 re_node_set src_copy
;
1677 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1678 if (BE (err
!= REG_NOERROR
, 0))
1680 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1682 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1684 + src_copy
.elems
[src_idx
]);
1685 if (BE (err
!= REG_NOERROR
, 0))
1687 re_node_set_free (&src_copy
);
1691 re_node_set_free (&src_copy
);
1695 static reg_errcode_t
1696 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1699 re_node_set
*dest_nodes
;
1700 const re_node_set
*candidates
;
1704 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1705 re_node_set except_nodes
;
1706 re_node_set_init_empty (&except_nodes
);
1707 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1709 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1710 if (cur_node
== node
)
1712 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1714 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1715 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1716 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1717 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1718 && re_node_set_contains (dest_nodes
, edst1
))
1720 && !re_node_set_contains (inv_eclosure
, edst2
)
1721 && re_node_set_contains (dest_nodes
, edst2
)))
1723 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1724 dfa
->inveclosures
+ cur_node
);
1725 if (BE (err
!= REG_NOERROR
, 0))
1727 re_node_set_free (&except_nodes
);
1733 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1735 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1736 if (!re_node_set_contains (&except_nodes
, cur_node
))
1738 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1739 re_node_set_remove_at (dest_nodes
, idx
);
1742 re_node_set_free (&except_nodes
);
1747 check_dst_limits (mctx
, limits
, dst_node
, dst_idx
, src_node
, src_idx
)
1748 re_match_context_t
*mctx
;
1749 re_node_set
*limits
;
1750 int dst_node
, dst_idx
, src_node
, src_idx
;
1752 re_dfa_t
*const dfa
= mctx
->dfa
;
1753 int lim_idx
, src_pos
, dst_pos
;
1755 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1758 struct re_backref_cache_entry
*ent
;
1759 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1760 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1762 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1763 dfa
->eclosures
+ dst_node
,
1764 subexp_idx
, dst_node
, dst_idx
);
1765 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1766 dfa
->eclosures
+ src_node
,
1767 subexp_idx
, src_node
, src_idx
);
1770 <src> <dst> ( <subexp> )
1771 ( <subexp> ) <src> <dst>
1772 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1773 if (src_pos
== dst_pos
)
1774 continue; /* This is unrelated limitation. */
1782 check_dst_limits_calc_pos (mctx
, limit
, eclosures
, subexp_idx
, from_node
,
1784 re_match_context_t
*mctx
;
1785 re_node_set
*eclosures
;
1786 int limit
, subexp_idx
, from_node
, str_idx
;
1788 re_dfa_t
*const dfa
= mctx
->dfa
;
1789 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1792 /* If we are outside the range of the subexpression, return -1 or 1. */
1793 if (str_idx
< lim
->subexp_from
)
1796 if (lim
->subexp_to
< str_idx
)
1799 /* If we are within the subexpression, return 0. */
1800 if (str_idx
!= lim
->subexp_from
&& str_idx
!= lim
->subexp_to
)
1803 /* Else, we are on the boundary: examine the nodes on the epsilon
1805 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1807 int node
= eclosures
->elems
[node_idx
];
1808 switch (dfa
->nodes
[node
].type
)
1812 int bi
= search_cur_bkref_entry (mctx
, str_idx
);
1813 for (; bi
< mctx
->nbkref_ents
; ++bi
)
1815 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1818 /* If this backreference goes beyond the point we're
1819 examining, don't go any further. */
1820 if (ent
->str_idx
> str_idx
)
1823 if (ent
->node
!= node
|| ent
->subexp_from
!= ent
->subexp_to
)
1826 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1827 OP_CLOSE_SUBEXP cases below. But, if the
1828 destination node is the same node as the source
1829 node, don't recurse because it would cause an
1830 infinite loop: a regex that exhibits this behavior
1832 dst
= dfa
->edests
[node
].elems
[0];
1833 if (dst
== from_node
)
1835 if (str_idx
== lim
->subexp_from
)
1837 else /* if (str_idx == lim->subexp_to) */
1841 cpos
= check_dst_limits_calc_pos (mctx
, limit
,
1842 dfa
->eclosures
+ dst
,
1846 if (cpos
== -1 && str_idx
== lim
->subexp_from
)
1849 if (cpos
== 0 /* && str_idx == lim->lim->subexp_to */)
1855 case OP_OPEN_SUBEXP
:
1856 if (str_idx
== lim
->subexp_from
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1860 case OP_CLOSE_SUBEXP
:
1861 if (str_idx
== lim
->subexp_to
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1870 if (str_idx
== lim
->subexp_to
)
1876 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1877 which are against limitations from DEST_NODES. */
1879 static reg_errcode_t
1880 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1882 re_node_set
*dest_nodes
;
1883 const re_node_set
*candidates
;
1884 re_node_set
*limits
;
1885 struct re_backref_cache_entry
*bkref_ents
;
1889 int node_idx
, lim_idx
;
1891 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1894 struct re_backref_cache_entry
*ent
;
1895 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1897 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1898 continue; /* This is unrelated limitation. */
1900 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1901 if (ent
->subexp_to
== str_idx
)
1905 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1907 int node
= dest_nodes
->elems
[node_idx
];
1908 re_token_type_t type
= dfa
->nodes
[node
].type
;
1909 if (type
== OP_OPEN_SUBEXP
1910 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1912 else if (type
== OP_CLOSE_SUBEXP
1913 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1917 /* Check the limitation of the open subexpression. */
1918 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1921 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
1923 if (BE (err
!= REG_NOERROR
, 0))
1927 /* Check the limitation of the close subexpression. */
1929 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1931 int node
= dest_nodes
->elems
[node_idx
];
1932 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
1934 && !re_node_set_contains (dfa
->eclosures
+ node
,
1937 /* It is against this limitation.
1938 Remove it form the current sifted state. */
1939 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1941 if (BE (err
!= REG_NOERROR
, 0))
1947 else /* (ent->subexp_to != str_idx) */
1949 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1951 int node
= dest_nodes
->elems
[node_idx
];
1952 re_token_type_t type
= dfa
->nodes
[node
].type
;
1953 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1955 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1957 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1958 || (type
== OP_OPEN_SUBEXP
))
1960 /* It is against this limitation.
1961 Remove it form the current sifted state. */
1962 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1964 if (BE (err
!= REG_NOERROR
, 0))
1974 static reg_errcode_t
1975 sift_states_bkref (mctx
, sctx
, str_idx
, dest_nodes
)
1976 re_match_context_t
*mctx
;
1977 re_sift_context_t
*sctx
;
1979 re_node_set
*dest_nodes
;
1981 re_dfa_t
*const dfa
= mctx
->dfa
;
1984 re_sift_context_t local_sctx
;
1985 const re_node_set
*candidates
;
1986 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1987 : &mctx
->state_log
[str_idx
]->nodes
);
1988 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1990 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
1992 int cur_bkref_idx
= re_string_cur_idx (&mctx
->input
);
1993 re_token_type_t type
;
1994 node
= candidates
->elems
[node_idx
];
1995 type
= dfa
->nodes
[node
].type
;
1996 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
1998 /* Avoid infinite loop for the REs like "()\1+". */
1999 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2001 if (type
== OP_BACK_REF
)
2003 int enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2004 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2006 int disabled_idx
, subexp_len
, to_idx
, dst_node
;
2007 struct re_backref_cache_entry
*entry
;
2008 entry
= mctx
->bkref_ents
+ enabled_idx
;
2009 if (entry
->str_idx
> str_idx
)
2011 if (entry
->node
!= node
)
2013 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2014 to_idx
= str_idx
+ subexp_len
;
2015 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2016 : dfa
->edests
[node
].elems
[0]);
2018 if (to_idx
> sctx
->last_str_idx
2019 || sctx
->sifted_states
[to_idx
] == NULL
2020 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
],
2022 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2023 str_idx
, dst_node
, to_idx
))
2026 re_dfastate_t
*cur_state
;
2028 for (disabled_idx
= enabled_idx
+ 1;
2029 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
2031 struct re_backref_cache_entry
*entry2
;
2032 entry2
= mctx
->bkref_ents
+ disabled_idx
;
2033 if (entry2
->str_idx
> str_idx
)
2035 entry2
->flag
= (entry2
->node
== node
) ? 1 : entry2
->flag
;
2038 if (local_sctx
.sifted_states
== NULL
)
2041 err
= re_node_set_init_copy (&local_sctx
.limits
,
2043 if (BE (err
!= REG_NOERROR
, 0))
2046 local_sctx
.last_node
= node
;
2047 local_sctx
.last_str_idx
= str_idx
;
2048 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2049 if (BE (err
< 0, 0))
2054 cur_state
= local_sctx
.sifted_states
[str_idx
];
2055 err
= sift_states_backward (mctx
, &local_sctx
);
2056 if (BE (err
!= REG_NOERROR
, 0))
2058 if (sctx
->limited_states
!= NULL
)
2060 err
= merge_state_array (dfa
, sctx
->limited_states
,
2061 local_sctx
.sifted_states
,
2063 if (BE (err
!= REG_NOERROR
, 0))
2066 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2067 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2068 /* We must not use the variable entry here, since
2069 mctx->bkref_ents might be realloced. */
2070 mctx
->bkref_ents
[enabled_idx
].flag
= 1;
2073 enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2074 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2076 struct re_backref_cache_entry
*entry
;
2077 entry
= mctx
->bkref_ents
+ enabled_idx
;
2078 if (entry
->str_idx
> str_idx
)
2080 if (entry
->node
== node
)
2087 if (local_sctx
.sifted_states
!= NULL
)
2089 re_node_set_free (&local_sctx
.limits
);
2096 #ifdef RE_ENABLE_I18N
2098 sift_states_iter_mb (mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2099 const re_match_context_t
*mctx
;
2100 re_sift_context_t
*sctx
;
2101 int node_idx
, str_idx
, max_str_idx
;
2103 re_dfa_t
*const dfa
= mctx
->dfa
;
2105 /* Check the node can accept `multi byte'. */
2106 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2107 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2108 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2109 dfa
->nexts
[node_idx
]))
2110 /* The node can't accept the `multi byte', or the
2111 destination was already thrown away, then the node
2112 could't accept the current input `multi byte'. */
2114 /* Otherwise, it is sure that the node could accept
2115 `naccepted' bytes input. */
2118 #endif /* RE_ENABLE_I18N */
2121 /* Functions for state transition. */
2123 /* Return the next state to which the current state STATE will transit by
2124 accepting the current input byte, and update STATE_LOG if necessary.
2125 If STATE can accept a multibyte char/collating element/back reference
2126 update the destination of STATE_LOG. */
2128 static re_dfastate_t
*
2129 transit_state (err
, mctx
, state
)
2131 re_match_context_t
*mctx
;
2132 re_dfastate_t
*state
;
2134 re_dfa_t
*const dfa
= mctx
->dfa
;
2135 re_dfastate_t
**trtable
, *next_state
;
2139 if (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.bufs_len
2140 || (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.valid_len
2141 && mctx
->input
.valid_len
< mctx
->input
.len
))
2143 *err
= extend_buffers (mctx
);
2144 if (BE (*err
!= REG_NOERROR
, 0))
2152 re_string_skip_bytes (&mctx
->input
, 1);
2156 #ifdef RE_ENABLE_I18N
2157 /* If the current state can accept multibyte. */
2158 if (state
->accept_mb
)
2160 *err
= transit_state_mb (mctx
, state
);
2161 if (BE (*err
!= REG_NOERROR
, 0))
2164 #endif /* RE_ENABLE_I18N */
2166 /* Then decide the next state with the single byte. */
2169 /* Use transition table */
2170 ch
= re_string_fetch_byte (&mctx
->input
);
2171 trtable
= state
->trtable
;
2172 if (trtable
== NULL
)
2174 trtable
= build_trtable (dfa
, state
);
2175 if (trtable
== NULL
)
2181 if (BE (state
->word_trtable
, 0))
2183 unsigned int context
;
2185 = re_string_context_at (&mctx
->input
,
2186 re_string_cur_idx (&mctx
->input
) - 1,
2188 if (IS_WORD_CONTEXT (context
))
2189 next_state
= trtable
[ch
+ SBC_MAX
];
2191 next_state
= trtable
[ch
];
2194 next_state
= trtable
[ch
];
2199 /* don't use transition table */
2200 next_state
= transit_state_sb (err
, mctx
, state
);
2201 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
2207 cur_idx
= re_string_cur_idx (&mctx
->input
);
2208 /* Update the state_log if we need. */
2209 if (mctx
->state_log
!= NULL
)
2211 if (cur_idx
> mctx
->state_log_top
)
2213 mctx
->state_log
[cur_idx
] = next_state
;
2214 mctx
->state_log_top
= cur_idx
;
2216 else if (mctx
->state_log
[cur_idx
] == 0)
2218 mctx
->state_log
[cur_idx
] = next_state
;
2222 re_dfastate_t
*pstate
;
2223 unsigned int context
;
2224 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2225 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2226 the destination of a multibyte char/collating element/
2227 back reference. Then the next state is the union set of
2228 these destinations and the results of the transition table. */
2229 pstate
= mctx
->state_log
[cur_idx
];
2230 log_nodes
= pstate
->entrance_nodes
;
2231 if (next_state
!= NULL
)
2233 table_nodes
= next_state
->entrance_nodes
;
2234 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2236 if (BE (*err
!= REG_NOERROR
, 0))
2240 next_nodes
= *log_nodes
;
2241 /* Note: We already add the nodes of the initial state,
2242 then we don't need to add them here. */
2244 context
= re_string_context_at (&mctx
->input
,
2245 re_string_cur_idx (&mctx
->input
) - 1,
2247 next_state
= mctx
->state_log
[cur_idx
]
2248 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2249 /* We don't need to check errors here, since the return value of
2250 this function is next_state and ERR is already set. */
2252 if (table_nodes
!= NULL
)
2253 re_node_set_free (&next_nodes
);
2257 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2259 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2260 later. We must check them here, since the back references in the
2261 next state might use them. */
2262 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2264 if (BE (*err
!= REG_NOERROR
, 0))
2267 /* If the next state has back references. */
2268 if (next_state
->has_backref
)
2270 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2271 if (BE (*err
!= REG_NOERROR
, 0))
2273 next_state
= mctx
->state_log
[cur_idx
];
2279 /* Helper functions for transit_state. */
2281 /* From the node set CUR_NODES, pick up the nodes whose types are
2282 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2283 expression. And register them to use them later for evaluating the
2284 correspoding back references. */
2286 static reg_errcode_t
2287 check_subexp_matching_top (mctx
, cur_nodes
, str_idx
)
2288 re_match_context_t
*mctx
;
2289 re_node_set
*cur_nodes
;
2292 re_dfa_t
*const dfa
= mctx
->dfa
;
2296 /* TODO: This isn't efficient.
2297 Because there might be more than one nodes whose types are
2298 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2301 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2303 int node
= cur_nodes
->elems
[node_idx
];
2304 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2305 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2306 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2308 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2309 if (BE (err
!= REG_NOERROR
, 0))
2317 /* Return the next state to which the current state STATE will transit by
2318 accepting the current input byte. */
2320 static re_dfastate_t
*
2321 transit_state_sb (err
, mctx
, state
)
2323 re_match_context_t
*mctx
;
2324 re_dfastate_t
*state
;
2326 re_dfa_t
*const dfa
= mctx
->dfa
;
2327 re_node_set next_nodes
;
2328 re_dfastate_t
*next_state
;
2329 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2330 unsigned int context
;
2332 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2333 if (BE (*err
!= REG_NOERROR
, 0))
2335 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2337 int cur_node
= state
->nodes
.elems
[node_cnt
];
2338 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2340 *err
= re_node_set_merge (&next_nodes
,
2341 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2342 if (BE (*err
!= REG_NOERROR
, 0))
2344 re_node_set_free (&next_nodes
);
2349 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2350 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2351 /* We don't need to check errors here, since the return value of
2352 this function is next_state and ERR is already set. */
2354 re_node_set_free (&next_nodes
);
2355 re_string_skip_bytes (&mctx
->input
, 1);
2360 #ifdef RE_ENABLE_I18N
2361 static reg_errcode_t
2362 transit_state_mb (mctx
, pstate
)
2363 re_match_context_t
*mctx
;
2364 re_dfastate_t
*pstate
;
2366 re_dfa_t
*const dfa
= mctx
->dfa
;
2370 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2372 re_node_set dest_nodes
, *new_nodes
;
2373 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2374 int naccepted
= 0, dest_idx
;
2375 unsigned int context
;
2376 re_dfastate_t
*dest_state
;
2378 if (dfa
->nodes
[cur_node_idx
].constraint
)
2380 context
= re_string_context_at (&mctx
->input
,
2381 re_string_cur_idx (&mctx
->input
),
2383 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2388 /* How many bytes the node can accept? */
2389 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2390 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2391 re_string_cur_idx (&mctx
->input
));
2395 /* The node can accepts `naccepted' bytes. */
2396 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2397 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2398 : mctx
->max_mb_elem_len
);
2399 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2400 if (BE (err
!= REG_NOERROR
, 0))
2403 assert (dfa
->nexts
[cur_node_idx
] != -1);
2405 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2406 then we use pstate->nodes.elems[i] instead. */
2407 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2409 dest_state
= mctx
->state_log
[dest_idx
];
2410 if (dest_state
== NULL
)
2411 dest_nodes
= *new_nodes
;
2414 err
= re_node_set_init_union (&dest_nodes
,
2415 dest_state
->entrance_nodes
, new_nodes
);
2416 if (BE (err
!= REG_NOERROR
, 0))
2419 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1, mctx
->eflags
);
2420 mctx
->state_log
[dest_idx
]
2421 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2422 if (dest_state
!= NULL
)
2423 re_node_set_free (&dest_nodes
);
2424 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2429 #endif /* RE_ENABLE_I18N */
2431 static reg_errcode_t
2432 transit_state_bkref (mctx
, nodes
)
2433 re_match_context_t
*mctx
;
2434 const re_node_set
*nodes
;
2436 re_dfa_t
*const dfa
= mctx
->dfa
;
2439 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2441 for (i
= 0; i
< nodes
->nelem
; ++i
)
2443 int dest_str_idx
, prev_nelem
, bkc_idx
;
2444 int node_idx
= nodes
->elems
[i
];
2445 unsigned int context
;
2446 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2447 re_node_set
*new_dest_nodes
;
2449 /* Check whether `node' is a backreference or not. */
2450 if (node
->type
!= OP_BACK_REF
)
2453 if (node
->constraint
)
2455 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2457 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2461 /* `node' is a backreference.
2462 Check the substring which the substring matched. */
2463 bkc_idx
= mctx
->nbkref_ents
;
2464 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2465 if (BE (err
!= REG_NOERROR
, 0))
2468 /* And add the epsilon closures (which is `new_dest_nodes') of
2469 the backreference to appropriate state_log. */
2471 assert (dfa
->nexts
[node_idx
] != -1);
2473 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2476 re_dfastate_t
*dest_state
;
2477 struct re_backref_cache_entry
*bkref_ent
;
2478 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2479 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2481 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2482 new_dest_nodes
= (subexp_len
== 0
2483 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2484 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2485 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2486 - bkref_ent
->subexp_from
);
2487 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2489 dest_state
= mctx
->state_log
[dest_str_idx
];
2490 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2491 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2492 /* Add `new_dest_node' to state_log. */
2493 if (dest_state
== NULL
)
2495 mctx
->state_log
[dest_str_idx
]
2496 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2498 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2499 && err
!= REG_NOERROR
, 0))
2504 re_node_set dest_nodes
;
2505 err
= re_node_set_init_union (&dest_nodes
,
2506 dest_state
->entrance_nodes
,
2508 if (BE (err
!= REG_NOERROR
, 0))
2510 re_node_set_free (&dest_nodes
);
2513 mctx
->state_log
[dest_str_idx
]
2514 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2515 re_node_set_free (&dest_nodes
);
2516 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2517 && err
!= REG_NOERROR
, 0))
2520 /* We need to check recursively if the backreference can epsilon
2523 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2525 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2527 if (BE (err
!= REG_NOERROR
, 0))
2529 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2530 if (BE (err
!= REG_NOERROR
, 0))
2540 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2541 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2542 Note that we might collect inappropriate candidates here.
2543 However, the cost of checking them strictly here is too high, then we
2544 delay these checking for prune_impossible_nodes(). */
2546 static reg_errcode_t
2547 get_subexp (mctx
, bkref_node
, bkref_str_idx
)
2548 re_match_context_t
*mctx
;
2549 int bkref_node
, bkref_str_idx
;
2551 re_dfa_t
*const dfa
= mctx
->dfa
;
2552 int subexp_num
, sub_top_idx
;
2553 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2554 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2555 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2556 for (; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
2558 const struct re_backref_cache_entry
*entry
2559 = &mctx
->bkref_ents
[cache_idx
];
2560 if (entry
->str_idx
> bkref_str_idx
)
2562 if (entry
->node
== bkref_node
)
2563 return REG_NOERROR
; /* We already checked it. */
2565 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2567 /* For each sub expression */
2568 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2571 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2572 re_sub_match_last_t
*sub_last
;
2573 int sub_last_idx
, sl_str
, bkref_str_off
;
2575 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2576 continue; /* It isn't related. */
2578 sl_str
= sub_top
->str_idx
;
2579 bkref_str_off
= bkref_str_idx
;
2580 /* At first, check the last node of sub expressions we already
2582 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2585 sub_last
= sub_top
->lasts
[sub_last_idx
];
2586 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2587 /* The matched string by the sub expression match with the substring
2588 at the back reference? */
2589 if (sl_str_diff
> 0)
2591 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2593 /* Not enough chars for a successful match. */
2594 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2597 err
= clean_state_log_if_needed (mctx
,
2600 if (BE (err
!= REG_NOERROR
, 0))
2602 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2604 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2605 break; /* We don't need to search this sub expression any more. */
2607 bkref_str_off
+= sl_str_diff
;
2608 sl_str
+= sl_str_diff
;
2609 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2612 /* Reload buf, since the preceding call might have reallocated
2614 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2616 if (err
== REG_NOMATCH
)
2618 if (BE (err
!= REG_NOERROR
, 0))
2622 if (sub_last_idx
< sub_top
->nlasts
)
2624 if (sub_last_idx
> 0)
2626 /* Then, search for the other last nodes of the sub expression. */
2627 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2629 int cls_node
, sl_str_off
;
2630 const re_node_set
*nodes
;
2631 sl_str_off
= sl_str
- sub_top
->str_idx
;
2632 /* The matched string by the sub expression match with the substring
2633 at the back reference? */
2636 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2638 /* If we are at the end of the input, we cannot match. */
2639 if (bkref_str_off
>= mctx
->input
.len
)
2642 err
= extend_buffers (mctx
);
2643 if (BE (err
!= REG_NOERROR
, 0))
2646 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2648 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2649 break; /* We don't need to search this sub expression
2652 if (mctx
->state_log
[sl_str
] == NULL
)
2654 /* Does this state have a ')' of the sub expression? */
2655 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2656 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2659 if (sub_top
->path
== NULL
)
2661 sub_top
->path
= calloc (sizeof (state_array_t
),
2662 sl_str
- sub_top
->str_idx
+ 1);
2663 if (sub_top
->path
== NULL
)
2666 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2667 in the current context? */
2668 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2669 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2670 if (err
== REG_NOMATCH
)
2672 if (BE (err
!= REG_NOERROR
, 0))
2674 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2675 if (BE (sub_last
== NULL
, 0))
2677 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2679 if (err
== REG_NOMATCH
)
2686 /* Helper functions for get_subexp(). */
2688 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2689 If it can arrive, register the sub expression expressed with SUB_TOP
2692 static reg_errcode_t
2693 get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2694 re_match_context_t
*mctx
;
2695 const re_sub_match_top_t
*sub_top
;
2696 re_sub_match_last_t
*sub_last
;
2697 int bkref_node
, bkref_str
;
2701 /* Can the subexpression arrive the back reference? */
2702 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2703 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2704 if (err
!= REG_NOERROR
)
2706 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2708 if (BE (err
!= REG_NOERROR
, 0))
2710 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2711 return clean_state_log_if_needed (mctx
, to_idx
);
2714 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2715 Search '(' if FL_OPEN, or search ')' otherwise.
2716 TODO: This function isn't efficient...
2717 Because there might be more than one nodes whose types are
2718 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2723 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2724 const re_dfa_t
*dfa
;
2725 const re_node_set
*nodes
;
2726 int subexp_idx
, type
;
2729 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2731 int cls_node
= nodes
->elems
[cls_idx
];
2732 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2733 if (node
->type
== type
2734 && node
->opr
.idx
== subexp_idx
)
2740 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2741 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2743 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2745 static reg_errcode_t
2746 check_arrival (mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2748 re_match_context_t
*mctx
;
2749 state_array_t
*path
;
2750 int top_node
, top_str
, last_node
, last_str
, type
;
2752 re_dfa_t
*const dfa
= mctx
->dfa
;
2754 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2755 re_dfastate_t
*cur_state
= NULL
;
2756 re_node_set
*cur_nodes
, next_nodes
;
2757 re_dfastate_t
**backup_state_log
;
2758 unsigned int context
;
2760 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2761 /* Extend the buffer if we need. */
2762 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2764 re_dfastate_t
**new_array
;
2765 int old_alloc
= path
->alloc
;
2766 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2767 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2768 if (new_array
== NULL
)
2770 path
->alloc
= old_alloc
;
2773 path
->array
= new_array
;
2774 memset (new_array
+ old_alloc
, '\0',
2775 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2778 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2780 /* Temporary modify MCTX. */
2781 backup_state_log
= mctx
->state_log
;
2782 backup_cur_idx
= mctx
->input
.cur_idx
;
2783 mctx
->state_log
= path
->array
;
2784 mctx
->input
.cur_idx
= str_idx
;
2786 /* Setup initial node set. */
2787 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2788 if (str_idx
== top_str
)
2790 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2791 if (BE (err
!= REG_NOERROR
, 0))
2793 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2794 if (BE (err
!= REG_NOERROR
, 0))
2796 re_node_set_free (&next_nodes
);
2802 cur_state
= mctx
->state_log
[str_idx
];
2803 if (cur_state
&& cur_state
->has_backref
)
2805 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2806 if (BE ( err
!= REG_NOERROR
, 0))
2810 re_node_set_init_empty (&next_nodes
);
2812 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2814 if (next_nodes
.nelem
)
2816 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
, last_str
,
2818 if (BE ( err
!= REG_NOERROR
, 0))
2820 re_node_set_free (&next_nodes
);
2824 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2825 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2827 re_node_set_free (&next_nodes
);
2830 mctx
->state_log
[str_idx
] = cur_state
;
2833 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2835 re_node_set_empty (&next_nodes
);
2836 if (mctx
->state_log
[str_idx
+ 1])
2838 err
= re_node_set_merge (&next_nodes
,
2839 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2840 if (BE (err
!= REG_NOERROR
, 0))
2842 re_node_set_free (&next_nodes
);
2848 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2849 &cur_state
->nodes
, &next_nodes
);
2850 if (BE (err
!= REG_NOERROR
, 0))
2852 re_node_set_free (&next_nodes
);
2857 if (next_nodes
.nelem
)
2859 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2860 if (BE (err
!= REG_NOERROR
, 0))
2862 re_node_set_free (&next_nodes
);
2865 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
, last_str
,
2867 if (BE ( err
!= REG_NOERROR
, 0))
2869 re_node_set_free (&next_nodes
);
2873 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2874 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2875 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2877 re_node_set_free (&next_nodes
);
2880 mctx
->state_log
[str_idx
] = cur_state
;
2881 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2883 re_node_set_free (&next_nodes
);
2884 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2885 : &mctx
->state_log
[last_str
]->nodes
);
2886 path
->next_idx
= str_idx
;
2889 mctx
->state_log
= backup_state_log
;
2890 mctx
->input
.cur_idx
= backup_cur_idx
;
2892 /* Then check the current node set has the node LAST_NODE. */
2893 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2899 /* Helper functions for check_arrival. */
2901 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2903 TODO: This function is similar to the functions transit_state*(),
2904 however this function has many additional works.
2905 Can't we unify them? */
2907 static reg_errcode_t
2908 check_arrival_add_next_nodes (mctx
, str_idx
, cur_nodes
, next_nodes
)
2909 re_match_context_t
*mctx
;
2911 re_node_set
*cur_nodes
, *next_nodes
;
2913 re_dfa_t
*const dfa
= mctx
->dfa
;
2916 re_node_set union_set
;
2917 re_node_set_init_empty (&union_set
);
2918 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2921 int cur_node
= cur_nodes
->elems
[cur_idx
];
2922 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
2923 if (IS_EPSILON_NODE (type
))
2925 #ifdef RE_ENABLE_I18N
2926 /* If the node may accept `multi byte'. */
2927 if (ACCEPT_MB_NODE (type
))
2929 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
2933 re_dfastate_t
*dest_state
;
2934 int next_node
= dfa
->nexts
[cur_node
];
2935 int next_idx
= str_idx
+ naccepted
;
2936 dest_state
= mctx
->state_log
[next_idx
];
2937 re_node_set_empty (&union_set
);
2940 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
2941 if (BE (err
!= REG_NOERROR
, 0))
2943 re_node_set_free (&union_set
);
2947 err
= re_node_set_insert (&union_set
, next_node
);
2948 if (BE (err
< 0, 0))
2950 re_node_set_free (&union_set
);
2953 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
2955 if (BE (mctx
->state_log
[next_idx
] == NULL
2956 && err
!= REG_NOERROR
, 0))
2958 re_node_set_free (&union_set
);
2963 #endif /* RE_ENABLE_I18N */
2965 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
2967 err
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
2968 if (BE (err
< 0, 0))
2970 re_node_set_free (&union_set
);
2975 re_node_set_free (&union_set
);
2979 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2980 CUR_NODES, however exclude the nodes which are:
2981 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2982 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2985 static reg_errcode_t
2986 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
2988 re_node_set
*cur_nodes
;
2989 int ex_subexp
, type
;
2992 int idx
, outside_node
;
2993 re_node_set new_nodes
;
2995 assert (cur_nodes
->nelem
);
2997 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
2998 if (BE (err
!= REG_NOERROR
, 0))
3000 /* Create a new node set NEW_NODES with the nodes which are epsilon
3001 closures of the node in CUR_NODES. */
3003 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3005 int cur_node
= cur_nodes
->elems
[idx
];
3006 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3007 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3008 if (outside_node
== -1)
3010 /* There are no problematic nodes, just merge them. */
3011 err
= re_node_set_merge (&new_nodes
, eclosure
);
3012 if (BE (err
!= REG_NOERROR
, 0))
3014 re_node_set_free (&new_nodes
);
3020 /* There are problematic nodes, re-calculate incrementally. */
3021 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3023 if (BE (err
!= REG_NOERROR
, 0))
3025 re_node_set_free (&new_nodes
);
3030 re_node_set_free (cur_nodes
);
3031 *cur_nodes
= new_nodes
;
3035 /* Helper function for check_arrival_expand_ecl.
3036 Check incrementally the epsilon closure of TARGET, and if it isn't
3037 problematic append it to DST_NODES. */
3039 static reg_errcode_t
3040 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
3042 int target
, ex_subexp
, type
;
3043 re_node_set
*dst_nodes
;
3046 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3050 if (dfa
->nodes
[cur_node
].type
== type
3051 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3053 if (type
== OP_CLOSE_SUBEXP
)
3055 err
= re_node_set_insert (dst_nodes
, cur_node
);
3056 if (BE (err
== -1, 0))
3061 err
= re_node_set_insert (dst_nodes
, cur_node
);
3062 if (BE (err
== -1, 0))
3064 if (dfa
->edests
[cur_node
].nelem
== 0)
3066 if (dfa
->edests
[cur_node
].nelem
== 2)
3068 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3069 dfa
->edests
[cur_node
].elems
[1],
3071 if (BE (err
!= REG_NOERROR
, 0))
3074 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3080 /* For all the back references in the current state, calculate the
3081 destination of the back references by the appropriate entry
3082 in MCTX->BKREF_ENTS. */
3084 static reg_errcode_t
3085 expand_bkref_cache (mctx
, cur_nodes
, cur_str
, last_str
, subexp_num
,
3087 re_match_context_t
*mctx
;
3088 int cur_str
, last_str
, subexp_num
, type
;
3089 re_node_set
*cur_nodes
;
3091 re_dfa_t
*const dfa
= mctx
->dfa
;
3093 int cache_idx
, cache_idx_start
;
3094 /* The current state. */
3096 cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3097 for (cache_idx
= cache_idx_start
; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
3099 int to_idx
, next_node
;
3100 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ cache_idx
;
3101 if (ent
->str_idx
> cur_str
)
3103 /* Is this entry ENT is appropriate? */
3104 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3107 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3108 /* Calculate the destination of the back reference, and append it
3109 to MCTX->STATE_LOG. */
3110 if (to_idx
== cur_str
)
3112 /* The backreference did epsilon transit, we must re-check all the
3113 node in the current state. */
3114 re_node_set new_dests
;
3115 reg_errcode_t err2
, err3
;
3116 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3117 if (re_node_set_contains (cur_nodes
, next_node
))
3119 err
= re_node_set_init_1 (&new_dests
, next_node
);
3120 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3121 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3122 re_node_set_free (&new_dests
);
3123 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3124 || err3
!= REG_NOERROR
, 0))
3126 err
= (err
!= REG_NOERROR
? err
3127 : (err2
!= REG_NOERROR
? err2
: err3
));
3130 /* TODO: It is still inefficient... */
3131 cache_idx
= cache_idx_start
- 1;
3136 re_node_set union_set
;
3137 next_node
= dfa
->nexts
[ent
->node
];
3138 if (mctx
->state_log
[to_idx
])
3141 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3144 err
= re_node_set_init_copy (&union_set
,
3145 &mctx
->state_log
[to_idx
]->nodes
);
3146 ret
= re_node_set_insert (&union_set
, next_node
);
3147 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3149 re_node_set_free (&union_set
);
3150 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3156 err
= re_node_set_init_1 (&union_set
, next_node
);
3157 if (BE (err
!= REG_NOERROR
, 0))
3160 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3161 re_node_set_free (&union_set
);
3162 if (BE (mctx
->state_log
[to_idx
] == NULL
3163 && err
!= REG_NOERROR
, 0))
3170 /* Build transition table for the state.
3171 Return the new table if succeeded, otherwise return NULL. */
3173 static re_dfastate_t
**
3174 build_trtable (dfa
, state
)
3176 re_dfastate_t
*state
;
3180 unsigned int elem
, mask
;
3181 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3182 int ndests
; /* Number of the destination states from `state'. */
3183 re_dfastate_t
**trtable
;
3184 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3185 re_node_set follows
, *dests_node
;
3189 /* We build DFA states which corresponds to the destination nodes
3190 from `state'. `dests_node[i]' represents the nodes which i-th
3191 destination state contains, and `dests_ch[i]' represents the
3192 characters which i-th destination state accepts. */
3194 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3195 dests_node
= (re_node_set
*)
3196 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3200 dests_node
= (re_node_set
*)
3201 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3202 if (BE (dests_node
== NULL
, 0))
3204 dests_node_malloced
= 1;
3206 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3208 /* Initialize transiton table. */
3209 state
->word_trtable
= 0;
3211 /* At first, group all nodes belonging to `state' into several
3213 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3214 if (BE (ndests
<= 0, 0))
3216 if (dests_node_malloced
)
3218 /* Return NULL in case of an error, trtable otherwise. */
3221 state
->trtable
= (re_dfastate_t
**)
3222 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);;
3223 return state
->trtable
;
3228 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3229 if (BE (err
!= REG_NOERROR
, 0))
3233 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3234 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3235 dest_states
= (re_dfastate_t
**)
3236 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3240 dest_states
= (re_dfastate_t
**)
3241 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3242 if (BE (dest_states
== NULL
, 0))
3245 if (dest_states_malloced
)
3247 re_node_set_free (&follows
);
3248 for (i
= 0; i
< ndests
; ++i
)
3249 re_node_set_free (dests_node
+ i
);
3250 if (dests_node_malloced
)
3254 dest_states_malloced
= 1;
3256 dest_states_word
= dest_states
+ ndests
;
3257 dest_states_nl
= dest_states_word
+ ndests
;
3258 bitset_empty (acceptable
);
3260 /* Then build the states for all destinations. */
3261 for (i
= 0; i
< ndests
; ++i
)
3264 re_node_set_empty (&follows
);
3265 /* Merge the follows of this destination states. */
3266 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3268 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3269 if (next_node
!= -1)
3271 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3272 if (BE (err
!= REG_NOERROR
, 0))
3276 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3277 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3279 /* If the new state has context constraint,
3280 build appropriate states for these contexts. */
3281 if (dest_states
[i
]->has_constraint
)
3283 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3285 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3288 if (dest_states
[i
] != dest_states_word
[i
]
3289 && dfa
->mb_cur_max
> 1)
3290 state
->word_trtable
= 1;
3292 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3294 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3299 dest_states_word
[i
] = dest_states
[i
];
3300 dest_states_nl
[i
] = dest_states
[i
];
3302 bitset_merge (acceptable
, dests_ch
[i
]);
3305 if (!BE (state
->word_trtable
, 0))
3307 /* We don't care about whether the following character is a word
3308 character, or we are in a single-byte character set so we can
3309 discern by looking at the character code: allocate a
3310 256-entry transition table. */
3311 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3312 if (BE (trtable
== NULL
, 0))
3315 /* For all characters ch...: */
3316 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3317 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3319 mask
<<= 1, elem
>>= 1, ++ch
)
3320 if (BE (elem
& 1, 0))
3322 /* There must be exactly one destination which accepts
3323 character ch. See group_nodes_into_DFAstates. */
3324 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3327 /* j-th destination accepts the word character ch. */
3328 if (dfa
->word_char
[i
] & mask
)
3329 trtable
[ch
] = dest_states_word
[j
];
3331 trtable
[ch
] = dest_states
[j
];
3336 /* We care about whether the following character is a word
3337 character, and we are in a multi-byte character set: discern
3338 by looking at the character code: build two 256-entry
3339 transition tables, one starting at trtable[0] and one
3340 starting at trtable[SBC_MAX]. */
3341 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*),
3343 if (BE (trtable
== NULL
, 0))
3346 /* For all characters ch...: */
3347 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3348 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3350 mask
<<= 1, elem
>>= 1, ++ch
)
3351 if (BE (elem
& 1, 0))
3353 /* There must be exactly one destination which accepts
3354 character ch. See group_nodes_into_DFAstates. */
3355 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3358 /* j-th destination accepts the word character ch. */
3359 trtable
[ch
] = dest_states
[j
];
3360 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3365 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3367 /* The current state accepts newline character. */
3368 for (j
= 0; j
< ndests
; ++j
)
3369 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3371 /* k-th destination accepts newline character. */
3372 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3373 if (state
->word_trtable
)
3374 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3375 /* There must be only one destination which accepts
3376 newline. See group_nodes_into_DFAstates. */
3381 if (dest_states_malloced
)
3384 re_node_set_free (&follows
);
3385 for (i
= 0; i
< ndests
; ++i
)
3386 re_node_set_free (dests_node
+ i
);
3388 if (dests_node_malloced
)
3391 state
->trtable
= trtable
;
3395 /* Group all nodes belonging to STATE into several destinations.
3396 Then for all destinations, set the nodes belonging to the destination
3397 to DESTS_NODE[i] and set the characters accepted by the destination
3398 to DEST_CH[i]. This function return the number of destinations. */
3401 group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
)
3403 const re_dfastate_t
*state
;
3404 re_node_set
*dests_node
;
3409 int ndests
; /* Number of the destinations from `state'. */
3410 bitset accepts
; /* Characters a node can accept. */
3411 const re_node_set
*cur_nodes
= &state
->nodes
;
3412 bitset_empty (accepts
);
3415 /* For all the nodes belonging to `state', */
3416 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3418 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3419 re_token_type_t type
= node
->type
;
3420 unsigned int constraint
= node
->constraint
;
3422 /* Enumerate all single byte character this node can accept. */
3423 if (type
== CHARACTER
)
3424 bitset_set (accepts
, node
->opr
.c
);
3425 else if (type
== SIMPLE_BRACKET
)
3427 bitset_merge (accepts
, node
->opr
.sbcset
);
3429 else if (type
== OP_PERIOD
)
3431 #ifdef RE_ENABLE_I18N
3432 if (dfa
->mb_cur_max
> 1)
3433 bitset_merge (accepts
, dfa
->sb_char
);
3436 bitset_set_all (accepts
);
3437 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3438 bitset_clear (accepts
, '\n');
3439 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3440 bitset_clear (accepts
, '\0');
3442 #ifdef RE_ENABLE_I18N
3443 else if (type
== OP_UTF8_PERIOD
)
3445 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3446 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3447 bitset_clear (accepts
, '\n');
3448 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3449 bitset_clear (accepts
, '\0');
3455 /* Check the `accepts' and sift the characters which are not
3456 match it the context. */
3459 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3461 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3462 bitset_empty (accepts
);
3463 if (accepts_newline
)
3464 bitset_set (accepts
, NEWLINE_CHAR
);
3468 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3470 bitset_empty (accepts
);
3474 if (constraint
& NEXT_WORD_CONSTRAINT
)
3476 unsigned int any_set
= 0;
3477 if (type
== CHARACTER
&& !node
->word_char
)
3479 bitset_empty (accepts
);
3482 #ifdef RE_ENABLE_I18N
3483 if (dfa
->mb_cur_max
> 1)
3484 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3485 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3488 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3489 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3493 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3495 unsigned int any_set
= 0;
3496 if (type
== CHARACTER
&& node
->word_char
)
3498 bitset_empty (accepts
);
3501 #ifdef RE_ENABLE_I18N
3502 if (dfa
->mb_cur_max
> 1)
3503 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3504 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3507 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3508 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3514 /* Then divide `accepts' into DFA states, or create a new
3515 state. Above, we make sure that accepts is not empty. */
3516 for (j
= 0; j
< ndests
; ++j
)
3518 bitset intersec
; /* Intersection sets, see below. */
3520 /* Flags, see below. */
3521 int has_intersec
, not_subset
, not_consumed
;
3523 /* Optimization, skip if this state doesn't accept the character. */
3524 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3527 /* Enumerate the intersection set of this state and `accepts'. */
3529 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3530 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3531 /* And skip if the intersection set is empty. */
3535 /* Then check if this state is a subset of `accepts'. */
3536 not_subset
= not_consumed
= 0;
3537 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3539 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3540 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3543 /* If this state isn't a subset of `accepts', create a
3544 new group state, which has the `remains'. */
3547 bitset_copy (dests_ch
[ndests
], remains
);
3548 bitset_copy (dests_ch
[j
], intersec
);
3549 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3550 if (BE (err
!= REG_NOERROR
, 0))
3555 /* Put the position in the current group. */
3556 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3557 if (BE (err
< 0, 0))
3560 /* If all characters are consumed, go to next node. */
3564 /* Some characters remain, create a new group. */
3567 bitset_copy (dests_ch
[ndests
], accepts
);
3568 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3569 if (BE (err
!= REG_NOERROR
, 0))
3572 bitset_empty (accepts
);
3577 for (j
= 0; j
< ndests
; ++j
)
3578 re_node_set_free (dests_node
+ j
);
3582 #ifdef RE_ENABLE_I18N
3583 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3584 Return the number of the bytes the node accepts.
3585 STR_IDX is the current index of the input string.
3587 This function handles the nodes which can accept one character, or
3588 one collating element like '.', '[a-z]', opposite to the other nodes
3589 can only accept one byte. */
3592 check_node_accept_bytes (dfa
, node_idx
, input
, str_idx
)
3594 int node_idx
, str_idx
;
3595 const re_string_t
*input
;
3597 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3598 int char_len
, elem_len
;
3601 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3603 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3604 if (BE (c
< 0xc2, 1))
3607 if (str_idx
+ 2 > input
->len
)
3610 d
= re_string_byte_at (input
, str_idx
+ 1);
3612 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3616 if (c
== 0xe0 && d
< 0xa0)
3622 if (c
== 0xf0 && d
< 0x90)
3628 if (c
== 0xf8 && d
< 0x88)
3634 if (c
== 0xfc && d
< 0x84)
3640 if (str_idx
+ char_len
> input
->len
)
3643 for (i
= 1; i
< char_len
; ++i
)
3645 d
= re_string_byte_at (input
, str_idx
+ i
);
3646 if (d
< 0x80 || d
> 0xbf)
3652 char_len
= re_string_char_size_at (input
, str_idx
);
3653 if (node
->type
== OP_PERIOD
)
3657 /* FIXME: I don't think this if is needed, as both '\n'
3658 and '\0' are char_len == 1. */
3659 /* '.' accepts any one character except the following two cases. */
3660 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3661 re_string_byte_at (input
, str_idx
) == '\n') ||
3662 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3663 re_string_byte_at (input
, str_idx
) == '\0'))
3668 elem_len
= re_string_elem_size_at (input
, str_idx
);
3669 if (elem_len
<= 1 && char_len
<= 1)
3672 if (node
->type
== COMPLEX_BRACKET
)
3674 const re_charset_t
*cset
= node
->opr
.mbcset
;
3676 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3682 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3683 ? re_string_wchar_at (input
, str_idx
) : 0);
3685 /* match with multibyte character? */
3686 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3687 if (wc
== cset
->mbchars
[i
])
3689 match_len
= char_len
;
3690 goto check_node_accept_bytes_match
;
3692 /* match with character_class? */
3693 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3695 wctype_t wt
= cset
->char_classes
[i
];
3696 if (__iswctype (wc
, wt
))
3698 match_len
= char_len
;
3699 goto check_node_accept_bytes_match
;
3704 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3707 unsigned int in_collseq
= 0;
3708 const int32_t *table
, *indirect
;
3709 const unsigned char *weights
, *extra
;
3710 const char *collseqwc
;
3712 /* This #include defines a local function! */
3713 # include <locale/weight.h>
3715 /* match with collating_symbol? */
3716 if (cset
->ncoll_syms
)
3717 extra
= (const unsigned char *)
3718 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3719 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3721 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3722 /* Compare the length of input collating element and
3723 the length of current collating element. */
3724 if (*coll_sym
!= elem_len
)
3726 /* Compare each bytes. */
3727 for (j
= 0; j
< *coll_sym
; j
++)
3728 if (pin
[j
] != coll_sym
[1 + j
])
3732 /* Match if every bytes is equal. */
3734 goto check_node_accept_bytes_match
;
3740 if (elem_len
<= char_len
)
3742 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3743 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3746 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3748 /* match with range expression? */
3749 for (i
= 0; i
< cset
->nranges
; ++i
)
3750 if (cset
->range_starts
[i
] <= in_collseq
3751 && in_collseq
<= cset
->range_ends
[i
])
3753 match_len
= elem_len
;
3754 goto check_node_accept_bytes_match
;
3757 /* match with equivalence_class? */
3758 if (cset
->nequiv_classes
)
3760 const unsigned char *cp
= pin
;
3761 table
= (const int32_t *)
3762 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3763 weights
= (const unsigned char *)
3764 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3765 extra
= (const unsigned char *)
3766 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3767 indirect
= (const int32_t *)
3768 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3769 idx
= findidx (&cp
);
3771 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3773 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3774 size_t weight_len
= weights
[idx
];
3775 if (weight_len
== weights
[equiv_class_idx
])
3778 while (cnt
<= weight_len
3779 && (weights
[equiv_class_idx
+ 1 + cnt
]
3780 == weights
[idx
+ 1 + cnt
]))
3782 if (cnt
> weight_len
)
3784 match_len
= elem_len
;
3785 goto check_node_accept_bytes_match
;
3794 /* match with range expression? */
3796 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3798 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3801 for (i
= 0; i
< cset
->nranges
; ++i
)
3803 cmp_buf
[0] = cset
->range_starts
[i
];
3804 cmp_buf
[4] = cset
->range_ends
[i
];
3805 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3806 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3808 match_len
= char_len
;
3809 goto check_node_accept_bytes_match
;
3813 check_node_accept_bytes_match
:
3814 if (!cset
->non_match
)
3821 return (elem_len
> char_len
) ? elem_len
: char_len
;
3829 find_collation_sequence_value (mbs
, mbs_len
)
3830 const unsigned char *mbs
;
3833 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3838 /* No valid character. Match it as a single byte character. */
3839 const unsigned char *collseq
= (const unsigned char *)
3840 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3841 return collseq
[mbs
[0]];
3848 const unsigned char *extra
= (const unsigned char *)
3849 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3853 int mbs_cnt
, found
= 0;
3854 int32_t elem_mbs_len
;
3855 /* Skip the name of collating element name. */
3856 idx
= idx
+ extra
[idx
] + 1;
3857 elem_mbs_len
= extra
[idx
++];
3858 if (mbs_len
== elem_mbs_len
)
3860 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3861 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3863 if (mbs_cnt
== elem_mbs_len
)
3864 /* Found the entry. */
3867 /* Skip the byte sequence of the collating element. */
3868 idx
+= elem_mbs_len
;
3869 /* Adjust for the alignment. */
3870 idx
= (idx
+ 3) & ~3;
3871 /* Skip the collation sequence value. */
3872 idx
+= sizeof (uint32_t);
3873 /* Skip the wide char sequence of the collating element. */
3874 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3875 /* If we found the entry, return the sequence value. */
3877 return *(uint32_t *) (extra
+ idx
);
3878 /* Skip the collation sequence value. */
3879 idx
+= sizeof (uint32_t);
3884 #endif /* RE_ENABLE_I18N */
3886 /* Check whether the node accepts the byte which is IDX-th
3887 byte of the INPUT. */
3890 check_node_accept (mctx
, node
, idx
)
3891 const re_match_context_t
*mctx
;
3892 const re_token_t
*node
;
3895 re_dfa_t
*const dfa
= mctx
->dfa
;
3897 if (node
->constraint
)
3899 /* The node has constraints. Check whether the current context
3900 satisfies the constraints. */
3901 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
3903 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3906 ch
= re_string_byte_at (&mctx
->input
, idx
);
3910 return node
->opr
.c
== ch
;
3911 case SIMPLE_BRACKET
:
3912 return bitset_contain (node
->opr
.sbcset
, ch
);
3913 #ifdef RE_ENABLE_I18N
3914 case OP_UTF8_PERIOD
:
3920 return !((ch
== '\n' && !(dfa
->syntax
& RE_DOT_NEWLINE
))
3921 || (ch
== '\0' && (dfa
->syntax
& RE_DOT_NOT_NULL
)));
3927 /* Extend the buffers, if the buffers have run out. */
3929 static reg_errcode_t
3930 extend_buffers (mctx
)
3931 re_match_context_t
*mctx
;
3934 re_string_t
*pstr
= &mctx
->input
;
3936 /* Double the lengthes of the buffers. */
3937 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3938 if (BE (ret
!= REG_NOERROR
, 0))
3941 if (mctx
->state_log
!= NULL
)
3943 /* And double the length of state_log. */
3944 /* XXX We have no indication of the size of this buffer. If this
3945 allocation fail we have no indication that the state_log array
3946 does not have the right size. */
3947 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3948 pstr
->bufs_len
+ 1);
3949 if (BE (new_array
== NULL
, 0))
3951 mctx
->state_log
= new_array
;
3954 /* Then reconstruct the buffers. */
3957 #ifdef RE_ENABLE_I18N
3958 if (pstr
->mb_cur_max
> 1)
3960 ret
= build_wcs_upper_buffer (pstr
);
3961 if (BE (ret
!= REG_NOERROR
, 0))
3965 #endif /* RE_ENABLE_I18N */
3966 build_upper_buffer (pstr
);
3970 #ifdef RE_ENABLE_I18N
3971 if (pstr
->mb_cur_max
> 1)
3972 build_wcs_buffer (pstr
);
3974 #endif /* RE_ENABLE_I18N */
3976 if (pstr
->trans
!= NULL
)
3977 re_string_translate_buffer (pstr
);
3984 /* Functions for matching context. */
3986 /* Initialize MCTX. */
3988 static reg_errcode_t
3989 match_ctx_init (mctx
, eflags
, n
)
3990 re_match_context_t
*mctx
;
3993 mctx
->eflags
= eflags
;
3994 mctx
->match_last
= -1;
3997 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
3998 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
3999 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4002 /* Already zero-ed by the caller.
4004 mctx->bkref_ents = NULL;
4005 mctx->nbkref_ents = 0;
4006 mctx->nsub_tops = 0; */
4007 mctx
->abkref_ents
= n
;
4008 mctx
->max_mb_elem_len
= 1;
4009 mctx
->asub_tops
= n
;
4013 /* Clean the entries which depend on the current input in MCTX.
4014 This function must be invoked when the matcher changes the start index
4015 of the input, or changes the input string. */
4018 match_ctx_clean (mctx
)
4019 re_match_context_t
*mctx
;
4021 match_ctx_free_subtops (mctx
);
4022 mctx
->nsub_tops
= 0;
4023 mctx
->nbkref_ents
= 0;
4026 /* Free all the memory associated with MCTX. */
4029 match_ctx_free (mctx
)
4030 re_match_context_t
*mctx
;
4032 match_ctx_free_subtops (mctx
);
4033 re_free (mctx
->sub_tops
);
4034 re_free (mctx
->bkref_ents
);
4037 /* Free all the memory associated with MCTX->SUB_TOPS. */
4040 match_ctx_free_subtops (mctx
)
4041 re_match_context_t
*mctx
;
4044 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4047 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4048 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4050 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4051 re_free (last
->path
.array
);
4054 re_free (top
->lasts
);
4057 re_free (top
->path
->array
);
4058 re_free (top
->path
);
4064 /* Add a new backreference entry to MCTX.
4065 Note that we assume that caller never call this function with duplicate
4066 entry, and call with STR_IDX which isn't smaller than any existing entry.
4069 static reg_errcode_t
4070 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4071 re_match_context_t
*mctx
;
4072 int node
, str_idx
, from
, to
;
4074 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4076 struct re_backref_cache_entry
* new_entry
;
4077 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4078 mctx
->abkref_ents
* 2);
4079 if (BE (new_entry
== NULL
, 0))
4081 re_free (mctx
->bkref_ents
);
4084 mctx
->bkref_ents
= new_entry
;
4085 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4086 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4087 mctx
->abkref_ents
*= 2;
4089 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4090 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4091 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4092 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4093 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
4094 if (mctx
->max_mb_elem_len
< to
- from
)
4095 mctx
->max_mb_elem_len
= to
- from
;
4099 /* Search for the first entry which has the same str_idx.
4100 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4103 search_cur_bkref_entry (mctx
, str_idx
)
4104 re_match_context_t
*mctx
;
4107 int left
, right
, mid
;
4108 right
= mctx
->nbkref_ents
;
4109 for (left
= 0; left
< right
;)
4111 mid
= (left
+ right
) / 2;
4112 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4121 match_ctx_clear_flag (mctx
)
4122 re_match_context_t
*mctx
;
4125 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
4126 mctx
->bkref_ents
[i
].flag
= 0;
4129 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4132 static reg_errcode_t
4133 match_ctx_add_subtop (mctx
, node
, str_idx
)
4134 re_match_context_t
*mctx
;
4138 assert (mctx
->sub_tops
!= NULL
);
4139 assert (mctx
->asub_tops
> 0);
4141 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4143 int new_asub_tops
= mctx
->asub_tops
* 2;
4144 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4145 re_sub_match_top_t
*,
4147 if (BE (new_array
== NULL
, 0))
4149 mctx
->sub_tops
= new_array
;
4150 mctx
->asub_tops
= new_asub_tops
;
4152 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4153 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4155 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4156 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4160 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4161 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4163 static re_sub_match_last_t
*
4164 match_ctx_add_sublast (subtop
, node
, str_idx
)
4165 re_sub_match_top_t
*subtop
;
4168 re_sub_match_last_t
*new_entry
;
4169 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4171 int new_alasts
= 2 * subtop
->alasts
+ 1;
4172 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4173 re_sub_match_last_t
*,
4175 if (BE (new_array
== NULL
, 0))
4177 subtop
->lasts
= new_array
;
4178 subtop
->alasts
= new_alasts
;
4180 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4181 if (BE (new_entry
!= NULL
, 1))
4183 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4184 new_entry
->node
= node
;
4185 new_entry
->str_idx
= str_idx
;
4192 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
4194 re_sift_context_t
*sctx
;
4195 re_dfastate_t
**sifted_sts
, **limited_sts
;
4196 int last_node
, last_str_idx
, check_subexp
;
4198 sctx
->sifted_states
= sifted_sts
;
4199 sctx
->limited_states
= limited_sts
;
4200 sctx
->last_node
= last_node
;
4201 sctx
->last_str_idx
= last_str_idx
;
4202 sctx
->check_subexp
= check_subexp
;
4203 sctx
->cur_bkref
= -1;
4204 sctx
->cls_subexp_idx
= -1;
4205 re_node_set_init_empty (&sctx
->limits
);