1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
57 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
58 int *p_match_first
) internal_function
;
59 static int check_halt_state_context (const re_match_context_t
*mctx
,
60 const re_dfastate_t
*state
, int idx
)
62 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
63 regmatch_t
*prev_idx_match
, int cur_node
,
64 int cur_idx
, int nmatch
) internal_function
;
65 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
66 int str_idx
, int dest_node
, int nregs
,
68 re_node_set
*eps_via_nodes
)
70 static reg_errcode_t
set_regs (const regex_t
*preg
,
71 const re_match_context_t
*mctx
,
72 size_t nmatch
, regmatch_t
*pmatch
,
73 int fl_backtrack
) internal_function
;
74 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
78 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
79 re_sift_context_t
*sctx
,
80 int node_idx
, int str_idx
, int max_str_idx
)
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
84 re_sift_context_t
*sctx
)
86 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
87 re_sift_context_t
*sctx
, int str_idx
,
88 re_node_set
*cur_dest
)
90 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
,
93 re_node_set
*dest_nodes
)
95 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
96 re_node_set
*dest_nodes
,
97 const re_node_set
*candidates
)
99 static int check_dst_limits (const re_match_context_t
*mctx
,
101 int dst_node
, int dst_idx
, int src_node
,
102 int src_idx
) internal_function
;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
104 int boundaries
, int subexp_idx
,
105 int from_node
, int bkref_idx
)
107 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
108 int limit
, int subexp_idx
,
109 int node
, int str_idx
,
110 int bkref_idx
) internal_function
;
111 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
112 re_node_set
*dest_nodes
,
113 const re_node_set
*candidates
,
115 struct re_backref_cache_entry
*bkref_ents
,
116 int str_idx
) internal_function
;
117 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
118 re_sift_context_t
*sctx
,
119 int str_idx
, const re_node_set
*candidates
)
121 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
123 re_dfastate_t
**src
, int num
)
125 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
126 re_match_context_t
*mctx
) internal_function
;
127 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
128 re_match_context_t
*mctx
,
129 re_dfastate_t
*state
) internal_function
;
130 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
,
132 re_dfastate_t
*next_state
)
134 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
135 re_node_set
*cur_nodes
,
136 int str_idx
) internal_function
;
138 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
139 re_match_context_t
*mctx
,
140 re_dfastate_t
*pstate
)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
145 re_dfastate_t
*pstate
)
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
149 const re_node_set
*nodes
)
151 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
152 int bkref_node
, int bkref_str_idx
)
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
)
159 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
160 int subexp_idx
, int type
) internal_function
;
161 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
162 state_array_t
*path
, int top_node
,
163 int top_str
, int last_node
, int last_str
,
164 int type
) internal_function
;
165 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
167 re_node_set
*cur_nodes
,
168 re_node_set
*next_nodes
)
170 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
171 re_node_set
*cur_nodes
,
172 int ex_subexp
, int type
)
174 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
175 re_node_set
*dst_nodes
,
176 int target
, int ex_subexp
,
177 int type
) internal_function
;
178 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
179 re_node_set
*cur_nodes
, int cur_str
,
180 int subexp_num
, int type
)
182 static int build_trtable (const re_dfa_t
*dfa
,
183 re_dfastate_t
*state
) internal_function
;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
186 const re_string_t
*input
, int idx
)
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
195 const re_dfastate_t
*state
,
196 re_node_set
*states_node
,
197 bitset_t
*states_ch
) internal_function
;
198 static int check_node_accept (const re_match_context_t
*mctx
,
199 const re_token_t
*node
, int idx
)
201 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
222 const regex_t
*__restrict preg
;
223 const char *__restrict string
;
230 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
232 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
235 if (eflags
& REG_STARTEND
)
237 start
= pmatch
[0].rm_so
;
238 length
= pmatch
[0].rm_eo
;
243 length
= strlen (string
);
246 __libc_lock_lock (dfa
->lock
);
248 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
249 length
, 0, NULL
, eflags
);
251 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
252 length
, nmatch
, pmatch
, eflags
);
253 __libc_lock_unlock (dfa
->lock
);
254 return err
!= REG_NOERROR
;
258 # include <shlib-compat.h>
259 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec
) __compat_regexec
;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t
*__restrict preg
,
267 const char *__restrict string
, size_t nmatch
,
268 regmatch_t pmatch
[], int eflags
)
270 return regexec (preg
, string
, nmatch
, pmatch
,
271 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
273 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp
, string
, length
, start
, regs
)
308 struct re_pattern_buffer
*bufp
;
311 struct re_registers
*regs
;
313 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
316 weak_alias (__re_match
, re_match
)
320 re_search (bufp
, string
, length
, start
, range
, regs
)
321 struct re_pattern_buffer
*bufp
;
323 int length
, start
, range
;
324 struct re_registers
*regs
;
326 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
329 weak_alias (__re_search
, re_search
)
333 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
334 struct re_pattern_buffer
*bufp
;
335 const char *string1
, *string2
;
336 int length1
, length2
, start
, stop
;
337 struct re_registers
*regs
;
339 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
340 start
, 0, regs
, stop
, 1);
343 weak_alias (__re_match_2
, re_match_2
)
347 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
348 struct re_pattern_buffer
*bufp
;
349 const char *string1
, *string2
;
350 int length1
, length2
, start
, range
, stop
;
351 struct re_registers
*regs
;
353 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
354 start
, range
, regs
, stop
, 0);
357 weak_alias (__re_search_2
, re_search_2
)
361 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
363 struct re_pattern_buffer
*bufp
;
364 const char *string1
, *string2
;
365 int length1
, length2
, start
, range
, stop
, ret_len
;
366 struct re_registers
*regs
;
370 int len
= length1
+ length2
;
373 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
376 /* Concatenate the strings. */
380 char *s
= re_malloc (char, len
);
382 if (BE (s
== NULL
, 0))
385 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
387 memcpy (s
, string1
, length1
);
388 memcpy (s
+ length1
, string2
, length2
);
398 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
401 re_free ((char *) str
);
405 /* The parameters have the same meaning as those of re_search.
406 Additional parameters:
407 If RET_LEN is nonzero the length of the match is returned (re_match style);
408 otherwise the position of the match is returned. */
411 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
412 struct re_pattern_buffer
*bufp
;
414 int length
, start
, range
, stop
, ret_len
;
415 struct re_registers
*regs
;
417 reg_errcode_t result
;
421 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
423 /* Check for out-of-range. */
424 if (BE (start
< 0 || start
> length
, 0))
426 if (BE (start
+ range
> length
, 0))
427 range
= length
- start
;
428 else if (BE (start
+ range
< 0, 0))
431 __libc_lock_lock (dfa
->lock
);
433 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
434 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
436 /* Compile fastmap if we haven't yet. */
437 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
438 re_compile_fastmap (bufp
);
440 if (BE (bufp
->no_sub
, 0))
443 /* We need at least 1 register. */
446 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
447 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
449 nregs
= regs
->num_regs
;
450 if (BE (nregs
< 1, 0))
452 /* Nothing can be copied to regs. */
458 nregs
= bufp
->re_nsub
+ 1;
459 pmatch
= re_malloc (regmatch_t
, nregs
);
460 if (BE (pmatch
== NULL
, 0))
466 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
467 nregs
, pmatch
, eflags
);
471 /* I hope we needn't fill ther regs with -1's when no match was found. */
472 if (result
!= REG_NOERROR
)
474 else if (regs
!= NULL
)
476 /* If caller wants register contents data back, copy them. */
477 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
478 bufp
->regs_allocated
);
479 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
483 if (BE (rval
== 0, 1))
487 assert (pmatch
[0].rm_so
== start
);
488 rval
= pmatch
[0].rm_eo
- start
;
491 rval
= pmatch
[0].rm_so
;
495 __libc_lock_unlock (dfa
->lock
);
500 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
501 struct re_registers
*regs
;
503 int nregs
, regs_allocated
;
505 int rval
= REGS_REALLOCATE
;
507 int need_regs
= nregs
+ 1;
508 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
511 /* Have the register data arrays been allocated? */
512 if (regs_allocated
== REGS_UNALLOCATED
)
513 { /* No. So allocate them with malloc. */
514 regs
->start
= re_malloc (regoff_t
, need_regs
);
515 regs
->end
= re_malloc (regoff_t
, need_regs
);
516 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
517 return REGS_UNALLOCATED
;
518 regs
->num_regs
= need_regs
;
520 else if (regs_allocated
== REGS_REALLOCATE
)
521 { /* Yes. If we need more elements than were already
522 allocated, reallocate them. If we need fewer, just
524 if (BE (need_regs
> regs
->num_regs
, 0))
526 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
527 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
528 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
529 return REGS_UNALLOCATED
;
530 regs
->start
= new_start
;
532 regs
->num_regs
= need_regs
;
537 assert (regs_allocated
== REGS_FIXED
);
538 /* This function may not be called with REGS_FIXED and nregs too big. */
539 assert (regs
->num_regs
>= nregs
);
544 for (i
= 0; i
< nregs
; ++i
)
546 regs
->start
[i
] = pmatch
[i
].rm_so
;
547 regs
->end
[i
] = pmatch
[i
].rm_eo
;
549 for ( ; i
< regs
->num_regs
; ++i
)
550 regs
->start
[i
] = regs
->end
[i
] = -1;
555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
556 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
557 this memory for recording register information. STARTS and ENDS
558 must be allocated using the malloc library routine, and must each
559 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561 If NUM_REGS == 0, then subsequent matches should allocate their own
564 Unless this function is called, the first search or match using
565 PATTERN_BUFFER will allocate its own register data, without
566 freeing the old data. */
569 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
570 struct re_pattern_buffer
*bufp
;
571 struct re_registers
*regs
;
573 regoff_t
*starts
, *ends
;
577 bufp
->regs_allocated
= REGS_REALLOCATE
;
578 regs
->num_regs
= num_regs
;
579 regs
->start
= starts
;
584 bufp
->regs_allocated
= REGS_UNALLOCATED
;
586 regs
->start
= regs
->end
= (regoff_t
*) 0;
590 weak_alias (__re_set_registers
, re_set_registers
)
593 /* Entry points compatible with 4.2 BSD regex library. We don't define
594 them unless specifically requested. */
596 #if defined _REGEX_RE_COMP || defined _LIBC
604 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
606 #endif /* _REGEX_RE_COMP */
608 /* Internal entry point. */
610 /* Searches for a compiled pattern PREG in the string STRING, whose
611 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
612 mingings with regexec. START, and RANGE have the same meanings
614 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
615 otherwise return the error code.
616 Note: We assume front end functions already check ranges.
617 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
620 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
624 int length
, start
, range
, stop
, eflags
;
629 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
630 int left_lim
, right_lim
, incr
;
631 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
635 re_match_context_t mctx
= { .dfa
= dfa
};
637 re_match_context_t mctx
;
639 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
640 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
641 RE_TRANSLATE_TYPE t
= preg
->translate
;
643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
644 memset (&mctx
, '\0', sizeof (re_match_context_t
));
648 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
649 nmatch
-= extra_nmatch
;
651 /* Check if the DFA haven't been compiled. */
652 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
653 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
654 || dfa
->init_state_begbuf
== NULL
, 0))
658 /* We assume front-end functions already check them. */
659 assert (start
+ range
>= 0 && start
+ range
<= length
);
662 /* If initial states with non-begbuf contexts have no elements,
663 the regex must be anchored. If preg->newline_anchor is set,
664 we'll never use init_state_nl, so do not check it. */
665 if (dfa
->init_state
->nodes
.nelem
== 0
666 && dfa
->init_state_word
->nodes
.nelem
== 0
667 && (dfa
->init_state_nl
->nodes
.nelem
== 0
668 || !preg
->newline_anchor
))
670 if (start
!= 0 && start
+ range
!= 0)
675 /* We must check the longest matching, if nmatch > 0. */
676 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
678 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
679 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
680 if (BE (err
!= REG_NOERROR
, 0))
682 mctx
.input
.stop
= stop
;
683 mctx
.input
.raw_stop
= stop
;
684 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
686 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
687 if (BE (err
!= REG_NOERROR
, 0))
690 /* We will log all the DFA states through which the dfa pass,
691 if nmatch > 1, or this dfa has "multibyte node", which is a
692 back-reference or a node which can accept multibyte character or
693 multi character collating element. */
694 if (nmatch
> 1 || dfa
->has_mb_node
)
696 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
697 if (BE (mctx
.state_log
== NULL
, 0))
704 mctx
.state_log
= NULL
;
707 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
708 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
710 /* Check incrementally whether of not the input string match. */
711 incr
= (range
< 0) ? -1 : 1;
712 left_lim
= (range
< 0) ? start
+ range
: start
;
713 right_lim
= (range
< 0) ? start
: start
+ range
;
714 sb
= dfa
->mb_cur_max
== 1;
717 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
718 | (range
>= 0 ? 2 : 0)
719 | (t
!= NULL
? 1 : 0))
722 for (;; match_first
+= incr
)
725 if (match_first
< left_lim
|| right_lim
< match_first
)
728 /* Advance as rapidly as possible through the string, until we
729 find a plausible place to start matching. This may be done
730 with varying efficiency, so there are various possibilities:
731 only the most common of them are specialized, in order to
732 save on code size. We use a switch statement for speed. */
740 /* Fastmap with single-byte translation, match forward. */
741 while (BE (match_first
< right_lim
, 1)
742 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
744 goto forward_match_found_start_or_reached_end
;
747 /* Fastmap without translation, match forward. */
748 while (BE (match_first
< right_lim
, 1)
749 && !fastmap
[(unsigned char) string
[match_first
]])
752 forward_match_found_start_or_reached_end
:
753 if (BE (match_first
== right_lim
, 0))
755 ch
= match_first
>= length
756 ? 0 : (unsigned char) string
[match_first
];
757 if (!fastmap
[t
? t
[ch
] : ch
])
764 /* Fastmap without multi-byte translation, match backwards. */
765 while (match_first
>= left_lim
)
767 ch
= match_first
>= length
768 ? 0 : (unsigned char) string
[match_first
];
769 if (fastmap
[t
? t
[ch
] : ch
])
773 if (match_first
< left_lim
)
778 /* In this case, we can't determine easily the current byte,
779 since it might be a component byte of a multibyte
780 character. Then we use the constructed buffer instead. */
783 /* If MATCH_FIRST is out of the valid range, reconstruct the
785 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
786 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
788 err
= re_string_reconstruct (&mctx
.input
, match_first
,
790 if (BE (err
!= REG_NOERROR
, 0))
793 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
795 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
796 Note that MATCH_FIRST must not be smaller than 0. */
797 ch
= (match_first
>= length
798 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
802 if (match_first
< left_lim
|| match_first
> right_lim
)
811 /* Reconstruct the buffers so that the matcher can assume that
812 the matching starts from the beginning of the buffer. */
813 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
814 if (BE (err
!= REG_NOERROR
, 0))
817 #ifdef RE_ENABLE_I18N
818 /* Don't consider this char as a possible match start if it part,
819 yet isn't the head, of a multibyte character. */
820 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
824 /* It seems to be appropriate one, then use the matcher. */
825 /* We assume that the matching starts from 0. */
826 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
827 match_last
= check_matching (&mctx
, fl_longest_match
,
828 range
>= 0 ? &match_first
: NULL
);
829 if (match_last
!= -1)
831 if (BE (match_last
== -2, 0))
838 mctx
.match_last
= match_last
;
839 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
841 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
842 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
845 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
848 err
= prune_impossible_nodes (&mctx
);
849 if (err
== REG_NOERROR
)
851 if (BE (err
!= REG_NOMATCH
, 0))
856 break; /* We found a match. */
860 match_ctx_clean (&mctx
);
864 assert (match_last
!= -1);
865 assert (err
== REG_NOERROR
);
868 /* Set pmatch[] if we need. */
873 /* Initialize registers. */
874 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
875 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
877 /* Set the points where matching start/end. */
879 pmatch
[0].rm_eo
= mctx
.match_last
;
881 if (!preg
->no_sub
&& nmatch
> 1)
883 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
884 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
885 if (BE (err
!= REG_NOERROR
, 0))
889 /* At last, add the offset to the each registers, since we slided
890 the buffers so that we could assume that the matching starts
892 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
893 if (pmatch
[reg_idx
].rm_so
!= -1)
895 #ifdef RE_ENABLE_I18N
896 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
898 pmatch
[reg_idx
].rm_so
=
899 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
900 ? mctx
.input
.valid_raw_len
901 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
902 pmatch
[reg_idx
].rm_eo
=
903 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
904 ? mctx
.input
.valid_raw_len
905 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
908 assert (mctx
.input
.offsets_needed
== 0);
910 pmatch
[reg_idx
].rm_so
+= match_first
;
911 pmatch
[reg_idx
].rm_eo
+= match_first
;
913 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
915 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
916 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
920 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
921 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
923 pmatch
[reg_idx
+ 1].rm_so
924 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
925 pmatch
[reg_idx
+ 1].rm_eo
926 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
931 re_free (mctx
.state_log
);
933 match_ctx_free (&mctx
);
934 re_string_destruct (&mctx
.input
);
939 prune_impossible_nodes (mctx
)
940 re_match_context_t
*mctx
;
942 const re_dfa_t
*const dfa
= mctx
->dfa
;
943 int halt_node
, match_last
;
945 re_dfastate_t
**sifted_states
;
946 re_dfastate_t
**lim_states
= NULL
;
947 re_sift_context_t sctx
;
949 assert (mctx
->state_log
!= NULL
);
951 match_last
= mctx
->match_last
;
952 halt_node
= mctx
->last_node
;
953 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
954 if (BE (sifted_states
== NULL
, 0))
961 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
962 if (BE (lim_states
== NULL
, 0))
969 memset (lim_states
, '\0',
970 sizeof (re_dfastate_t
*) * (match_last
+ 1));
971 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
973 ret
= sift_states_backward (mctx
, &sctx
);
974 re_node_set_free (&sctx
.limits
);
975 if (BE (ret
!= REG_NOERROR
, 0))
977 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
987 } while (mctx
->state_log
[match_last
] == NULL
988 || !mctx
->state_log
[match_last
]->halt
);
989 halt_node
= check_halt_state_context (mctx
,
990 mctx
->state_log
[match_last
],
993 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
995 re_free (lim_states
);
997 if (BE (ret
!= REG_NOERROR
, 0))
1002 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1003 ret
= sift_states_backward (mctx
, &sctx
);
1004 re_node_set_free (&sctx
.limits
);
1005 if (BE (ret
!= REG_NOERROR
, 0))
1008 re_free (mctx
->state_log
);
1009 mctx
->state_log
= sifted_states
;
1010 sifted_states
= NULL
;
1011 mctx
->last_node
= halt_node
;
1012 mctx
->match_last
= match_last
;
1015 re_free (sifted_states
);
1016 re_free (lim_states
);
1020 /* Acquire an initial state and return it.
1021 We must select appropriate initial state depending on the context,
1022 since initial states may have constraints like "\<", "^", etc.. */
1024 static inline re_dfastate_t
*
1025 __attribute ((always_inline
)) internal_function
1026 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1029 const re_dfa_t
*const dfa
= mctx
->dfa
;
1030 if (dfa
->init_state
->has_constraint
)
1032 unsigned int context
;
1033 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1034 if (IS_WORD_CONTEXT (context
))
1035 return dfa
->init_state_word
;
1036 else if (IS_ORDINARY_CONTEXT (context
))
1037 return dfa
->init_state
;
1038 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1039 return dfa
->init_state_begbuf
;
1040 else if (IS_NEWLINE_CONTEXT (context
))
1041 return dfa
->init_state_nl
;
1042 else if (IS_BEGBUF_CONTEXT (context
))
1044 /* It is relatively rare case, then calculate on demand. */
1045 return re_acquire_state_context (err
, dfa
,
1046 dfa
->init_state
->entrance_nodes
,
1050 /* Must not happen? */
1051 return dfa
->init_state
;
1054 return dfa
->init_state
;
1057 /* Check whether the regular expression match input string INPUT or not,
1058 and return the index where the matching end, return -1 if not match,
1059 or return -2 in case of an error.
1060 FL_LONGEST_MATCH means we want the POSIX longest matching.
1061 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1062 next place where we may want to try matching.
1063 Note that the matcher assume that the maching starts from the current
1064 index of the buffer. */
1068 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1071 const re_dfa_t
*const dfa
= mctx
->dfa
;
1074 int match_last
= -1;
1075 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1076 re_dfastate_t
*cur_state
;
1077 int at_init_state
= p_match_first
!= NULL
;
1078 int next_start_idx
= cur_str_idx
;
1081 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1082 /* An initial state must not be NULL (invalid). */
1083 if (BE (cur_state
== NULL
, 0))
1085 assert (err
== REG_ESPACE
);
1089 if (mctx
->state_log
!= NULL
)
1091 mctx
->state_log
[cur_str_idx
] = cur_state
;
1093 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1094 later. E.g. Processing back references. */
1095 if (BE (dfa
->nbackref
, 0))
1098 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1099 if (BE (err
!= REG_NOERROR
, 0))
1102 if (cur_state
->has_backref
)
1104 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1105 if (BE (err
!= REG_NOERROR
, 0))
1111 /* If the RE accepts NULL string. */
1112 if (BE (cur_state
->halt
, 0))
1114 if (!cur_state
->has_constraint
1115 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1117 if (!fl_longest_match
)
1121 match_last
= cur_str_idx
;
1127 while (!re_string_eoi (&mctx
->input
))
1129 re_dfastate_t
*old_state
= cur_state
;
1130 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1132 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1133 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1134 && mctx
->input
.valid_len
< mctx
->input
.len
))
1136 err
= extend_buffers (mctx
);
1137 if (BE (err
!= REG_NOERROR
, 0))
1139 assert (err
== REG_ESPACE
);
1144 cur_state
= transit_state (&err
, mctx
, cur_state
);
1145 if (mctx
->state_log
!= NULL
)
1146 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1148 if (cur_state
== NULL
)
1150 /* Reached the invalid state or an error. Try to recover a valid
1151 state using the state log, if available and if we have not
1152 already found a valid (even if not the longest) match. */
1153 if (BE (err
!= REG_NOERROR
, 0))
1156 if (mctx
->state_log
== NULL
1157 || (match
&& !fl_longest_match
)
1158 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1162 if (BE (at_init_state
, 0))
1164 if (old_state
== cur_state
)
1165 next_start_idx
= next_char_idx
;
1170 if (cur_state
->halt
)
1172 /* Reached a halt state.
1173 Check the halt state can satisfy the current context. */
1174 if (!cur_state
->has_constraint
1175 || check_halt_state_context (mctx
, cur_state
,
1176 re_string_cur_idx (&mctx
->input
)))
1178 /* We found an appropriate halt state. */
1179 match_last
= re_string_cur_idx (&mctx
->input
);
1182 /* We found a match, do not modify match_first below. */
1183 p_match_first
= NULL
;
1184 if (!fl_longest_match
)
1191 *p_match_first
+= next_start_idx
;
1196 /* Check NODE match the current context. */
1200 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1202 re_token_type_t type
= dfa
->nodes
[node
].type
;
1203 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1204 if (type
!= END_OF_RE
)
1208 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1213 /* Check the halt state STATE match the current context.
1214 Return 0 if not match, if the node, STATE has, is a halt node and
1215 match the context, return the node. */
1219 check_halt_state_context (const re_match_context_t
*mctx
,
1220 const re_dfastate_t
*state
, int idx
)
1223 unsigned int context
;
1225 assert (state
->halt
);
1227 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1228 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1229 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1230 return state
->nodes
.elems
[i
];
1234 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1235 corresponding to the DFA).
1236 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1241 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1242 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1243 struct re_fail_stack_t
*fs
)
1245 const re_dfa_t
*const dfa
= mctx
->dfa
;
1247 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1249 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1250 re_node_set
*edests
= &dfa
->edests
[node
];
1252 err
= re_node_set_insert (eps_via_nodes
, node
);
1253 if (BE (err
< 0, 0))
1255 /* Pick up a valid destination, or return -1 if none is found. */
1256 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1258 int candidate
= edests
->elems
[i
];
1259 if (!re_node_set_contains (cur_nodes
, candidate
))
1261 if (dest_node
== -1)
1262 dest_node
= candidate
;
1266 /* In order to avoid infinite loop like "(a*)*", return the second
1267 epsilon-transition if the first was already considered. */
1268 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1271 /* Otherwise, push the second epsilon-transition on the fail stack. */
1273 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1277 /* We know we are going to exit. */
1286 re_token_type_t type
= dfa
->nodes
[node
].type
;
1288 #ifdef RE_ENABLE_I18N
1289 if (dfa
->nodes
[node
].accept_mb
)
1290 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1292 #endif /* RE_ENABLE_I18N */
1293 if (type
== OP_BACK_REF
)
1295 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1296 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1299 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1303 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1304 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1313 err
= re_node_set_insert (eps_via_nodes
, node
);
1314 if (BE (err
< 0, 0))
1316 dest_node
= dfa
->edests
[node
].elems
[0];
1317 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1324 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1326 int dest_node
= dfa
->nexts
[node
];
1327 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1328 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1329 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1332 re_node_set_empty (eps_via_nodes
);
1339 static reg_errcode_t
1341 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1342 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1345 int num
= fs
->num
++;
1346 if (fs
->num
== fs
->alloc
)
1348 struct re_fail_stack_ent_t
*new_array
;
1349 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1351 if (new_array
== NULL
)
1354 fs
->stack
= new_array
;
1356 fs
->stack
[num
].idx
= str_idx
;
1357 fs
->stack
[num
].node
= dest_node
;
1358 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1359 if (fs
->stack
[num
].regs
== NULL
)
1361 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1362 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1368 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1369 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1371 int num
= --fs
->num
;
1373 *pidx
= fs
->stack
[num
].idx
;
1374 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1375 re_node_set_free (eps_via_nodes
);
1376 re_free (fs
->stack
[num
].regs
);
1377 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1378 return fs
->stack
[num
].node
;
1381 /* Set the positions where the subexpressions are starts/ends to registers
1383 Note: We assume that pmatch[0] is already set, and
1384 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1386 static reg_errcode_t
1388 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1389 regmatch_t
*pmatch
, int fl_backtrack
)
1391 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1393 re_node_set eps_via_nodes
;
1394 struct re_fail_stack_t
*fs
;
1395 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1396 regmatch_t
*prev_idx_match
;
1397 int prev_idx_match_malloced
= 0;
1400 assert (nmatch
> 1);
1401 assert (mctx
->state_log
!= NULL
);
1406 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1407 if (fs
->stack
== NULL
)
1413 cur_node
= dfa
->init_node
;
1414 re_node_set_init_empty (&eps_via_nodes
);
1416 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1417 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1420 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1421 if (prev_idx_match
== NULL
)
1423 free_fail_stack_return (fs
);
1426 prev_idx_match_malloced
= 1;
1428 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1430 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1432 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1434 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1439 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1440 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1442 if (reg_idx
== nmatch
)
1444 re_node_set_free (&eps_via_nodes
);
1445 if (prev_idx_match_malloced
)
1446 re_free (prev_idx_match
);
1447 return free_fail_stack_return (fs
);
1449 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1454 re_node_set_free (&eps_via_nodes
);
1455 if (prev_idx_match_malloced
)
1456 re_free (prev_idx_match
);
1461 /* Proceed to next node. */
1462 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1463 &eps_via_nodes
, fs
);
1465 if (BE (cur_node
< 0, 0))
1467 if (BE (cur_node
== -2, 0))
1469 re_node_set_free (&eps_via_nodes
);
1470 if (prev_idx_match_malloced
)
1471 re_free (prev_idx_match
);
1472 free_fail_stack_return (fs
);
1476 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1480 re_node_set_free (&eps_via_nodes
);
1481 if (prev_idx_match_malloced
)
1482 re_free (prev_idx_match
);
1487 re_node_set_free (&eps_via_nodes
);
1488 if (prev_idx_match_malloced
)
1489 re_free (prev_idx_match
);
1490 return free_fail_stack_return (fs
);
1493 static reg_errcode_t
1495 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1500 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1502 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1503 re_free (fs
->stack
[fs_idx
].regs
);
1505 re_free (fs
->stack
);
1512 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1513 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1515 int type
= dfa
->nodes
[cur_node
].type
;
1516 if (type
== OP_OPEN_SUBEXP
)
1518 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1520 /* We are at the first node of this sub expression. */
1521 if (reg_num
< nmatch
)
1523 pmatch
[reg_num
].rm_so
= cur_idx
;
1524 pmatch
[reg_num
].rm_eo
= -1;
1527 else if (type
== OP_CLOSE_SUBEXP
)
1529 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1530 if (reg_num
< nmatch
)
1532 /* We are at the last node of this sub expression. */
1533 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1535 pmatch
[reg_num
].rm_eo
= cur_idx
;
1536 /* This is a non-empty match or we are not inside an optional
1537 subexpression. Accept this right away. */
1538 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1542 if (dfa
->nodes
[cur_node
].opt_subexp
1543 && prev_idx_match
[reg_num
].rm_so
!= -1)
1544 /* We transited through an empty match for an optional
1545 subexpression, like (a?)*, and this is not the subexp's
1546 first match. Copy back the old content of the registers
1547 so that matches of an inner subexpression are undone as
1548 well, like in ((a?))*. */
1549 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1551 /* We completed a subexpression, but it may be part of
1552 an optional one, so do not update PREV_IDX_MATCH. */
1553 pmatch
[reg_num
].rm_eo
= cur_idx
;
1559 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1560 and sift the nodes in each states according to the following rules.
1561 Updated state_log will be wrote to STATE_LOG.
1563 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1564 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1565 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1566 the LAST_NODE, we throw away the node `a'.
1567 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1568 string `s' and transit to `b':
1569 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1571 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1572 thrown away, we throw away the node `a'.
1573 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1576 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1577 we throw away the node `a'. */
1579 #define STATE_NODE_CONTAINS(state,node) \
1580 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1582 static reg_errcode_t
1584 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1588 int str_idx
= sctx
->last_str_idx
;
1589 re_node_set cur_dest
;
1592 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1595 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1596 transit to the last_node and the last_node itself. */
1597 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1598 if (BE (err
!= REG_NOERROR
, 0))
1600 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1601 if (BE (err
!= REG_NOERROR
, 0))
1604 /* Then check each states in the state_log. */
1607 /* Update counters. */
1608 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1609 if (null_cnt
> mctx
->max_mb_elem_len
)
1611 memset (sctx
->sifted_states
, '\0',
1612 sizeof (re_dfastate_t
*) * str_idx
);
1613 re_node_set_free (&cur_dest
);
1616 re_node_set_empty (&cur_dest
);
1619 if (mctx
->state_log
[str_idx
])
1621 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1622 if (BE (err
!= REG_NOERROR
, 0))
1626 /* Add all the nodes which satisfy the following conditions:
1627 - It can epsilon transit to a node in CUR_DEST.
1629 And update state_log. */
1630 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1631 if (BE (err
!= REG_NOERROR
, 0))
1636 re_node_set_free (&cur_dest
);
1640 static reg_errcode_t
1642 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1643 int str_idx
, re_node_set
*cur_dest
)
1645 const re_dfa_t
*const dfa
= mctx
->dfa
;
1646 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1649 /* Then build the next sifted state.
1650 We build the next sifted state on `cur_dest', and update
1651 `sifted_states[str_idx]' with `cur_dest'.
1653 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654 `cur_src' points the node_set of the old `state_log[str_idx]'
1655 (with the epsilon nodes pre-filtered out). */
1656 for (i
= 0; i
< cur_src
->nelem
; i
++)
1658 int prev_node
= cur_src
->elems
[i
];
1663 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1664 assert (!IS_EPSILON_NODE (type
));
1666 #ifdef RE_ENABLE_I18N
1667 /* If the node may accept `multi byte'. */
1668 if (dfa
->nodes
[prev_node
].accept_mb
)
1669 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1670 str_idx
, sctx
->last_str_idx
);
1671 #endif /* RE_ENABLE_I18N */
1673 /* We don't check backreferences here.
1674 See update_cur_sifted_state(). */
1676 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1677 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1678 dfa
->nexts
[prev_node
]))
1684 if (sctx
->limits
.nelem
)
1686 int to_idx
= str_idx
+ naccepted
;
1687 if (check_dst_limits (mctx
, &sctx
->limits
,
1688 dfa
->nexts
[prev_node
], to_idx
,
1689 prev_node
, str_idx
))
1692 ret
= re_node_set_insert (cur_dest
, prev_node
);
1693 if (BE (ret
== -1, 0))
1700 /* Helper functions. */
1702 static reg_errcode_t
1704 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1706 int top
= mctx
->state_log_top
;
1708 if (next_state_log_idx
>= mctx
->input
.bufs_len
1709 || (next_state_log_idx
>= mctx
->input
.valid_len
1710 && mctx
->input
.valid_len
< mctx
->input
.len
))
1713 err
= extend_buffers (mctx
);
1714 if (BE (err
!= REG_NOERROR
, 0))
1718 if (top
< next_state_log_idx
)
1720 memset (mctx
->state_log
+ top
+ 1, '\0',
1721 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1722 mctx
->state_log_top
= next_state_log_idx
;
1727 static reg_errcode_t
1729 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1730 re_dfastate_t
**src
, int num
)
1734 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1736 if (dst
[st_idx
] == NULL
)
1737 dst
[st_idx
] = src
[st_idx
];
1738 else if (src
[st_idx
] != NULL
)
1740 re_node_set merged_set
;
1741 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1742 &src
[st_idx
]->nodes
);
1743 if (BE (err
!= REG_NOERROR
, 0))
1745 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1746 re_node_set_free (&merged_set
);
1747 if (BE (err
!= REG_NOERROR
, 0))
1754 static reg_errcode_t
1756 update_cur_sifted_state (const re_match_context_t
*mctx
,
1757 re_sift_context_t
*sctx
, int str_idx
,
1758 re_node_set
*dest_nodes
)
1760 const re_dfa_t
*const dfa
= mctx
->dfa
;
1761 reg_errcode_t err
= REG_NOERROR
;
1762 const re_node_set
*candidates
;
1763 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1764 : &mctx
->state_log
[str_idx
]->nodes
);
1766 if (dest_nodes
->nelem
== 0)
1767 sctx
->sifted_states
[str_idx
] = NULL
;
1772 /* At first, add the nodes which can epsilon transit to a node in
1774 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1775 if (BE (err
!= REG_NOERROR
, 0))
1778 /* Then, check the limitations in the current sift_context. */
1779 if (sctx
->limits
.nelem
)
1781 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1782 mctx
->bkref_ents
, str_idx
);
1783 if (BE (err
!= REG_NOERROR
, 0))
1788 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1789 if (BE (err
!= REG_NOERROR
, 0))
1793 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1795 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1796 if (BE (err
!= REG_NOERROR
, 0))
1802 static reg_errcode_t
1804 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1805 const re_node_set
*candidates
)
1807 reg_errcode_t err
= REG_NOERROR
;
1810 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1811 if (BE (err
!= REG_NOERROR
, 0))
1814 if (!state
->inveclosure
.alloc
)
1816 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1817 if (BE (err
!= REG_NOERROR
, 0))
1819 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1820 re_node_set_merge (&state
->inveclosure
,
1821 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1823 return re_node_set_add_intersect (dest_nodes
, candidates
,
1824 &state
->inveclosure
);
1827 static reg_errcode_t
1829 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1830 const re_node_set
*candidates
)
1834 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1835 re_node_set except_nodes
;
1836 re_node_set_init_empty (&except_nodes
);
1837 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1839 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1840 if (cur_node
== node
)
1842 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1844 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1845 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1846 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1847 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1848 && re_node_set_contains (dest_nodes
, edst1
))
1850 && !re_node_set_contains (inv_eclosure
, edst2
)
1851 && re_node_set_contains (dest_nodes
, edst2
)))
1853 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1854 dfa
->inveclosures
+ cur_node
);
1855 if (BE (err
!= REG_NOERROR
, 0))
1857 re_node_set_free (&except_nodes
);
1863 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1865 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1866 if (!re_node_set_contains (&except_nodes
, cur_node
))
1868 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1869 re_node_set_remove_at (dest_nodes
, idx
);
1872 re_node_set_free (&except_nodes
);
1878 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1879 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1881 const re_dfa_t
*const dfa
= mctx
->dfa
;
1882 int lim_idx
, src_pos
, dst_pos
;
1884 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1885 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1886 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1889 struct re_backref_cache_entry
*ent
;
1890 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1891 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1893 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1894 subexp_idx
, dst_node
, dst_idx
,
1896 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1897 subexp_idx
, src_node
, src_idx
,
1901 <src> <dst> ( <subexp> )
1902 ( <subexp> ) <src> <dst>
1903 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1904 if (src_pos
== dst_pos
)
1905 continue; /* This is unrelated limitation. */
1914 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1915 int subexp_idx
, int from_node
, int bkref_idx
)
1917 const re_dfa_t
*const dfa
= mctx
->dfa
;
1918 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1921 /* Else, we are on the boundary: examine the nodes on the epsilon
1923 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1925 int node
= eclosures
->elems
[node_idx
];
1926 switch (dfa
->nodes
[node
].type
)
1929 if (bkref_idx
!= -1)
1931 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1936 if (ent
->node
!= node
)
1939 if (subexp_idx
< BITSET_WORD_BITS
1940 && !(ent
->eps_reachable_subexps_map
1941 & ((bitset_word_t
) 1 << subexp_idx
)))
1944 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945 OP_CLOSE_SUBEXP cases below. But, if the
1946 destination node is the same node as the source
1947 node, don't recurse because it would cause an
1948 infinite loop: a regex that exhibits this behavior
1950 dst
= dfa
->edests
[node
].elems
[0];
1951 if (dst
== from_node
)
1955 else /* if (boundaries & 2) */
1960 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1962 if (cpos
== -1 /* && (boundaries & 1) */)
1964 if (cpos
== 0 && (boundaries
& 2))
1967 if (subexp_idx
< BITSET_WORD_BITS
)
1968 ent
->eps_reachable_subexps_map
1969 &= ~((bitset_word_t
) 1 << subexp_idx
);
1971 while (ent
++->more
);
1975 case OP_OPEN_SUBEXP
:
1976 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1980 case OP_CLOSE_SUBEXP
:
1981 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1990 return (boundaries
& 2) ? 1 : 0;
1995 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
1996 int subexp_idx
, int from_node
, int str_idx
,
1999 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2002 /* If we are outside the range of the subexpression, return -1 or 1. */
2003 if (str_idx
< lim
->subexp_from
)
2006 if (lim
->subexp_to
< str_idx
)
2009 /* If we are within the subexpression, return 0. */
2010 boundaries
= (str_idx
== lim
->subexp_from
);
2011 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2012 if (boundaries
== 0)
2015 /* Else, examine epsilon closure. */
2016 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2017 from_node
, bkref_idx
);
2020 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2021 which are against limitations from DEST_NODES. */
2023 static reg_errcode_t
2025 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2026 const re_node_set
*candidates
, re_node_set
*limits
,
2027 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2030 int node_idx
, lim_idx
;
2032 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2035 struct re_backref_cache_entry
*ent
;
2036 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2038 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2039 continue; /* This is unrelated limitation. */
2041 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2042 if (ent
->subexp_to
== str_idx
)
2046 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2048 int node
= dest_nodes
->elems
[node_idx
];
2049 re_token_type_t type
= dfa
->nodes
[node
].type
;
2050 if (type
== OP_OPEN_SUBEXP
2051 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2053 else if (type
== OP_CLOSE_SUBEXP
2054 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2058 /* Check the limitation of the open subexpression. */
2059 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2062 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2064 if (BE (err
!= REG_NOERROR
, 0))
2068 /* Check the limitation of the close subexpression. */
2070 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2072 int node
= dest_nodes
->elems
[node_idx
];
2073 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2075 && !re_node_set_contains (dfa
->eclosures
+ node
,
2078 /* It is against this limitation.
2079 Remove it form the current sifted state. */
2080 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2082 if (BE (err
!= REG_NOERROR
, 0))
2088 else /* (ent->subexp_to != str_idx) */
2090 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2092 int node
= dest_nodes
->elems
[node_idx
];
2093 re_token_type_t type
= dfa
->nodes
[node
].type
;
2094 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2096 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2098 /* It is against this limitation.
2099 Remove it form the current sifted state. */
2100 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2102 if (BE (err
!= REG_NOERROR
, 0))
2111 static reg_errcode_t
2113 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2114 int str_idx
, const re_node_set
*candidates
)
2116 const re_dfa_t
*const dfa
= mctx
->dfa
;
2119 re_sift_context_t local_sctx
;
2120 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2122 if (first_idx
== -1)
2125 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2127 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2130 re_token_type_t type
;
2131 struct re_backref_cache_entry
*entry
;
2132 node
= candidates
->elems
[node_idx
];
2133 type
= dfa
->nodes
[node
].type
;
2134 /* Avoid infinite loop for the REs like "()\1+". */
2135 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2137 if (type
!= OP_BACK_REF
)
2140 entry
= mctx
->bkref_ents
+ first_idx
;
2141 enabled_idx
= first_idx
;
2148 re_dfastate_t
*cur_state
;
2150 if (entry
->node
!= node
)
2152 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2153 to_idx
= str_idx
+ subexp_len
;
2154 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2155 : dfa
->edests
[node
].elems
[0]);
2157 if (to_idx
> sctx
->last_str_idx
2158 || sctx
->sifted_states
[to_idx
] == NULL
2159 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2160 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2161 str_idx
, dst_node
, to_idx
))
2164 if (local_sctx
.sifted_states
== NULL
)
2167 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2168 if (BE (err
!= REG_NOERROR
, 0))
2171 local_sctx
.last_node
= node
;
2172 local_sctx
.last_str_idx
= str_idx
;
2173 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2174 if (BE (ret
< 0, 0))
2179 cur_state
= local_sctx
.sifted_states
[str_idx
];
2180 err
= sift_states_backward (mctx
, &local_sctx
);
2181 if (BE (err
!= REG_NOERROR
, 0))
2183 if (sctx
->limited_states
!= NULL
)
2185 err
= merge_state_array (dfa
, sctx
->limited_states
,
2186 local_sctx
.sifted_states
,
2188 if (BE (err
!= REG_NOERROR
, 0))
2191 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2192 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2194 /* mctx->bkref_ents may have changed, reload the pointer. */
2195 entry
= mctx
->bkref_ents
+ enabled_idx
;
2197 while (enabled_idx
++, entry
++->more
);
2201 if (local_sctx
.sifted_states
!= NULL
)
2203 re_node_set_free (&local_sctx
.limits
);
2210 #ifdef RE_ENABLE_I18N
2213 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2214 int node_idx
, int str_idx
, int max_str_idx
)
2216 const re_dfa_t
*const dfa
= mctx
->dfa
;
2218 /* Check the node can accept `multi byte'. */
2219 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2220 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2221 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2222 dfa
->nexts
[node_idx
]))
2223 /* The node can't accept the `multi byte', or the
2224 destination was already thrown away, then the node
2225 could't accept the current input `multi byte'. */
2227 /* Otherwise, it is sure that the node could accept
2228 `naccepted' bytes input. */
2231 #endif /* RE_ENABLE_I18N */
2234 /* Functions for state transition. */
2236 /* Return the next state to which the current state STATE will transit by
2237 accepting the current input byte, and update STATE_LOG if necessary.
2238 If STATE can accept a multibyte char/collating element/back reference
2239 update the destination of STATE_LOG. */
2241 static re_dfastate_t
*
2243 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2244 re_dfastate_t
*state
)
2246 re_dfastate_t
**trtable
;
2249 #ifdef RE_ENABLE_I18N
2250 /* If the current state can accept multibyte. */
2251 if (BE (state
->accept_mb
, 0))
2253 *err
= transit_state_mb (mctx
, state
);
2254 if (BE (*err
!= REG_NOERROR
, 0))
2257 #endif /* RE_ENABLE_I18N */
2259 /* Then decide the next state with the single byte. */
2262 /* don't use transition table */
2263 return transit_state_sb (err
, mctx
, state
);
2266 /* Use transition table */
2267 ch
= re_string_fetch_byte (&mctx
->input
);
2270 trtable
= state
->trtable
;
2271 if (BE (trtable
!= NULL
, 1))
2274 trtable
= state
->word_trtable
;
2275 if (BE (trtable
!= NULL
, 1))
2277 unsigned int context
;
2279 = re_string_context_at (&mctx
->input
,
2280 re_string_cur_idx (&mctx
->input
) - 1,
2282 if (IS_WORD_CONTEXT (context
))
2283 return trtable
[ch
+ SBC_MAX
];
2288 if (!build_trtable (mctx
->dfa
, state
))
2294 /* Retry, we now have a transition table. */
2298 /* Update the state_log if we need */
2301 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2302 re_dfastate_t
*next_state
)
2304 const re_dfa_t
*const dfa
= mctx
->dfa
;
2305 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2307 if (cur_idx
> mctx
->state_log_top
)
2309 mctx
->state_log
[cur_idx
] = next_state
;
2310 mctx
->state_log_top
= cur_idx
;
2312 else if (mctx
->state_log
[cur_idx
] == 0)
2314 mctx
->state_log
[cur_idx
] = next_state
;
2318 re_dfastate_t
*pstate
;
2319 unsigned int context
;
2320 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2321 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2322 the destination of a multibyte char/collating element/
2323 back reference. Then the next state is the union set of
2324 these destinations and the results of the transition table. */
2325 pstate
= mctx
->state_log
[cur_idx
];
2326 log_nodes
= pstate
->entrance_nodes
;
2327 if (next_state
!= NULL
)
2329 table_nodes
= next_state
->entrance_nodes
;
2330 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2332 if (BE (*err
!= REG_NOERROR
, 0))
2336 next_nodes
= *log_nodes
;
2337 /* Note: We already add the nodes of the initial state,
2338 then we don't need to add them here. */
2340 context
= re_string_context_at (&mctx
->input
,
2341 re_string_cur_idx (&mctx
->input
) - 1,
2343 next_state
= mctx
->state_log
[cur_idx
]
2344 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2345 /* We don't need to check errors here, since the return value of
2346 this function is next_state and ERR is already set. */
2348 if (table_nodes
!= NULL
)
2349 re_node_set_free (&next_nodes
);
2352 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2354 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2355 later. We must check them here, since the back references in the
2356 next state might use them. */
2357 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2359 if (BE (*err
!= REG_NOERROR
, 0))
2362 /* If the next state has back references. */
2363 if (next_state
->has_backref
)
2365 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2366 if (BE (*err
!= REG_NOERROR
, 0))
2368 next_state
= mctx
->state_log
[cur_idx
];
2375 /* Skip bytes in the input that correspond to part of a
2376 multi-byte match, then look in the log for a state
2377 from which to restart matching. */
2380 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2382 re_dfastate_t
*cur_state
;
2385 int max
= mctx
->state_log_top
;
2386 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2390 if (++cur_str_idx
> max
)
2392 re_string_skip_bytes (&mctx
->input
, 1);
2394 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2396 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2398 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2402 /* Helper functions for transit_state. */
2404 /* From the node set CUR_NODES, pick up the nodes whose types are
2405 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2406 expression. And register them to use them later for evaluating the
2407 correspoding back references. */
2409 static reg_errcode_t
2411 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2414 const re_dfa_t
*const dfa
= mctx
->dfa
;
2418 /* TODO: This isn't efficient.
2419 Because there might be more than one nodes whose types are
2420 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2423 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2425 int node
= cur_nodes
->elems
[node_idx
];
2426 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2427 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2428 && (dfa
->used_bkref_map
2429 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2431 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2432 if (BE (err
!= REG_NOERROR
, 0))
2440 /* Return the next state to which the current state STATE will transit by
2441 accepting the current input byte. */
2443 static re_dfastate_t
*
2444 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2445 re_dfastate_t
*state
)
2447 const re_dfa_t
*const dfa
= mctx
->dfa
;
2448 re_node_set next_nodes
;
2449 re_dfastate_t
*next_state
;
2450 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2451 unsigned int context
;
2453 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2454 if (BE (*err
!= REG_NOERROR
, 0))
2456 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2458 int cur_node
= state
->nodes
.elems
[node_cnt
];
2459 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2461 *err
= re_node_set_merge (&next_nodes
,
2462 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2463 if (BE (*err
!= REG_NOERROR
, 0))
2465 re_node_set_free (&next_nodes
);
2470 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2471 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2472 /* We don't need to check errors here, since the return value of
2473 this function is next_state and ERR is already set. */
2475 re_node_set_free (&next_nodes
);
2476 re_string_skip_bytes (&mctx
->input
, 1);
2481 #ifdef RE_ENABLE_I18N
2482 static reg_errcode_t
2484 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2486 const re_dfa_t
*const dfa
= mctx
->dfa
;
2490 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2492 re_node_set dest_nodes
, *new_nodes
;
2493 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2494 int naccepted
, dest_idx
;
2495 unsigned int context
;
2496 re_dfastate_t
*dest_state
;
2498 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2501 if (dfa
->nodes
[cur_node_idx
].constraint
)
2503 context
= re_string_context_at (&mctx
->input
,
2504 re_string_cur_idx (&mctx
->input
),
2506 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2511 /* How many bytes the node can accept? */
2512 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2513 re_string_cur_idx (&mctx
->input
));
2517 /* The node can accepts `naccepted' bytes. */
2518 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2519 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2520 : mctx
->max_mb_elem_len
);
2521 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2522 if (BE (err
!= REG_NOERROR
, 0))
2525 assert (dfa
->nexts
[cur_node_idx
] != -1);
2527 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2529 dest_state
= mctx
->state_log
[dest_idx
];
2530 if (dest_state
== NULL
)
2531 dest_nodes
= *new_nodes
;
2534 err
= re_node_set_init_union (&dest_nodes
,
2535 dest_state
->entrance_nodes
, new_nodes
);
2536 if (BE (err
!= REG_NOERROR
, 0))
2539 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2541 mctx
->state_log
[dest_idx
]
2542 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2543 if (dest_state
!= NULL
)
2544 re_node_set_free (&dest_nodes
);
2545 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2550 #endif /* RE_ENABLE_I18N */
2552 static reg_errcode_t
2554 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2556 const re_dfa_t
*const dfa
= mctx
->dfa
;
2559 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2561 for (i
= 0; i
< nodes
->nelem
; ++i
)
2563 int dest_str_idx
, prev_nelem
, bkc_idx
;
2564 int node_idx
= nodes
->elems
[i
];
2565 unsigned int context
;
2566 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2567 re_node_set
*new_dest_nodes
;
2569 /* Check whether `node' is a backreference or not. */
2570 if (node
->type
!= OP_BACK_REF
)
2573 if (node
->constraint
)
2575 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2577 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2581 /* `node' is a backreference.
2582 Check the substring which the substring matched. */
2583 bkc_idx
= mctx
->nbkref_ents
;
2584 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2585 if (BE (err
!= REG_NOERROR
, 0))
2588 /* And add the epsilon closures (which is `new_dest_nodes') of
2589 the backreference to appropriate state_log. */
2591 assert (dfa
->nexts
[node_idx
] != -1);
2593 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2596 re_dfastate_t
*dest_state
;
2597 struct re_backref_cache_entry
*bkref_ent
;
2598 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2599 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2601 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2602 new_dest_nodes
= (subexp_len
== 0
2603 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2604 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2605 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2606 - bkref_ent
->subexp_from
);
2607 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2609 dest_state
= mctx
->state_log
[dest_str_idx
];
2610 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2611 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2612 /* Add `new_dest_node' to state_log. */
2613 if (dest_state
== NULL
)
2615 mctx
->state_log
[dest_str_idx
]
2616 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2618 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2619 && err
!= REG_NOERROR
, 0))
2624 re_node_set dest_nodes
;
2625 err
= re_node_set_init_union (&dest_nodes
,
2626 dest_state
->entrance_nodes
,
2628 if (BE (err
!= REG_NOERROR
, 0))
2630 re_node_set_free (&dest_nodes
);
2633 mctx
->state_log
[dest_str_idx
]
2634 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2635 re_node_set_free (&dest_nodes
);
2636 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2637 && err
!= REG_NOERROR
, 0))
2640 /* We need to check recursively if the backreference can epsilon
2643 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2645 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2647 if (BE (err
!= REG_NOERROR
, 0))
2649 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2650 if (BE (err
!= REG_NOERROR
, 0))
2660 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2661 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2662 Note that we might collect inappropriate candidates here.
2663 However, the cost of checking them strictly here is too high, then we
2664 delay these checking for prune_impossible_nodes(). */
2666 static reg_errcode_t
2668 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2670 const re_dfa_t
*const dfa
= mctx
->dfa
;
2671 int subexp_num
, sub_top_idx
;
2672 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2673 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2674 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2675 if (cache_idx
!= -1)
2677 const struct re_backref_cache_entry
*entry
2678 = mctx
->bkref_ents
+ cache_idx
;
2680 if (entry
->node
== bkref_node
)
2681 return REG_NOERROR
; /* We already checked it. */
2682 while (entry
++->more
);
2685 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2687 /* For each sub expression */
2688 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2691 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2692 re_sub_match_last_t
*sub_last
;
2693 int sub_last_idx
, sl_str
, bkref_str_off
;
2695 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2696 continue; /* It isn't related. */
2698 sl_str
= sub_top
->str_idx
;
2699 bkref_str_off
= bkref_str_idx
;
2700 /* At first, check the last node of sub expressions we already
2702 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2705 sub_last
= sub_top
->lasts
[sub_last_idx
];
2706 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2707 /* The matched string by the sub expression match with the substring
2708 at the back reference? */
2709 if (sl_str_diff
> 0)
2711 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2713 /* Not enough chars for a successful match. */
2714 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2717 err
= clean_state_log_if_needed (mctx
,
2720 if (BE (err
!= REG_NOERROR
, 0))
2722 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2724 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2725 /* We don't need to search this sub expression any more. */
2728 bkref_str_off
+= sl_str_diff
;
2729 sl_str
+= sl_str_diff
;
2730 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2733 /* Reload buf, since the preceding call might have reallocated
2735 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2737 if (err
== REG_NOMATCH
)
2739 if (BE (err
!= REG_NOERROR
, 0))
2743 if (sub_last_idx
< sub_top
->nlasts
)
2745 if (sub_last_idx
> 0)
2747 /* Then, search for the other last nodes of the sub expression. */
2748 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2750 int cls_node
, sl_str_off
;
2751 const re_node_set
*nodes
;
2752 sl_str_off
= sl_str
- sub_top
->str_idx
;
2753 /* The matched string by the sub expression match with the substring
2754 at the back reference? */
2757 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2759 /* If we are at the end of the input, we cannot match. */
2760 if (bkref_str_off
>= mctx
->input
.len
)
2763 err
= extend_buffers (mctx
);
2764 if (BE (err
!= REG_NOERROR
, 0))
2767 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2769 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2770 break; /* We don't need to search this sub expression
2773 if (mctx
->state_log
[sl_str
] == NULL
)
2775 /* Does this state have a ')' of the sub expression? */
2776 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2777 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2781 if (sub_top
->path
== NULL
)
2783 sub_top
->path
= calloc (sizeof (state_array_t
),
2784 sl_str
- sub_top
->str_idx
+ 1);
2785 if (sub_top
->path
== NULL
)
2788 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2789 in the current context? */
2790 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2791 sub_top
->str_idx
, cls_node
, sl_str
,
2793 if (err
== REG_NOMATCH
)
2795 if (BE (err
!= REG_NOERROR
, 0))
2797 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2798 if (BE (sub_last
== NULL
, 0))
2800 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2802 if (err
== REG_NOMATCH
)
2809 /* Helper functions for get_subexp(). */
2811 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2812 If it can arrive, register the sub expression expressed with SUB_TOP
2815 static reg_errcode_t
2817 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2818 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2822 /* Can the subexpression arrive the back reference? */
2823 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2824 sub_last
->str_idx
, bkref_node
, bkref_str
,
2826 if (err
!= REG_NOERROR
)
2828 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2830 if (BE (err
!= REG_NOERROR
, 0))
2832 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2833 return clean_state_log_if_needed (mctx
, to_idx
);
2836 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2837 Search '(' if FL_OPEN, or search ')' otherwise.
2838 TODO: This function isn't efficient...
2839 Because there might be more than one nodes whose types are
2840 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2846 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2847 int subexp_idx
, int type
)
2850 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2852 int cls_node
= nodes
->elems
[cls_idx
];
2853 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2854 if (node
->type
== type
2855 && node
->opr
.idx
== subexp_idx
)
2861 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2862 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2864 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2866 static reg_errcode_t
2868 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2869 int top_str
, int last_node
, int last_str
, int type
)
2871 const re_dfa_t
*const dfa
= mctx
->dfa
;
2872 reg_errcode_t err
= REG_NOERROR
;
2873 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2874 re_dfastate_t
*cur_state
= NULL
;
2875 re_node_set
*cur_nodes
, next_nodes
;
2876 re_dfastate_t
**backup_state_log
;
2877 unsigned int context
;
2879 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2880 /* Extend the buffer if we need. */
2881 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2883 re_dfastate_t
**new_array
;
2884 int old_alloc
= path
->alloc
;
2885 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2886 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2887 if (BE (new_array
== NULL
, 0))
2889 path
->alloc
= old_alloc
;
2892 path
->array
= new_array
;
2893 memset (new_array
+ old_alloc
, '\0',
2894 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2897 str_idx
= path
->next_idx
?: top_str
;
2899 /* Temporary modify MCTX. */
2900 backup_state_log
= mctx
->state_log
;
2901 backup_cur_idx
= mctx
->input
.cur_idx
;
2902 mctx
->state_log
= path
->array
;
2903 mctx
->input
.cur_idx
= str_idx
;
2905 /* Setup initial node set. */
2906 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2907 if (str_idx
== top_str
)
2909 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2910 if (BE (err
!= REG_NOERROR
, 0))
2912 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2913 if (BE (err
!= REG_NOERROR
, 0))
2915 re_node_set_free (&next_nodes
);
2921 cur_state
= mctx
->state_log
[str_idx
];
2922 if (cur_state
&& cur_state
->has_backref
)
2924 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2925 if (BE (err
!= REG_NOERROR
, 0))
2929 re_node_set_init_empty (&next_nodes
);
2931 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2933 if (next_nodes
.nelem
)
2935 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2937 if (BE (err
!= REG_NOERROR
, 0))
2939 re_node_set_free (&next_nodes
);
2943 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2944 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2946 re_node_set_free (&next_nodes
);
2949 mctx
->state_log
[str_idx
] = cur_state
;
2952 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2954 re_node_set_empty (&next_nodes
);
2955 if (mctx
->state_log
[str_idx
+ 1])
2957 err
= re_node_set_merge (&next_nodes
,
2958 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2959 if (BE (err
!= REG_NOERROR
, 0))
2961 re_node_set_free (&next_nodes
);
2967 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2968 &cur_state
->non_eps_nodes
,
2970 if (BE (err
!= REG_NOERROR
, 0))
2972 re_node_set_free (&next_nodes
);
2977 if (next_nodes
.nelem
)
2979 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2980 if (BE (err
!= REG_NOERROR
, 0))
2982 re_node_set_free (&next_nodes
);
2985 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2987 if (BE (err
!= REG_NOERROR
, 0))
2989 re_node_set_free (&next_nodes
);
2993 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2994 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2995 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2997 re_node_set_free (&next_nodes
);
3000 mctx
->state_log
[str_idx
] = cur_state
;
3001 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3003 re_node_set_free (&next_nodes
);
3004 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3005 : &mctx
->state_log
[last_str
]->nodes
);
3006 path
->next_idx
= str_idx
;
3009 mctx
->state_log
= backup_state_log
;
3010 mctx
->input
.cur_idx
= backup_cur_idx
;
3012 /* Then check the current node set has the node LAST_NODE. */
3013 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3019 /* Helper functions for check_arrival. */
3021 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3023 TODO: This function is similar to the functions transit_state*(),
3024 however this function has many additional works.
3025 Can't we unify them? */
3027 static reg_errcode_t
3029 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3030 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3032 const re_dfa_t
*const dfa
= mctx
->dfa
;
3035 reg_errcode_t err
= REG_NOERROR
;
3036 re_node_set union_set
;
3037 re_node_set_init_empty (&union_set
);
3038 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3041 int cur_node
= cur_nodes
->elems
[cur_idx
];
3043 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3044 assert (!IS_EPSILON_NODE (type
));
3046 #ifdef RE_ENABLE_I18N
3047 /* If the node may accept `multi byte'. */
3048 if (dfa
->nodes
[cur_node
].accept_mb
)
3050 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3054 re_dfastate_t
*dest_state
;
3055 int next_node
= dfa
->nexts
[cur_node
];
3056 int next_idx
= str_idx
+ naccepted
;
3057 dest_state
= mctx
->state_log
[next_idx
];
3058 re_node_set_empty (&union_set
);
3061 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3062 if (BE (err
!= REG_NOERROR
, 0))
3064 re_node_set_free (&union_set
);
3068 result
= re_node_set_insert (&union_set
, next_node
);
3069 if (BE (result
< 0, 0))
3071 re_node_set_free (&union_set
);
3074 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3076 if (BE (mctx
->state_log
[next_idx
] == NULL
3077 && err
!= REG_NOERROR
, 0))
3079 re_node_set_free (&union_set
);
3084 #endif /* RE_ENABLE_I18N */
3086 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3088 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3089 if (BE (result
< 0, 0))
3091 re_node_set_free (&union_set
);
3096 re_node_set_free (&union_set
);
3100 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3101 CUR_NODES, however exclude the nodes which are:
3102 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3103 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3106 static reg_errcode_t
3108 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3109 int ex_subexp
, int type
)
3112 int idx
, outside_node
;
3113 re_node_set new_nodes
;
3115 assert (cur_nodes
->nelem
);
3117 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3118 if (BE (err
!= REG_NOERROR
, 0))
3120 /* Create a new node set NEW_NODES with the nodes which are epsilon
3121 closures of the node in CUR_NODES. */
3123 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3125 int cur_node
= cur_nodes
->elems
[idx
];
3126 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3127 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3128 if (outside_node
== -1)
3130 /* There are no problematic nodes, just merge them. */
3131 err
= re_node_set_merge (&new_nodes
, eclosure
);
3132 if (BE (err
!= REG_NOERROR
, 0))
3134 re_node_set_free (&new_nodes
);
3140 /* There are problematic nodes, re-calculate incrementally. */
3141 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3143 if (BE (err
!= REG_NOERROR
, 0))
3145 re_node_set_free (&new_nodes
);
3150 re_node_set_free (cur_nodes
);
3151 *cur_nodes
= new_nodes
;
3155 /* Helper function for check_arrival_expand_ecl.
3156 Check incrementally the epsilon closure of TARGET, and if it isn't
3157 problematic append it to DST_NODES. */
3159 static reg_errcode_t
3161 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3162 int target
, int ex_subexp
, int type
)
3165 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3169 if (dfa
->nodes
[cur_node
].type
== type
3170 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3172 if (type
== OP_CLOSE_SUBEXP
)
3174 err
= re_node_set_insert (dst_nodes
, cur_node
);
3175 if (BE (err
== -1, 0))
3180 err
= re_node_set_insert (dst_nodes
, cur_node
);
3181 if (BE (err
== -1, 0))
3183 if (dfa
->edests
[cur_node
].nelem
== 0)
3185 if (dfa
->edests
[cur_node
].nelem
== 2)
3187 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3188 dfa
->edests
[cur_node
].elems
[1],
3190 if (BE (err
!= REG_NOERROR
, 0))
3193 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3199 /* For all the back references in the current state, calculate the
3200 destination of the back references by the appropriate entry
3201 in MCTX->BKREF_ENTS. */
3203 static reg_errcode_t
3205 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3206 int cur_str
, int subexp_num
, int type
)
3208 const re_dfa_t
*const dfa
= mctx
->dfa
;
3210 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3211 struct re_backref_cache_entry
*ent
;
3213 if (cache_idx_start
== -1)
3217 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3220 int to_idx
, next_node
;
3222 /* Is this entry ENT is appropriate? */
3223 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3226 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3227 /* Calculate the destination of the back reference, and append it
3228 to MCTX->STATE_LOG. */
3229 if (to_idx
== cur_str
)
3231 /* The backreference did epsilon transit, we must re-check all the
3232 node in the current state. */
3233 re_node_set new_dests
;
3234 reg_errcode_t err2
, err3
;
3235 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3236 if (re_node_set_contains (cur_nodes
, next_node
))
3238 err
= re_node_set_init_1 (&new_dests
, next_node
);
3239 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3240 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3241 re_node_set_free (&new_dests
);
3242 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3243 || err3
!= REG_NOERROR
, 0))
3245 err
= (err
!= REG_NOERROR
? err
3246 : (err2
!= REG_NOERROR
? err2
: err3
));
3249 /* TODO: It is still inefficient... */
3254 re_node_set union_set
;
3255 next_node
= dfa
->nexts
[ent
->node
];
3256 if (mctx
->state_log
[to_idx
])
3259 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3262 err
= re_node_set_init_copy (&union_set
,
3263 &mctx
->state_log
[to_idx
]->nodes
);
3264 ret
= re_node_set_insert (&union_set
, next_node
);
3265 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3267 re_node_set_free (&union_set
);
3268 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3274 err
= re_node_set_init_1 (&union_set
, next_node
);
3275 if (BE (err
!= REG_NOERROR
, 0))
3278 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3279 re_node_set_free (&union_set
);
3280 if (BE (mctx
->state_log
[to_idx
] == NULL
3281 && err
!= REG_NOERROR
, 0))
3285 while (ent
++->more
);
3289 /* Build transition table for the state.
3290 Return 1 if succeeded, otherwise return NULL. */
3294 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3297 int i
, j
, ch
, need_word_trtable
= 0;
3298 bitset_word_t elem
, mask
;
3299 bool dests_node_malloced
= false;
3300 bool dest_states_malloced
= false;
3301 int ndests
; /* Number of the destination states from `state'. */
3302 re_dfastate_t
**trtable
;
3303 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3304 re_node_set follows
, *dests_node
;
3306 bitset_t acceptable
;
3310 re_node_set dests_node
[SBC_MAX
];
3311 bitset_t dests_ch
[SBC_MAX
];
3314 /* We build DFA states which corresponds to the destination nodes
3315 from `state'. `dests_node[i]' represents the nodes which i-th
3316 destination state contains, and `dests_ch[i]' represents the
3317 characters which i-th destination state accepts. */
3318 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3319 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3322 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3323 if (BE (dests_alloc
== NULL
, 0))
3325 dests_node_malloced
= true;
3327 dests_node
= dests_alloc
->dests_node
;
3328 dests_ch
= dests_alloc
->dests_ch
;
3330 /* Initialize transiton table. */
3331 state
->word_trtable
= state
->trtable
= NULL
;
3333 /* At first, group all nodes belonging to `state' into several
3335 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3336 if (BE (ndests
<= 0, 0))
3338 if (dests_node_malloced
)
3340 /* Return 0 in case of an error, 1 otherwise. */
3343 state
->trtable
= (re_dfastate_t
**)
3344 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3350 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3351 if (BE (err
!= REG_NOERROR
, 0))
3354 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3355 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3356 dest_states
= (re_dfastate_t
**)
3357 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3360 dest_states
= (re_dfastate_t
**)
3361 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3362 if (BE (dest_states
== NULL
, 0))
3365 if (dest_states_malloced
)
3367 re_node_set_free (&follows
);
3368 for (i
= 0; i
< ndests
; ++i
)
3369 re_node_set_free (dests_node
+ i
);
3370 if (dests_node_malloced
)
3374 dest_states_malloced
= true;
3376 dest_states_word
= dest_states
+ ndests
;
3377 dest_states_nl
= dest_states_word
+ ndests
;
3378 bitset_empty (acceptable
);
3380 /* Then build the states for all destinations. */
3381 for (i
= 0; i
< ndests
; ++i
)
3384 re_node_set_empty (&follows
);
3385 /* Merge the follows of this destination states. */
3386 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3388 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3389 if (next_node
!= -1)
3391 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3392 if (BE (err
!= REG_NOERROR
, 0))
3396 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3397 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3399 /* If the new state has context constraint,
3400 build appropriate states for these contexts. */
3401 if (dest_states
[i
]->has_constraint
)
3403 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3405 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3408 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3409 need_word_trtable
= 1;
3411 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3413 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3418 dest_states_word
[i
] = dest_states
[i
];
3419 dest_states_nl
[i
] = dest_states
[i
];
3421 bitset_merge (acceptable
, dests_ch
[i
]);
3424 if (!BE (need_word_trtable
, 0))
3426 /* We don't care about whether the following character is a word
3427 character, or we are in a single-byte character set so we can
3428 discern by looking at the character code: allocate a
3429 256-entry transition table. */
3430 trtable
= state
->trtable
=
3431 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3432 if (BE (trtable
== NULL
, 0))
3435 /* For all characters ch...: */
3436 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3437 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3439 mask
<<= 1, elem
>>= 1, ++ch
)
3440 if (BE (elem
& 1, 0))
3442 /* There must be exactly one destination which accepts
3443 character ch. See group_nodes_into_DFAstates. */
3444 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3447 /* j-th destination accepts the word character ch. */
3448 if (dfa
->word_char
[i
] & mask
)
3449 trtable
[ch
] = dest_states_word
[j
];
3451 trtable
[ch
] = dest_states
[j
];
3456 /* We care about whether the following character is a word
3457 character, and we are in a multi-byte character set: discern
3458 by looking at the character code: build two 256-entry
3459 transition tables, one starting at trtable[0] and one
3460 starting at trtable[SBC_MAX]. */
3461 trtable
= state
->word_trtable
=
3462 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3463 if (BE (trtable
== NULL
, 0))
3466 /* For all characters ch...: */
3467 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3468 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3470 mask
<<= 1, elem
>>= 1, ++ch
)
3471 if (BE (elem
& 1, 0))
3473 /* There must be exactly one destination which accepts
3474 character ch. See group_nodes_into_DFAstates. */
3475 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3478 /* j-th destination accepts the word character ch. */
3479 trtable
[ch
] = dest_states
[j
];
3480 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3485 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3487 /* The current state accepts newline character. */
3488 for (j
= 0; j
< ndests
; ++j
)
3489 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3491 /* k-th destination accepts newline character. */
3492 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3493 if (need_word_trtable
)
3494 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3495 /* There must be only one destination which accepts
3496 newline. See group_nodes_into_DFAstates. */
3501 if (dest_states_malloced
)
3504 re_node_set_free (&follows
);
3505 for (i
= 0; i
< ndests
; ++i
)
3506 re_node_set_free (dests_node
+ i
);
3508 if (dests_node_malloced
)
3514 /* Group all nodes belonging to STATE into several destinations.
3515 Then for all destinations, set the nodes belonging to the destination
3516 to DESTS_NODE[i] and set the characters accepted by the destination
3517 to DEST_CH[i]. This function return the number of destinations. */
3521 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3522 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3527 int ndests
; /* Number of the destinations from `state'. */
3528 bitset_t accepts
; /* Characters a node can accept. */
3529 const re_node_set
*cur_nodes
= &state
->nodes
;
3530 bitset_empty (accepts
);
3533 /* For all the nodes belonging to `state', */
3534 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3536 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3537 re_token_type_t type
= node
->type
;
3538 unsigned int constraint
= node
->constraint
;
3540 /* Enumerate all single byte character this node can accept. */
3541 if (type
== CHARACTER
)
3542 bitset_set (accepts
, node
->opr
.c
);
3543 else if (type
== SIMPLE_BRACKET
)
3545 bitset_merge (accepts
, node
->opr
.sbcset
);
3547 else if (type
== OP_PERIOD
)
3549 #ifdef RE_ENABLE_I18N
3550 if (dfa
->mb_cur_max
> 1)
3551 bitset_merge (accepts
, dfa
->sb_char
);
3554 bitset_set_all (accepts
);
3555 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3556 bitset_clear (accepts
, '\n');
3557 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3558 bitset_clear (accepts
, '\0');
3560 #ifdef RE_ENABLE_I18N
3561 else if (type
== OP_UTF8_PERIOD
)
3563 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3564 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3565 bitset_clear (accepts
, '\n');
3566 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3567 bitset_clear (accepts
, '\0');
3573 /* Check the `accepts' and sift the characters which are not
3574 match it the context. */
3577 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3579 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3580 bitset_empty (accepts
);
3581 if (accepts_newline
)
3582 bitset_set (accepts
, NEWLINE_CHAR
);
3586 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3588 bitset_empty (accepts
);
3592 if (constraint
& NEXT_WORD_CONSTRAINT
)
3594 bitset_word_t any_set
= 0;
3595 if (type
== CHARACTER
&& !node
->word_char
)
3597 bitset_empty (accepts
);
3600 #ifdef RE_ENABLE_I18N
3601 if (dfa
->mb_cur_max
> 1)
3602 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3603 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3606 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3607 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3611 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3613 bitset_word_t any_set
= 0;
3614 if (type
== CHARACTER
&& node
->word_char
)
3616 bitset_empty (accepts
);
3619 #ifdef RE_ENABLE_I18N
3620 if (dfa
->mb_cur_max
> 1)
3621 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3622 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3625 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3626 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3632 /* Then divide `accepts' into DFA states, or create a new
3633 state. Above, we make sure that accepts is not empty. */
3634 for (j
= 0; j
< ndests
; ++j
)
3636 bitset_t intersec
; /* Intersection sets, see below. */
3638 /* Flags, see below. */
3639 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3641 /* Optimization, skip if this state doesn't accept the character. */
3642 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3645 /* Enumerate the intersection set of this state and `accepts'. */
3647 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3648 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3649 /* And skip if the intersection set is empty. */
3653 /* Then check if this state is a subset of `accepts'. */
3654 not_subset
= not_consumed
= 0;
3655 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3657 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3658 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3661 /* If this state isn't a subset of `accepts', create a
3662 new group state, which has the `remains'. */
3665 bitset_copy (dests_ch
[ndests
], remains
);
3666 bitset_copy (dests_ch
[j
], intersec
);
3667 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3668 if (BE (err
!= REG_NOERROR
, 0))
3673 /* Put the position in the current group. */
3674 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3675 if (BE (result
< 0, 0))
3678 /* If all characters are consumed, go to next node. */
3682 /* Some characters remain, create a new group. */
3685 bitset_copy (dests_ch
[ndests
], accepts
);
3686 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3687 if (BE (err
!= REG_NOERROR
, 0))
3690 bitset_empty (accepts
);
3695 for (j
= 0; j
< ndests
; ++j
)
3696 re_node_set_free (dests_node
+ j
);
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702 Return the number of the bytes the node accepts.
3703 STR_IDX is the current index of the input string.
3705 This function handles the nodes which can accept one character, or
3706 one collating element like '.', '[a-z]', opposite to the other nodes
3707 can only accept one byte. */
3711 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3712 const re_string_t
*input
, int str_idx
)
3714 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3715 int char_len
, elem_len
;
3718 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3720 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3721 if (BE (c
< 0xc2, 1))
3724 if (str_idx
+ 2 > input
->len
)
3727 d
= re_string_byte_at (input
, str_idx
+ 1);
3729 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3733 if (c
== 0xe0 && d
< 0xa0)
3739 if (c
== 0xf0 && d
< 0x90)
3745 if (c
== 0xf8 && d
< 0x88)
3751 if (c
== 0xfc && d
< 0x84)
3757 if (str_idx
+ char_len
> input
->len
)
3760 for (i
= 1; i
< char_len
; ++i
)
3762 d
= re_string_byte_at (input
, str_idx
+ i
);
3763 if (d
< 0x80 || d
> 0xbf)
3769 char_len
= re_string_char_size_at (input
, str_idx
);
3770 if (node
->type
== OP_PERIOD
)
3774 /* FIXME: I don't think this if is needed, as both '\n'
3775 and '\0' are char_len == 1. */
3776 /* '.' accepts any one character except the following two cases. */
3777 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3778 re_string_byte_at (input
, str_idx
) == '\n') ||
3779 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3780 re_string_byte_at (input
, str_idx
) == '\0'))
3785 elem_len
= re_string_elem_size_at (input
, str_idx
);
3786 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3789 if (node
->type
== COMPLEX_BRACKET
)
3791 const re_charset_t
*cset
= node
->opr
.mbcset
;
3793 const unsigned char *pin
3794 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3799 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3800 ? re_string_wchar_at (input
, str_idx
) : 0);
3802 /* match with multibyte character? */
3803 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3804 if (wc
== cset
->mbchars
[i
])
3806 match_len
= char_len
;
3807 goto check_node_accept_bytes_match
;
3809 /* match with character_class? */
3810 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3812 wctype_t wt
= cset
->char_classes
[i
];
3813 if (__iswctype (wc
, wt
))
3815 match_len
= char_len
;
3816 goto check_node_accept_bytes_match
;
3821 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3824 unsigned int in_collseq
= 0;
3825 const int32_t *table
, *indirect
;
3826 const unsigned char *weights
, *extra
;
3827 const char *collseqwc
;
3829 /* This #include defines a local function! */
3830 # include <locale/weight.h>
3832 /* match with collating_symbol? */
3833 if (cset
->ncoll_syms
)
3834 extra
= (const unsigned char *)
3835 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3836 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3838 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3839 /* Compare the length of input collating element and
3840 the length of current collating element. */
3841 if (*coll_sym
!= elem_len
)
3843 /* Compare each bytes. */
3844 for (j
= 0; j
< *coll_sym
; j
++)
3845 if (pin
[j
] != coll_sym
[1 + j
])
3849 /* Match if every bytes is equal. */
3851 goto check_node_accept_bytes_match
;
3857 if (elem_len
<= char_len
)
3859 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3860 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3863 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3865 /* match with range expression? */
3866 for (i
= 0; i
< cset
->nranges
; ++i
)
3867 if (cset
->range_starts
[i
] <= in_collseq
3868 && in_collseq
<= cset
->range_ends
[i
])
3870 match_len
= elem_len
;
3871 goto check_node_accept_bytes_match
;
3874 /* match with equivalence_class? */
3875 if (cset
->nequiv_classes
)
3877 const unsigned char *cp
= pin
;
3878 table
= (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3880 weights
= (const unsigned char *)
3881 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3882 extra
= (const unsigned char *)
3883 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3884 indirect
= (const int32_t *)
3885 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3886 idx
= findidx (&cp
);
3888 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3890 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3891 size_t weight_len
= weights
[idx
];
3892 if (weight_len
== weights
[equiv_class_idx
])
3895 while (cnt
<= weight_len
3896 && (weights
[equiv_class_idx
+ 1 + cnt
]
3897 == weights
[idx
+ 1 + cnt
]))
3899 if (cnt
> weight_len
)
3901 match_len
= elem_len
;
3902 goto check_node_accept_bytes_match
;
3911 /* match with range expression? */
3913 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3915 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3918 for (i
= 0; i
< cset
->nranges
; ++i
)
3920 cmp_buf
[0] = cset
->range_starts
[i
];
3921 cmp_buf
[4] = cset
->range_ends
[i
];
3922 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3923 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3925 match_len
= char_len
;
3926 goto check_node_accept_bytes_match
;
3930 check_node_accept_bytes_match
:
3931 if (!cset
->non_match
)
3938 return (elem_len
> char_len
) ? elem_len
: char_len
;
3947 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3949 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3954 /* No valid character. Match it as a single byte character. */
3955 const unsigned char *collseq
= (const unsigned char *)
3956 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3957 return collseq
[mbs
[0]];
3964 const unsigned char *extra
= (const unsigned char *)
3965 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3966 int32_t extrasize
= (const unsigned char *)
3967 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3969 for (idx
= 0; idx
< extrasize
;)
3971 int mbs_cnt
, found
= 0;
3972 int32_t elem_mbs_len
;
3973 /* Skip the name of collating element name. */
3974 idx
= idx
+ extra
[idx
] + 1;
3975 elem_mbs_len
= extra
[idx
++];
3976 if (mbs_len
== elem_mbs_len
)
3978 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3979 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3981 if (mbs_cnt
== elem_mbs_len
)
3982 /* Found the entry. */
3985 /* Skip the byte sequence of the collating element. */
3986 idx
+= elem_mbs_len
;
3987 /* Adjust for the alignment. */
3988 idx
= (idx
+ 3) & ~3;
3989 /* Skip the collation sequence value. */
3990 idx
+= sizeof (uint32_t);
3991 /* Skip the wide char sequence of the collating element. */
3992 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3993 /* If we found the entry, return the sequence value. */
3995 return *(uint32_t *) (extra
+ idx
);
3996 /* Skip the collation sequence value. */
3997 idx
+= sizeof (uint32_t);
4003 #endif /* RE_ENABLE_I18N */
4005 /* Check whether the node accepts the byte which is IDX-th
4006 byte of the INPUT. */
4010 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4014 ch
= re_string_byte_at (&mctx
->input
, idx
);
4018 if (node
->opr
.c
!= ch
)
4022 case SIMPLE_BRACKET
:
4023 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4027 #ifdef RE_ENABLE_I18N
4028 case OP_UTF8_PERIOD
:
4034 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4035 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4043 if (node
->constraint
)
4045 /* The node has constraints. Check whether the current context
4046 satisfies the constraints. */
4047 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4049 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4056 /* Extend the buffers, if the buffers have run out. */
4058 static reg_errcode_t
4060 extend_buffers (re_match_context_t
*mctx
)
4063 re_string_t
*pstr
= &mctx
->input
;
4065 /* Double the lengthes of the buffers. */
4066 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4067 if (BE (ret
!= REG_NOERROR
, 0))
4070 if (mctx
->state_log
!= NULL
)
4072 /* And double the length of state_log. */
4073 /* XXX We have no indication of the size of this buffer. If this
4074 allocation fail we have no indication that the state_log array
4075 does not have the right size. */
4076 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4077 pstr
->bufs_len
+ 1);
4078 if (BE (new_array
== NULL
, 0))
4080 mctx
->state_log
= new_array
;
4083 /* Then reconstruct the buffers. */
4086 #ifdef RE_ENABLE_I18N
4087 if (pstr
->mb_cur_max
> 1)
4089 ret
= build_wcs_upper_buffer (pstr
);
4090 if (BE (ret
!= REG_NOERROR
, 0))
4094 #endif /* RE_ENABLE_I18N */
4095 build_upper_buffer (pstr
);
4099 #ifdef RE_ENABLE_I18N
4100 if (pstr
->mb_cur_max
> 1)
4101 build_wcs_buffer (pstr
);
4103 #endif /* RE_ENABLE_I18N */
4105 if (pstr
->trans
!= NULL
)
4106 re_string_translate_buffer (pstr
);
4113 /* Functions for matching context. */
4115 /* Initialize MCTX. */
4117 static reg_errcode_t
4119 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4121 mctx
->eflags
= eflags
;
4122 mctx
->match_last
= -1;
4125 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4126 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4127 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4130 /* Already zero-ed by the caller.
4132 mctx->bkref_ents = NULL;
4133 mctx->nbkref_ents = 0;
4134 mctx->nsub_tops = 0; */
4135 mctx
->abkref_ents
= n
;
4136 mctx
->max_mb_elem_len
= 1;
4137 mctx
->asub_tops
= n
;
4141 /* Clean the entries which depend on the current input in MCTX.
4142 This function must be invoked when the matcher changes the start index
4143 of the input, or changes the input string. */
4147 match_ctx_clean (re_match_context_t
*mctx
)
4150 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4153 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4154 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4156 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4157 re_free (last
->path
.array
);
4160 re_free (top
->lasts
);
4163 re_free (top
->path
->array
);
4164 re_free (top
->path
);
4169 mctx
->nsub_tops
= 0;
4170 mctx
->nbkref_ents
= 0;
4173 /* Free all the memory associated with MCTX. */
4177 match_ctx_free (re_match_context_t
*mctx
)
4179 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4180 match_ctx_clean (mctx
);
4181 re_free (mctx
->sub_tops
);
4182 re_free (mctx
->bkref_ents
);
4185 /* Add a new backreference entry to MCTX.
4186 Note that we assume that caller never call this function with duplicate
4187 entry, and call with STR_IDX which isn't smaller than any existing entry.
4190 static reg_errcode_t
4192 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4195 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4197 struct re_backref_cache_entry
* new_entry
;
4198 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4199 mctx
->abkref_ents
* 2);
4200 if (BE (new_entry
== NULL
, 0))
4202 re_free (mctx
->bkref_ents
);
4205 mctx
->bkref_ents
= new_entry
;
4206 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4207 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4208 mctx
->abkref_ents
*= 2;
4210 if (mctx
->nbkref_ents
> 0
4211 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4212 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4214 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4215 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4216 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4217 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4219 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4220 If bit N is clear, means that this entry won't epsilon-transition to
4221 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4222 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4225 A backreference does not epsilon-transition unless it is empty, so set
4226 to all zeros if FROM != TO. */
4227 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4228 = (from
== to
? ~0 : 0);
4230 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4231 if (mctx
->max_mb_elem_len
< to
- from
)
4232 mctx
->max_mb_elem_len
= to
- from
;
4236 /* Search for the first entry which has the same str_idx, or -1 if none is
4237 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4241 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4243 int left
, right
, mid
, last
;
4244 last
= right
= mctx
->nbkref_ents
;
4245 for (left
= 0; left
< right
;)
4247 mid
= (left
+ right
) / 2;
4248 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4253 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4259 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4262 static reg_errcode_t
4264 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4267 assert (mctx
->sub_tops
!= NULL
);
4268 assert (mctx
->asub_tops
> 0);
4270 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4272 int new_asub_tops
= mctx
->asub_tops
* 2;
4273 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4274 re_sub_match_top_t
*,
4276 if (BE (new_array
== NULL
, 0))
4278 mctx
->sub_tops
= new_array
;
4279 mctx
->asub_tops
= new_asub_tops
;
4281 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4282 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4284 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4285 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4289 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4290 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4292 static re_sub_match_last_t
*
4294 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4296 re_sub_match_last_t
*new_entry
;
4297 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4299 int new_alasts
= 2 * subtop
->alasts
+ 1;
4300 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4301 re_sub_match_last_t
*,
4303 if (BE (new_array
== NULL
, 0))
4305 subtop
->lasts
= new_array
;
4306 subtop
->alasts
= new_alasts
;
4308 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4309 if (BE (new_entry
!= NULL
, 1))
4311 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4312 new_entry
->node
= node
;
4313 new_entry
->str_idx
= str_idx
;
4321 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4322 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4324 sctx
->sifted_states
= sifted_sts
;
4325 sctx
->limited_states
= limited_sts
;
4326 sctx
->last_node
= last_node
;
4327 sctx
->last_str_idx
= last_str_idx
;
4328 re_node_set_init_empty (&sctx
->limits
);