1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
57 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
58 int *p_match_first
) internal_function
;
59 static int check_halt_state_context (const re_match_context_t
*mctx
,
60 const re_dfastate_t
*state
, int idx
)
62 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
63 regmatch_t
*prev_idx_match
, int cur_node
,
64 int cur_idx
, int nmatch
) internal_function
;
65 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
66 int str_idx
, int dest_node
, int nregs
,
68 re_node_set
*eps_via_nodes
)
70 static reg_errcode_t
set_regs (const regex_t
*preg
,
71 const re_match_context_t
*mctx
,
72 size_t nmatch
, regmatch_t
*pmatch
,
73 int fl_backtrack
) internal_function
;
74 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
78 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
79 re_sift_context_t
*sctx
,
80 int node_idx
, int str_idx
, int max_str_idx
)
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
84 re_sift_context_t
*sctx
)
86 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
87 re_sift_context_t
*sctx
, int str_idx
,
88 re_node_set
*cur_dest
)
90 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
,
93 re_node_set
*dest_nodes
)
95 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
96 re_node_set
*dest_nodes
,
97 const re_node_set
*candidates
)
99 static int check_dst_limits (const re_match_context_t
*mctx
,
101 int dst_node
, int dst_idx
, int src_node
,
102 int src_idx
) internal_function
;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
104 int boundaries
, int subexp_idx
,
105 int from_node
, int bkref_idx
)
107 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
108 int limit
, int subexp_idx
,
109 int node
, int str_idx
,
110 int bkref_idx
) internal_function
;
111 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
112 re_node_set
*dest_nodes
,
113 const re_node_set
*candidates
,
115 struct re_backref_cache_entry
*bkref_ents
,
116 int str_idx
) internal_function
;
117 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
118 re_sift_context_t
*sctx
,
119 int str_idx
, const re_node_set
*candidates
)
121 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
123 re_dfastate_t
**src
, int num
)
125 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
126 re_match_context_t
*mctx
) internal_function
;
127 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
128 re_match_context_t
*mctx
,
129 re_dfastate_t
*state
) internal_function
;
130 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
,
132 re_dfastate_t
*next_state
)
134 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
135 re_node_set
*cur_nodes
,
136 int str_idx
) internal_function
;
138 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
139 re_match_context_t
*mctx
,
140 re_dfastate_t
*pstate
)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
145 re_dfastate_t
*pstate
)
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
149 const re_node_set
*nodes
)
151 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
152 int bkref_node
, int bkref_str_idx
)
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
)
159 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
160 int subexp_idx
, int type
) internal_function
;
161 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
162 state_array_t
*path
, int top_node
,
163 int top_str
, int last_node
, int last_str
,
164 int type
) internal_function
;
165 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
167 re_node_set
*cur_nodes
,
168 re_node_set
*next_nodes
)
170 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
171 re_node_set
*cur_nodes
,
172 int ex_subexp
, int type
)
174 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
175 re_node_set
*dst_nodes
,
176 int target
, int ex_subexp
,
177 int type
) internal_function
;
178 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
179 re_node_set
*cur_nodes
, int cur_str
,
180 int subexp_num
, int type
)
182 static int build_trtable (const re_dfa_t
*dfa
,
183 re_dfastate_t
*state
) internal_function
;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
186 const re_string_t
*input
, int idx
)
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
195 const re_dfastate_t
*state
,
196 re_node_set
*states_node
,
197 bitset_t
*states_ch
) internal_function
;
198 static int check_node_accept (const re_match_context_t
*mctx
,
199 const re_token_t
*node
, int idx
)
201 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
222 const regex_t
*__restrict preg
;
223 const char *__restrict string
;
230 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
232 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
235 if (eflags
& REG_STARTEND
)
237 start
= pmatch
[0].rm_so
;
238 length
= pmatch
[0].rm_eo
;
243 length
= strlen (string
);
246 __libc_lock_lock (dfa
->lock
);
248 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
249 length
, 0, NULL
, eflags
);
251 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
252 length
, nmatch
, pmatch
, eflags
);
253 __libc_lock_unlock (dfa
->lock
);
254 return err
!= REG_NOERROR
;
258 # include <shlib-compat.h>
259 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec
) __compat_regexec
;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t
*__restrict preg
,
267 const char *__restrict string
, size_t nmatch
,
268 regmatch_t pmatch
[], int eflags
)
270 return regexec (preg
, string
, nmatch
, pmatch
,
271 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
273 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp
, string
, length
, start
, regs
)
308 struct re_pattern_buffer
*bufp
;
311 struct re_registers
*regs
;
313 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
316 weak_alias (__re_match
, re_match
)
320 re_search (bufp
, string
, length
, start
, range
, regs
)
321 struct re_pattern_buffer
*bufp
;
323 int length
, start
, range
;
324 struct re_registers
*regs
;
326 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
329 weak_alias (__re_search
, re_search
)
333 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
334 struct re_pattern_buffer
*bufp
;
335 const char *string1
, *string2
;
336 int length1
, length2
, start
, stop
;
337 struct re_registers
*regs
;
339 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
340 start
, 0, regs
, stop
, 1);
343 weak_alias (__re_match_2
, re_match_2
)
347 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
348 struct re_pattern_buffer
*bufp
;
349 const char *string1
, *string2
;
350 int length1
, length2
, start
, range
, stop
;
351 struct re_registers
*regs
;
353 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
354 start
, range
, regs
, stop
, 0);
357 weak_alias (__re_search_2
, re_search_2
)
361 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
363 struct re_pattern_buffer
*bufp
;
364 const char *string1
, *string2
;
365 int length1
, length2
, start
, range
, stop
, ret_len
;
366 struct re_registers
*regs
;
370 int len
= length1
+ length2
;
373 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
376 /* Concatenate the strings. */
380 char *s
= re_malloc (char, len
);
382 if (BE (s
== NULL
, 0))
385 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
387 memcpy (s
, string1
, length1
);
388 memcpy (s
+ length1
, string2
, length2
);
398 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
401 re_free ((char *) str
);
405 /* The parameters have the same meaning as those of re_search.
406 Additional parameters:
407 If RET_LEN is nonzero the length of the match is returned (re_match style);
408 otherwise the position of the match is returned. */
411 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
412 struct re_pattern_buffer
*bufp
;
414 int length
, start
, range
, stop
, ret_len
;
415 struct re_registers
*regs
;
417 reg_errcode_t result
;
421 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
423 /* Check for out-of-range. */
424 if (BE (start
< 0 || start
> length
, 0))
426 if (BE (start
+ range
> length
, 0))
427 range
= length
- start
;
428 else if (BE (start
+ range
< 0, 0))
431 __libc_lock_lock (dfa
->lock
);
433 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
434 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
436 /* Compile fastmap if we haven't yet. */
437 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
438 re_compile_fastmap (bufp
);
440 if (BE (bufp
->no_sub
, 0))
443 /* We need at least 1 register. */
446 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
447 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
449 nregs
= regs
->num_regs
;
450 if (BE (nregs
< 1, 0))
452 /* Nothing can be copied to regs. */
458 nregs
= bufp
->re_nsub
+ 1;
459 pmatch
= re_malloc (regmatch_t
, nregs
);
460 if (BE (pmatch
== NULL
, 0))
466 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
467 nregs
, pmatch
, eflags
);
471 /* I hope we needn't fill ther regs with -1's when no match was found. */
472 if (result
!= REG_NOERROR
)
474 else if (regs
!= NULL
)
476 /* If caller wants register contents data back, copy them. */
477 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
478 bufp
->regs_allocated
);
479 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
483 if (BE (rval
== 0, 1))
487 assert (pmatch
[0].rm_so
== start
);
488 rval
= pmatch
[0].rm_eo
- start
;
491 rval
= pmatch
[0].rm_so
;
495 __libc_lock_unlock (dfa
->lock
);
500 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
501 struct re_registers
*regs
;
503 int nregs
, regs_allocated
;
505 int rval
= REGS_REALLOCATE
;
507 int need_regs
= nregs
+ 1;
508 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
511 /* Have the register data arrays been allocated? */
512 if (regs_allocated
== REGS_UNALLOCATED
)
513 { /* No. So allocate them with malloc. */
514 regs
->start
= re_malloc (regoff_t
, need_regs
);
515 regs
->end
= re_malloc (regoff_t
, need_regs
);
516 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
517 return REGS_UNALLOCATED
;
518 regs
->num_regs
= need_regs
;
520 else if (regs_allocated
== REGS_REALLOCATE
)
521 { /* Yes. If we need more elements than were already
522 allocated, reallocate them. If we need fewer, just
524 if (BE (need_regs
> regs
->num_regs
, 0))
526 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
527 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
528 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
529 return REGS_UNALLOCATED
;
530 regs
->start
= new_start
;
532 regs
->num_regs
= need_regs
;
537 assert (regs_allocated
== REGS_FIXED
);
538 /* This function may not be called with REGS_FIXED and nregs too big. */
539 assert (regs
->num_regs
>= nregs
);
544 for (i
= 0; i
< nregs
; ++i
)
546 regs
->start
[i
] = pmatch
[i
].rm_so
;
547 regs
->end
[i
] = pmatch
[i
].rm_eo
;
549 for ( ; i
< regs
->num_regs
; ++i
)
550 regs
->start
[i
] = regs
->end
[i
] = -1;
555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
556 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
557 this memory for recording register information. STARTS and ENDS
558 must be allocated using the malloc library routine, and must each
559 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561 If NUM_REGS == 0, then subsequent matches should allocate their own
564 Unless this function is called, the first search or match using
565 PATTERN_BUFFER will allocate its own register data, without
566 freeing the old data. */
569 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
570 struct re_pattern_buffer
*bufp
;
571 struct re_registers
*regs
;
573 regoff_t
*starts
, *ends
;
577 bufp
->regs_allocated
= REGS_REALLOCATE
;
578 regs
->num_regs
= num_regs
;
579 regs
->start
= starts
;
584 bufp
->regs_allocated
= REGS_UNALLOCATED
;
586 regs
->start
= regs
->end
= (regoff_t
*) 0;
590 weak_alias (__re_set_registers
, re_set_registers
)
593 /* Entry points compatible with 4.2 BSD regex library. We don't define
594 them unless specifically requested. */
596 #if defined _REGEX_RE_COMP || defined _LIBC
604 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
606 #endif /* _REGEX_RE_COMP */
608 /* Internal entry point. */
610 /* Searches for a compiled pattern PREG in the string STRING, whose
611 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
612 mingings with regexec. START, and RANGE have the same meanings
614 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
615 otherwise return the error code.
616 Note: We assume front end functions already check ranges.
617 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
620 __attribute_warn_unused_result__
621 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
625 int length
, start
, range
, stop
, eflags
;
630 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
631 int left_lim
, right_lim
, incr
;
632 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
635 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
636 re_match_context_t mctx
= { .dfa
= dfa
};
638 re_match_context_t mctx
;
640 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
641 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
642 RE_TRANSLATE_TYPE t
= preg
->translate
;
644 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
645 memset (&mctx
, '\0', sizeof (re_match_context_t
));
649 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
650 nmatch
-= extra_nmatch
;
652 /* Check if the DFA haven't been compiled. */
653 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
654 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
655 || dfa
->init_state_begbuf
== NULL
, 0))
659 /* We assume front-end functions already check them. */
660 assert (start
+ range
>= 0 && start
+ range
<= length
);
663 /* If initial states with non-begbuf contexts have no elements,
664 the regex must be anchored. If preg->newline_anchor is set,
665 we'll never use init_state_nl, so do not check it. */
666 if (dfa
->init_state
->nodes
.nelem
== 0
667 && dfa
->init_state_word
->nodes
.nelem
== 0
668 && (dfa
->init_state_nl
->nodes
.nelem
== 0
669 || !preg
->newline_anchor
))
671 if (start
!= 0 && start
+ range
!= 0)
676 /* We must check the longest matching, if nmatch > 0. */
677 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
679 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
680 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
681 if (BE (err
!= REG_NOERROR
, 0))
683 mctx
.input
.stop
= stop
;
684 mctx
.input
.raw_stop
= stop
;
685 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
687 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
688 if (BE (err
!= REG_NOERROR
, 0))
691 /* We will log all the DFA states through which the dfa pass,
692 if nmatch > 1, or this dfa has "multibyte node", which is a
693 back-reference or a node which can accept multibyte character or
694 multi character collating element. */
695 if (nmatch
> 1 || dfa
->has_mb_node
)
697 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
698 if (BE (mctx
.state_log
== NULL
, 0))
705 mctx
.state_log
= NULL
;
708 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
709 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
711 /* Check incrementally whether of not the input string match. */
712 incr
= (range
< 0) ? -1 : 1;
713 left_lim
= (range
< 0) ? start
+ range
: start
;
714 right_lim
= (range
< 0) ? start
: start
+ range
;
715 sb
= dfa
->mb_cur_max
== 1;
718 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
719 | (range
>= 0 ? 2 : 0)
720 | (t
!= NULL
? 1 : 0))
723 for (;; match_first
+= incr
)
726 if (match_first
< left_lim
|| right_lim
< match_first
)
729 /* Advance as rapidly as possible through the string, until we
730 find a plausible place to start matching. This may be done
731 with varying efficiency, so there are various possibilities:
732 only the most common of them are specialized, in order to
733 save on code size. We use a switch statement for speed. */
741 /* Fastmap with single-byte translation, match forward. */
742 while (BE (match_first
< right_lim
, 1)
743 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
745 goto forward_match_found_start_or_reached_end
;
748 /* Fastmap without translation, match forward. */
749 while (BE (match_first
< right_lim
, 1)
750 && !fastmap
[(unsigned char) string
[match_first
]])
753 forward_match_found_start_or_reached_end
:
754 if (BE (match_first
== right_lim
, 0))
756 ch
= match_first
>= length
757 ? 0 : (unsigned char) string
[match_first
];
758 if (!fastmap
[t
? t
[ch
] : ch
])
765 /* Fastmap without multi-byte translation, match backwards. */
766 while (match_first
>= left_lim
)
768 ch
= match_first
>= length
769 ? 0 : (unsigned char) string
[match_first
];
770 if (fastmap
[t
? t
[ch
] : ch
])
774 if (match_first
< left_lim
)
779 /* In this case, we can't determine easily the current byte,
780 since it might be a component byte of a multibyte
781 character. Then we use the constructed buffer instead. */
784 /* If MATCH_FIRST is out of the valid range, reconstruct the
786 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
787 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
789 err
= re_string_reconstruct (&mctx
.input
, match_first
,
791 if (BE (err
!= REG_NOERROR
, 0))
794 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 Note that MATCH_FIRST must not be smaller than 0. */
798 ch
= (match_first
>= length
799 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
803 if (match_first
< left_lim
|| match_first
> right_lim
)
812 /* Reconstruct the buffers so that the matcher can assume that
813 the matching starts from the beginning of the buffer. */
814 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
815 if (BE (err
!= REG_NOERROR
, 0))
818 #ifdef RE_ENABLE_I18N
819 /* Don't consider this char as a possible match start if it part,
820 yet isn't the head, of a multibyte character. */
821 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
825 /* It seems to be appropriate one, then use the matcher. */
826 /* We assume that the matching starts from 0. */
827 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
828 match_last
= check_matching (&mctx
, fl_longest_match
,
829 range
>= 0 ? &match_first
: NULL
);
830 if (match_last
!= -1)
832 if (BE (match_last
== -2, 0))
839 mctx
.match_last
= match_last
;
840 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
842 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
843 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
846 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
849 err
= prune_impossible_nodes (&mctx
);
850 if (err
== REG_NOERROR
)
852 if (BE (err
!= REG_NOMATCH
, 0))
857 break; /* We found a match. */
861 match_ctx_clean (&mctx
);
865 assert (match_last
!= -1);
866 assert (err
== REG_NOERROR
);
869 /* Set pmatch[] if we need. */
874 /* Initialize registers. */
875 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
876 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
878 /* Set the points where matching start/end. */
880 pmatch
[0].rm_eo
= mctx
.match_last
;
882 if (!preg
->no_sub
&& nmatch
> 1)
884 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
885 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
886 if (BE (err
!= REG_NOERROR
, 0))
890 /* At last, add the offset to the each registers, since we slided
891 the buffers so that we could assume that the matching starts
893 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
894 if (pmatch
[reg_idx
].rm_so
!= -1)
896 #ifdef RE_ENABLE_I18N
897 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
899 pmatch
[reg_idx
].rm_so
=
900 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
901 ? mctx
.input
.valid_raw_len
902 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
903 pmatch
[reg_idx
].rm_eo
=
904 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
905 ? mctx
.input
.valid_raw_len
906 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
909 assert (mctx
.input
.offsets_needed
== 0);
911 pmatch
[reg_idx
].rm_so
+= match_first
;
912 pmatch
[reg_idx
].rm_eo
+= match_first
;
914 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
916 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
917 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
921 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
922 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
924 pmatch
[reg_idx
+ 1].rm_so
925 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
926 pmatch
[reg_idx
+ 1].rm_eo
927 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
932 re_free (mctx
.state_log
);
934 match_ctx_free (&mctx
);
935 re_string_destruct (&mctx
.input
);
940 __attribute_warn_unused_result__
941 prune_impossible_nodes (mctx
)
942 re_match_context_t
*mctx
;
944 const re_dfa_t
*const dfa
= mctx
->dfa
;
945 int halt_node
, match_last
;
947 re_dfastate_t
**sifted_states
;
948 re_dfastate_t
**lim_states
= NULL
;
949 re_sift_context_t sctx
;
951 assert (mctx
->state_log
!= NULL
);
953 match_last
= mctx
->match_last
;
954 halt_node
= mctx
->last_node
;
955 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
956 if (BE (sifted_states
== NULL
, 0))
963 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
964 if (BE (lim_states
== NULL
, 0))
971 memset (lim_states
, '\0',
972 sizeof (re_dfastate_t
*) * (match_last
+ 1));
973 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
975 ret
= sift_states_backward (mctx
, &sctx
);
976 re_node_set_free (&sctx
.limits
);
977 if (BE (ret
!= REG_NOERROR
, 0))
979 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
989 } while (mctx
->state_log
[match_last
] == NULL
990 || !mctx
->state_log
[match_last
]->halt
);
991 halt_node
= check_halt_state_context (mctx
,
992 mctx
->state_log
[match_last
],
995 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
997 re_free (lim_states
);
999 if (BE (ret
!= REG_NOERROR
, 0))
1004 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1005 ret
= sift_states_backward (mctx
, &sctx
);
1006 re_node_set_free (&sctx
.limits
);
1007 if (BE (ret
!= REG_NOERROR
, 0))
1009 if (sifted_states
[0] == NULL
)
1015 re_free (mctx
->state_log
);
1016 mctx
->state_log
= sifted_states
;
1017 sifted_states
= NULL
;
1018 mctx
->last_node
= halt_node
;
1019 mctx
->match_last
= match_last
;
1022 re_free (sifted_states
);
1023 re_free (lim_states
);
1027 /* Acquire an initial state and return it.
1028 We must select appropriate initial state depending on the context,
1029 since initial states may have constraints like "\<", "^", etc.. */
1031 static inline re_dfastate_t
*
1032 __attribute ((always_inline
)) internal_function
1033 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1036 const re_dfa_t
*const dfa
= mctx
->dfa
;
1037 if (dfa
->init_state
->has_constraint
)
1039 unsigned int context
;
1040 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1041 if (IS_WORD_CONTEXT (context
))
1042 return dfa
->init_state_word
;
1043 else if (IS_ORDINARY_CONTEXT (context
))
1044 return dfa
->init_state
;
1045 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1046 return dfa
->init_state_begbuf
;
1047 else if (IS_NEWLINE_CONTEXT (context
))
1048 return dfa
->init_state_nl
;
1049 else if (IS_BEGBUF_CONTEXT (context
))
1051 /* It is relatively rare case, then calculate on demand. */
1052 return re_acquire_state_context (err
, dfa
,
1053 dfa
->init_state
->entrance_nodes
,
1057 /* Must not happen? */
1058 return dfa
->init_state
;
1061 return dfa
->init_state
;
1064 /* Check whether the regular expression match input string INPUT or not,
1065 and return the index where the matching end, return -1 if not match,
1066 or return -2 in case of an error.
1067 FL_LONGEST_MATCH means we want the POSIX longest matching.
1068 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1069 next place where we may want to try matching.
1070 Note that the matcher assume that the maching starts from the current
1071 index of the buffer. */
1074 internal_function __attribute_warn_unused_result__
1075 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1078 const re_dfa_t
*const dfa
= mctx
->dfa
;
1081 int match_last
= -1;
1082 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1083 re_dfastate_t
*cur_state
;
1084 int at_init_state
= p_match_first
!= NULL
;
1085 int next_start_idx
= cur_str_idx
;
1088 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1089 /* An initial state must not be NULL (invalid). */
1090 if (BE (cur_state
== NULL
, 0))
1092 assert (err
== REG_ESPACE
);
1096 if (mctx
->state_log
!= NULL
)
1098 mctx
->state_log
[cur_str_idx
] = cur_state
;
1100 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1101 later. E.g. Processing back references. */
1102 if (BE (dfa
->nbackref
, 0))
1105 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1106 if (BE (err
!= REG_NOERROR
, 0))
1109 if (cur_state
->has_backref
)
1111 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1112 if (BE (err
!= REG_NOERROR
, 0))
1118 /* If the RE accepts NULL string. */
1119 if (BE (cur_state
->halt
, 0))
1121 if (!cur_state
->has_constraint
1122 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1124 if (!fl_longest_match
)
1128 match_last
= cur_str_idx
;
1134 while (!re_string_eoi (&mctx
->input
))
1136 re_dfastate_t
*old_state
= cur_state
;
1137 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1139 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1140 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1141 && mctx
->input
.valid_len
< mctx
->input
.len
))
1143 err
= extend_buffers (mctx
);
1144 if (BE (err
!= REG_NOERROR
, 0))
1146 assert (err
== REG_ESPACE
);
1151 cur_state
= transit_state (&err
, mctx
, cur_state
);
1152 if (mctx
->state_log
!= NULL
)
1153 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1155 if (cur_state
== NULL
)
1157 /* Reached the invalid state or an error. Try to recover a valid
1158 state using the state log, if available and if we have not
1159 already found a valid (even if not the longest) match. */
1160 if (BE (err
!= REG_NOERROR
, 0))
1163 if (mctx
->state_log
== NULL
1164 || (match
&& !fl_longest_match
)
1165 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1169 if (BE (at_init_state
, 0))
1171 if (old_state
== cur_state
)
1172 next_start_idx
= next_char_idx
;
1177 if (cur_state
->halt
)
1179 /* Reached a halt state.
1180 Check the halt state can satisfy the current context. */
1181 if (!cur_state
->has_constraint
1182 || check_halt_state_context (mctx
, cur_state
,
1183 re_string_cur_idx (&mctx
->input
)))
1185 /* We found an appropriate halt state. */
1186 match_last
= re_string_cur_idx (&mctx
->input
);
1189 /* We found a match, do not modify match_first below. */
1190 p_match_first
= NULL
;
1191 if (!fl_longest_match
)
1198 *p_match_first
+= next_start_idx
;
1203 /* Check NODE match the current context. */
1207 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1209 re_token_type_t type
= dfa
->nodes
[node
].type
;
1210 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1211 if (type
!= END_OF_RE
)
1215 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1220 /* Check the halt state STATE match the current context.
1221 Return 0 if not match, if the node, STATE has, is a halt node and
1222 match the context, return the node. */
1226 check_halt_state_context (const re_match_context_t
*mctx
,
1227 const re_dfastate_t
*state
, int idx
)
1230 unsigned int context
;
1232 assert (state
->halt
);
1234 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1235 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1236 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1237 return state
->nodes
.elems
[i
];
1241 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1242 corresponding to the DFA).
1243 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1248 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1249 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1250 struct re_fail_stack_t
*fs
)
1252 const re_dfa_t
*const dfa
= mctx
->dfa
;
1254 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1256 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1257 re_node_set
*edests
= &dfa
->edests
[node
];
1259 err
= re_node_set_insert (eps_via_nodes
, node
);
1260 if (BE (err
< 0, 0))
1262 /* Pick up a valid destination, or return -1 if none is found. */
1263 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1265 int candidate
= edests
->elems
[i
];
1266 if (!re_node_set_contains (cur_nodes
, candidate
))
1268 if (dest_node
== -1)
1269 dest_node
= candidate
;
1273 /* In order to avoid infinite loop like "(a*)*", return the second
1274 epsilon-transition if the first was already considered. */
1275 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1278 /* Otherwise, push the second epsilon-transition on the fail stack. */
1280 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1284 /* We know we are going to exit. */
1293 re_token_type_t type
= dfa
->nodes
[node
].type
;
1295 #ifdef RE_ENABLE_I18N
1296 if (dfa
->nodes
[node
].accept_mb
)
1297 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1299 #endif /* RE_ENABLE_I18N */
1300 if (type
== OP_BACK_REF
)
1302 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1303 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1306 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1310 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1311 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1320 err
= re_node_set_insert (eps_via_nodes
, node
);
1321 if (BE (err
< 0, 0))
1323 dest_node
= dfa
->edests
[node
].elems
[0];
1324 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1331 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1333 int dest_node
= dfa
->nexts
[node
];
1334 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1335 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1336 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1339 re_node_set_empty (eps_via_nodes
);
1346 static reg_errcode_t
1347 internal_function __attribute_warn_unused_result__
1348 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1349 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1352 int num
= fs
->num
++;
1353 if (fs
->num
== fs
->alloc
)
1355 struct re_fail_stack_ent_t
*new_array
;
1356 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1358 if (new_array
== NULL
)
1361 fs
->stack
= new_array
;
1363 fs
->stack
[num
].idx
= str_idx
;
1364 fs
->stack
[num
].node
= dest_node
;
1365 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1366 if (fs
->stack
[num
].regs
== NULL
)
1368 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1369 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1375 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1376 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1378 int num
= --fs
->num
;
1380 *pidx
= fs
->stack
[num
].idx
;
1381 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1382 re_node_set_free (eps_via_nodes
);
1383 re_free (fs
->stack
[num
].regs
);
1384 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1385 return fs
->stack
[num
].node
;
1388 /* Set the positions where the subexpressions are starts/ends to registers
1390 Note: We assume that pmatch[0] is already set, and
1391 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1393 static reg_errcode_t
1394 internal_function __attribute_warn_unused_result__
1395 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1396 regmatch_t
*pmatch
, int fl_backtrack
)
1398 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1400 re_node_set eps_via_nodes
;
1401 struct re_fail_stack_t
*fs
;
1402 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1403 regmatch_t
*prev_idx_match
;
1404 int prev_idx_match_malloced
= 0;
1407 assert (nmatch
> 1);
1408 assert (mctx
->state_log
!= NULL
);
1413 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1414 if (fs
->stack
== NULL
)
1420 cur_node
= dfa
->init_node
;
1421 re_node_set_init_empty (&eps_via_nodes
);
1423 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1424 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1427 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1428 if (prev_idx_match
== NULL
)
1430 free_fail_stack_return (fs
);
1433 prev_idx_match_malloced
= 1;
1435 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1437 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1439 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1441 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1446 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1447 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1449 if (reg_idx
== nmatch
)
1451 re_node_set_free (&eps_via_nodes
);
1452 if (prev_idx_match_malloced
)
1453 re_free (prev_idx_match
);
1454 return free_fail_stack_return (fs
);
1456 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1461 re_node_set_free (&eps_via_nodes
);
1462 if (prev_idx_match_malloced
)
1463 re_free (prev_idx_match
);
1468 /* Proceed to next node. */
1469 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1470 &eps_via_nodes
, fs
);
1472 if (BE (cur_node
< 0, 0))
1474 if (BE (cur_node
== -2, 0))
1476 re_node_set_free (&eps_via_nodes
);
1477 if (prev_idx_match_malloced
)
1478 re_free (prev_idx_match
);
1479 free_fail_stack_return (fs
);
1483 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1487 re_node_set_free (&eps_via_nodes
);
1488 if (prev_idx_match_malloced
)
1489 re_free (prev_idx_match
);
1494 re_node_set_free (&eps_via_nodes
);
1495 if (prev_idx_match_malloced
)
1496 re_free (prev_idx_match
);
1497 return free_fail_stack_return (fs
);
1500 static reg_errcode_t
1502 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1507 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1509 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1510 re_free (fs
->stack
[fs_idx
].regs
);
1512 re_free (fs
->stack
);
1519 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1520 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1522 int type
= dfa
->nodes
[cur_node
].type
;
1523 if (type
== OP_OPEN_SUBEXP
)
1525 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1527 /* We are at the first node of this sub expression. */
1528 if (reg_num
< nmatch
)
1530 pmatch
[reg_num
].rm_so
= cur_idx
;
1531 pmatch
[reg_num
].rm_eo
= -1;
1534 else if (type
== OP_CLOSE_SUBEXP
)
1536 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1537 if (reg_num
< nmatch
)
1539 /* We are at the last node of this sub expression. */
1540 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1542 pmatch
[reg_num
].rm_eo
= cur_idx
;
1543 /* This is a non-empty match or we are not inside an optional
1544 subexpression. Accept this right away. */
1545 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1549 if (dfa
->nodes
[cur_node
].opt_subexp
1550 && prev_idx_match
[reg_num
].rm_so
!= -1)
1551 /* We transited through an empty match for an optional
1552 subexpression, like (a?)*, and this is not the subexp's
1553 first match. Copy back the old content of the registers
1554 so that matches of an inner subexpression are undone as
1555 well, like in ((a?))*. */
1556 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1558 /* We completed a subexpression, but it may be part of
1559 an optional one, so do not update PREV_IDX_MATCH. */
1560 pmatch
[reg_num
].rm_eo
= cur_idx
;
1566 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1567 and sift the nodes in each states according to the following rules.
1568 Updated state_log will be wrote to STATE_LOG.
1570 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1571 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1572 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1573 the LAST_NODE, we throw away the node `a'.
1574 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1575 string `s' and transit to `b':
1576 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1578 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1579 thrown away, we throw away the node `a'.
1580 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1581 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1583 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1584 we throw away the node `a'. */
1586 #define STATE_NODE_CONTAINS(state,node) \
1587 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1589 static reg_errcode_t
1591 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1595 int str_idx
= sctx
->last_str_idx
;
1596 re_node_set cur_dest
;
1599 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1602 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1603 transit to the last_node and the last_node itself. */
1604 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1605 if (BE (err
!= REG_NOERROR
, 0))
1607 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1608 if (BE (err
!= REG_NOERROR
, 0))
1611 /* Then check each states in the state_log. */
1614 /* Update counters. */
1615 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1616 if (null_cnt
> mctx
->max_mb_elem_len
)
1618 memset (sctx
->sifted_states
, '\0',
1619 sizeof (re_dfastate_t
*) * str_idx
);
1620 re_node_set_free (&cur_dest
);
1623 re_node_set_empty (&cur_dest
);
1626 if (mctx
->state_log
[str_idx
])
1628 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1629 if (BE (err
!= REG_NOERROR
, 0))
1633 /* Add all the nodes which satisfy the following conditions:
1634 - It can epsilon transit to a node in CUR_DEST.
1636 And update state_log. */
1637 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1638 if (BE (err
!= REG_NOERROR
, 0))
1643 re_node_set_free (&cur_dest
);
1647 static reg_errcode_t
1648 internal_function __attribute_warn_unused_result__
1649 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1650 int str_idx
, re_node_set
*cur_dest
)
1652 const re_dfa_t
*const dfa
= mctx
->dfa
;
1653 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1656 /* Then build the next sifted state.
1657 We build the next sifted state on `cur_dest', and update
1658 `sifted_states[str_idx]' with `cur_dest'.
1660 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1661 `cur_src' points the node_set of the old `state_log[str_idx]'
1662 (with the epsilon nodes pre-filtered out). */
1663 for (i
= 0; i
< cur_src
->nelem
; i
++)
1665 int prev_node
= cur_src
->elems
[i
];
1670 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1671 assert (!IS_EPSILON_NODE (type
));
1673 #ifdef RE_ENABLE_I18N
1674 /* If the node may accept `multi byte'. */
1675 if (dfa
->nodes
[prev_node
].accept_mb
)
1676 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1677 str_idx
, sctx
->last_str_idx
);
1678 #endif /* RE_ENABLE_I18N */
1680 /* We don't check backreferences here.
1681 See update_cur_sifted_state(). */
1683 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1684 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1685 dfa
->nexts
[prev_node
]))
1691 if (sctx
->limits
.nelem
)
1693 int to_idx
= str_idx
+ naccepted
;
1694 if (check_dst_limits (mctx
, &sctx
->limits
,
1695 dfa
->nexts
[prev_node
], to_idx
,
1696 prev_node
, str_idx
))
1699 ret
= re_node_set_insert (cur_dest
, prev_node
);
1700 if (BE (ret
== -1, 0))
1707 /* Helper functions. */
1709 static reg_errcode_t
1711 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1713 int top
= mctx
->state_log_top
;
1715 if (next_state_log_idx
>= mctx
->input
.bufs_len
1716 || (next_state_log_idx
>= mctx
->input
.valid_len
1717 && mctx
->input
.valid_len
< mctx
->input
.len
))
1720 err
= extend_buffers (mctx
);
1721 if (BE (err
!= REG_NOERROR
, 0))
1725 if (top
< next_state_log_idx
)
1727 memset (mctx
->state_log
+ top
+ 1, '\0',
1728 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1729 mctx
->state_log_top
= next_state_log_idx
;
1734 static reg_errcode_t
1736 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1737 re_dfastate_t
**src
, int num
)
1741 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1743 if (dst
[st_idx
] == NULL
)
1744 dst
[st_idx
] = src
[st_idx
];
1745 else if (src
[st_idx
] != NULL
)
1747 re_node_set merged_set
;
1748 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1749 &src
[st_idx
]->nodes
);
1750 if (BE (err
!= REG_NOERROR
, 0))
1752 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1753 re_node_set_free (&merged_set
);
1754 if (BE (err
!= REG_NOERROR
, 0))
1761 static reg_errcode_t
1763 update_cur_sifted_state (const re_match_context_t
*mctx
,
1764 re_sift_context_t
*sctx
, int str_idx
,
1765 re_node_set
*dest_nodes
)
1767 const re_dfa_t
*const dfa
= mctx
->dfa
;
1768 reg_errcode_t err
= REG_NOERROR
;
1769 const re_node_set
*candidates
;
1770 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1771 : &mctx
->state_log
[str_idx
]->nodes
);
1773 if (dest_nodes
->nelem
== 0)
1774 sctx
->sifted_states
[str_idx
] = NULL
;
1779 /* At first, add the nodes which can epsilon transit to a node in
1781 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1782 if (BE (err
!= REG_NOERROR
, 0))
1785 /* Then, check the limitations in the current sift_context. */
1786 if (sctx
->limits
.nelem
)
1788 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1789 mctx
->bkref_ents
, str_idx
);
1790 if (BE (err
!= REG_NOERROR
, 0))
1795 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1796 if (BE (err
!= REG_NOERROR
, 0))
1800 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1802 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1803 if (BE (err
!= REG_NOERROR
, 0))
1809 static reg_errcode_t
1810 internal_function __attribute_warn_unused_result__
1811 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1812 const re_node_set
*candidates
)
1814 reg_errcode_t err
= REG_NOERROR
;
1817 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1818 if (BE (err
!= REG_NOERROR
, 0))
1821 if (!state
->inveclosure
.alloc
)
1823 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1824 if (BE (err
!= REG_NOERROR
, 0))
1826 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1828 err
= re_node_set_merge (&state
->inveclosure
,
1829 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1830 if (BE (err
!= REG_NOERROR
, 0))
1834 return re_node_set_add_intersect (dest_nodes
, candidates
,
1835 &state
->inveclosure
);
1838 static reg_errcode_t
1840 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1841 const re_node_set
*candidates
)
1845 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1846 re_node_set except_nodes
;
1847 re_node_set_init_empty (&except_nodes
);
1848 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1850 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1851 if (cur_node
== node
)
1853 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1855 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1856 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1857 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1858 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1859 && re_node_set_contains (dest_nodes
, edst1
))
1861 && !re_node_set_contains (inv_eclosure
, edst2
)
1862 && re_node_set_contains (dest_nodes
, edst2
)))
1864 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1865 dfa
->inveclosures
+ cur_node
);
1866 if (BE (err
!= REG_NOERROR
, 0))
1868 re_node_set_free (&except_nodes
);
1874 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1876 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1877 if (!re_node_set_contains (&except_nodes
, cur_node
))
1879 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1880 re_node_set_remove_at (dest_nodes
, idx
);
1883 re_node_set_free (&except_nodes
);
1889 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1890 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1892 const re_dfa_t
*const dfa
= mctx
->dfa
;
1893 int lim_idx
, src_pos
, dst_pos
;
1895 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1896 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1897 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1900 struct re_backref_cache_entry
*ent
;
1901 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1902 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1904 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1905 subexp_idx
, dst_node
, dst_idx
,
1907 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1908 subexp_idx
, src_node
, src_idx
,
1912 <src> <dst> ( <subexp> )
1913 ( <subexp> ) <src> <dst>
1914 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1915 if (src_pos
== dst_pos
)
1916 continue; /* This is unrelated limitation. */
1925 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1926 int subexp_idx
, int from_node
, int bkref_idx
)
1928 const re_dfa_t
*const dfa
= mctx
->dfa
;
1929 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1932 /* Else, we are on the boundary: examine the nodes on the epsilon
1934 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1936 int node
= eclosures
->elems
[node_idx
];
1937 switch (dfa
->nodes
[node
].type
)
1940 if (bkref_idx
!= -1)
1942 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1947 if (ent
->node
!= node
)
1950 if (subexp_idx
< BITSET_WORD_BITS
1951 && !(ent
->eps_reachable_subexps_map
1952 & ((bitset_word_t
) 1 << subexp_idx
)))
1955 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1956 OP_CLOSE_SUBEXP cases below. But, if the
1957 destination node is the same node as the source
1958 node, don't recurse because it would cause an
1959 infinite loop: a regex that exhibits this behavior
1961 dst
= dfa
->edests
[node
].elems
[0];
1962 if (dst
== from_node
)
1966 else /* if (boundaries & 2) */
1971 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1973 if (cpos
== -1 /* && (boundaries & 1) */)
1975 if (cpos
== 0 && (boundaries
& 2))
1978 if (subexp_idx
< BITSET_WORD_BITS
)
1979 ent
->eps_reachable_subexps_map
1980 &= ~((bitset_word_t
) 1 << subexp_idx
);
1982 while (ent
++->more
);
1986 case OP_OPEN_SUBEXP
:
1987 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1991 case OP_CLOSE_SUBEXP
:
1992 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2001 return (boundaries
& 2) ? 1 : 0;
2006 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2007 int subexp_idx
, int from_node
, int str_idx
,
2010 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2013 /* If we are outside the range of the subexpression, return -1 or 1. */
2014 if (str_idx
< lim
->subexp_from
)
2017 if (lim
->subexp_to
< str_idx
)
2020 /* If we are within the subexpression, return 0. */
2021 boundaries
= (str_idx
== lim
->subexp_from
);
2022 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2023 if (boundaries
== 0)
2026 /* Else, examine epsilon closure. */
2027 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2028 from_node
, bkref_idx
);
2031 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2032 which are against limitations from DEST_NODES. */
2034 static reg_errcode_t
2036 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2037 const re_node_set
*candidates
, re_node_set
*limits
,
2038 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2041 int node_idx
, lim_idx
;
2043 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2046 struct re_backref_cache_entry
*ent
;
2047 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2049 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2050 continue; /* This is unrelated limitation. */
2052 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2053 if (ent
->subexp_to
== str_idx
)
2057 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2059 int node
= dest_nodes
->elems
[node_idx
];
2060 re_token_type_t type
= dfa
->nodes
[node
].type
;
2061 if (type
== OP_OPEN_SUBEXP
2062 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2064 else if (type
== OP_CLOSE_SUBEXP
2065 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2069 /* Check the limitation of the open subexpression. */
2070 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2073 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2075 if (BE (err
!= REG_NOERROR
, 0))
2079 /* Check the limitation of the close subexpression. */
2081 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2083 int node
= dest_nodes
->elems
[node_idx
];
2084 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2086 && !re_node_set_contains (dfa
->eclosures
+ node
,
2089 /* It is against this limitation.
2090 Remove it form the current sifted state. */
2091 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2093 if (BE (err
!= REG_NOERROR
, 0))
2099 else /* (ent->subexp_to != str_idx) */
2101 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2103 int node
= dest_nodes
->elems
[node_idx
];
2104 re_token_type_t type
= dfa
->nodes
[node
].type
;
2105 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2107 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2109 /* It is against this limitation.
2110 Remove it form the current sifted state. */
2111 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2113 if (BE (err
!= REG_NOERROR
, 0))
2122 static reg_errcode_t
2123 internal_function __attribute_warn_unused_result__
2124 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2125 int str_idx
, const re_node_set
*candidates
)
2127 const re_dfa_t
*const dfa
= mctx
->dfa
;
2130 re_sift_context_t local_sctx
;
2131 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2133 if (first_idx
== -1)
2136 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2138 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2141 re_token_type_t type
;
2142 struct re_backref_cache_entry
*entry
;
2143 node
= candidates
->elems
[node_idx
];
2144 type
= dfa
->nodes
[node
].type
;
2145 /* Avoid infinite loop for the REs like "()\1+". */
2146 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2148 if (type
!= OP_BACK_REF
)
2151 entry
= mctx
->bkref_ents
+ first_idx
;
2152 enabled_idx
= first_idx
;
2159 re_dfastate_t
*cur_state
;
2161 if (entry
->node
!= node
)
2163 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2164 to_idx
= str_idx
+ subexp_len
;
2165 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2166 : dfa
->edests
[node
].elems
[0]);
2168 if (to_idx
> sctx
->last_str_idx
2169 || sctx
->sifted_states
[to_idx
] == NULL
2170 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2171 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2172 str_idx
, dst_node
, to_idx
))
2175 if (local_sctx
.sifted_states
== NULL
)
2178 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2179 if (BE (err
!= REG_NOERROR
, 0))
2182 local_sctx
.last_node
= node
;
2183 local_sctx
.last_str_idx
= str_idx
;
2184 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2185 if (BE (ret
< 0, 0))
2190 cur_state
= local_sctx
.sifted_states
[str_idx
];
2191 err
= sift_states_backward (mctx
, &local_sctx
);
2192 if (BE (err
!= REG_NOERROR
, 0))
2194 if (sctx
->limited_states
!= NULL
)
2196 err
= merge_state_array (dfa
, sctx
->limited_states
,
2197 local_sctx
.sifted_states
,
2199 if (BE (err
!= REG_NOERROR
, 0))
2202 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2203 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2205 /* mctx->bkref_ents may have changed, reload the pointer. */
2206 entry
= mctx
->bkref_ents
+ enabled_idx
;
2208 while (enabled_idx
++, entry
++->more
);
2212 if (local_sctx
.sifted_states
!= NULL
)
2214 re_node_set_free (&local_sctx
.limits
);
2221 #ifdef RE_ENABLE_I18N
2224 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2225 int node_idx
, int str_idx
, int max_str_idx
)
2227 const re_dfa_t
*const dfa
= mctx
->dfa
;
2229 /* Check the node can accept `multi byte'. */
2230 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2231 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2232 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2233 dfa
->nexts
[node_idx
]))
2234 /* The node can't accept the `multi byte', or the
2235 destination was already thrown away, then the node
2236 could't accept the current input `multi byte'. */
2238 /* Otherwise, it is sure that the node could accept
2239 `naccepted' bytes input. */
2242 #endif /* RE_ENABLE_I18N */
2245 /* Functions for state transition. */
2247 /* Return the next state to which the current state STATE will transit by
2248 accepting the current input byte, and update STATE_LOG if necessary.
2249 If STATE can accept a multibyte char/collating element/back reference
2250 update the destination of STATE_LOG. */
2252 static re_dfastate_t
*
2253 internal_function __attribute_warn_unused_result__
2254 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2255 re_dfastate_t
*state
)
2257 re_dfastate_t
**trtable
;
2260 #ifdef RE_ENABLE_I18N
2261 /* If the current state can accept multibyte. */
2262 if (BE (state
->accept_mb
, 0))
2264 *err
= transit_state_mb (mctx
, state
);
2265 if (BE (*err
!= REG_NOERROR
, 0))
2268 #endif /* RE_ENABLE_I18N */
2270 /* Then decide the next state with the single byte. */
2273 /* don't use transition table */
2274 return transit_state_sb (err
, mctx
, state
);
2277 /* Use transition table */
2278 ch
= re_string_fetch_byte (&mctx
->input
);
2281 trtable
= state
->trtable
;
2282 if (BE (trtable
!= NULL
, 1))
2285 trtable
= state
->word_trtable
;
2286 if (BE (trtable
!= NULL
, 1))
2288 unsigned int context
;
2290 = re_string_context_at (&mctx
->input
,
2291 re_string_cur_idx (&mctx
->input
) - 1,
2293 if (IS_WORD_CONTEXT (context
))
2294 return trtable
[ch
+ SBC_MAX
];
2299 if (!build_trtable (mctx
->dfa
, state
))
2305 /* Retry, we now have a transition table. */
2309 /* Update the state_log if we need */
2312 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2313 re_dfastate_t
*next_state
)
2315 const re_dfa_t
*const dfa
= mctx
->dfa
;
2316 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2318 if (cur_idx
> mctx
->state_log_top
)
2320 mctx
->state_log
[cur_idx
] = next_state
;
2321 mctx
->state_log_top
= cur_idx
;
2323 else if (mctx
->state_log
[cur_idx
] == 0)
2325 mctx
->state_log
[cur_idx
] = next_state
;
2329 re_dfastate_t
*pstate
;
2330 unsigned int context
;
2331 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2332 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2333 the destination of a multibyte char/collating element/
2334 back reference. Then the next state is the union set of
2335 these destinations and the results of the transition table. */
2336 pstate
= mctx
->state_log
[cur_idx
];
2337 log_nodes
= pstate
->entrance_nodes
;
2338 if (next_state
!= NULL
)
2340 table_nodes
= next_state
->entrance_nodes
;
2341 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2343 if (BE (*err
!= REG_NOERROR
, 0))
2347 next_nodes
= *log_nodes
;
2348 /* Note: We already add the nodes of the initial state,
2349 then we don't need to add them here. */
2351 context
= re_string_context_at (&mctx
->input
,
2352 re_string_cur_idx (&mctx
->input
) - 1,
2354 next_state
= mctx
->state_log
[cur_idx
]
2355 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2356 /* We don't need to check errors here, since the return value of
2357 this function is next_state and ERR is already set. */
2359 if (table_nodes
!= NULL
)
2360 re_node_set_free (&next_nodes
);
2363 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2365 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2366 later. We must check them here, since the back references in the
2367 next state might use them. */
2368 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2370 if (BE (*err
!= REG_NOERROR
, 0))
2373 /* If the next state has back references. */
2374 if (next_state
->has_backref
)
2376 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2377 if (BE (*err
!= REG_NOERROR
, 0))
2379 next_state
= mctx
->state_log
[cur_idx
];
2386 /* Skip bytes in the input that correspond to part of a
2387 multi-byte match, then look in the log for a state
2388 from which to restart matching. */
2391 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2393 re_dfastate_t
*cur_state
;
2396 int max
= mctx
->state_log_top
;
2397 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2401 if (++cur_str_idx
> max
)
2403 re_string_skip_bytes (&mctx
->input
, 1);
2405 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2407 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2409 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2413 /* Helper functions for transit_state. */
2415 /* From the node set CUR_NODES, pick up the nodes whose types are
2416 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2417 expression. And register them to use them later for evaluating the
2418 correspoding back references. */
2420 static reg_errcode_t
2422 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2425 const re_dfa_t
*const dfa
= mctx
->dfa
;
2429 /* TODO: This isn't efficient.
2430 Because there might be more than one nodes whose types are
2431 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2434 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2436 int node
= cur_nodes
->elems
[node_idx
];
2437 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2438 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2439 && (dfa
->used_bkref_map
2440 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2442 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2443 if (BE (err
!= REG_NOERROR
, 0))
2451 /* Return the next state to which the current state STATE will transit by
2452 accepting the current input byte. */
2454 static re_dfastate_t
*
2455 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2456 re_dfastate_t
*state
)
2458 const re_dfa_t
*const dfa
= mctx
->dfa
;
2459 re_node_set next_nodes
;
2460 re_dfastate_t
*next_state
;
2461 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2462 unsigned int context
;
2464 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2465 if (BE (*err
!= REG_NOERROR
, 0))
2467 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2469 int cur_node
= state
->nodes
.elems
[node_cnt
];
2470 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2472 *err
= re_node_set_merge (&next_nodes
,
2473 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2474 if (BE (*err
!= REG_NOERROR
, 0))
2476 re_node_set_free (&next_nodes
);
2481 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2482 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2483 /* We don't need to check errors here, since the return value of
2484 this function is next_state and ERR is already set. */
2486 re_node_set_free (&next_nodes
);
2487 re_string_skip_bytes (&mctx
->input
, 1);
2492 #ifdef RE_ENABLE_I18N
2493 static reg_errcode_t
2495 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2497 const re_dfa_t
*const dfa
= mctx
->dfa
;
2501 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2503 re_node_set dest_nodes
, *new_nodes
;
2504 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2505 int naccepted
, dest_idx
;
2506 unsigned int context
;
2507 re_dfastate_t
*dest_state
;
2509 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2512 if (dfa
->nodes
[cur_node_idx
].constraint
)
2514 context
= re_string_context_at (&mctx
->input
,
2515 re_string_cur_idx (&mctx
->input
),
2517 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2522 /* How many bytes the node can accept? */
2523 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2524 re_string_cur_idx (&mctx
->input
));
2528 /* The node can accepts `naccepted' bytes. */
2529 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2530 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2531 : mctx
->max_mb_elem_len
);
2532 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2533 if (BE (err
!= REG_NOERROR
, 0))
2536 assert (dfa
->nexts
[cur_node_idx
] != -1);
2538 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2540 dest_state
= mctx
->state_log
[dest_idx
];
2541 if (dest_state
== NULL
)
2542 dest_nodes
= *new_nodes
;
2545 err
= re_node_set_init_union (&dest_nodes
,
2546 dest_state
->entrance_nodes
, new_nodes
);
2547 if (BE (err
!= REG_NOERROR
, 0))
2550 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2552 mctx
->state_log
[dest_idx
]
2553 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2554 if (dest_state
!= NULL
)
2555 re_node_set_free (&dest_nodes
);
2556 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2561 #endif /* RE_ENABLE_I18N */
2563 static reg_errcode_t
2565 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2567 const re_dfa_t
*const dfa
= mctx
->dfa
;
2570 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2572 for (i
= 0; i
< nodes
->nelem
; ++i
)
2574 int dest_str_idx
, prev_nelem
, bkc_idx
;
2575 int node_idx
= nodes
->elems
[i
];
2576 unsigned int context
;
2577 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2578 re_node_set
*new_dest_nodes
;
2580 /* Check whether `node' is a backreference or not. */
2581 if (node
->type
!= OP_BACK_REF
)
2584 if (node
->constraint
)
2586 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2588 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2592 /* `node' is a backreference.
2593 Check the substring which the substring matched. */
2594 bkc_idx
= mctx
->nbkref_ents
;
2595 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2596 if (BE (err
!= REG_NOERROR
, 0))
2599 /* And add the epsilon closures (which is `new_dest_nodes') of
2600 the backreference to appropriate state_log. */
2602 assert (dfa
->nexts
[node_idx
] != -1);
2604 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2607 re_dfastate_t
*dest_state
;
2608 struct re_backref_cache_entry
*bkref_ent
;
2609 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2610 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2612 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2613 new_dest_nodes
= (subexp_len
== 0
2614 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2615 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2616 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2617 - bkref_ent
->subexp_from
);
2618 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2620 dest_state
= mctx
->state_log
[dest_str_idx
];
2621 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2622 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2623 /* Add `new_dest_node' to state_log. */
2624 if (dest_state
== NULL
)
2626 mctx
->state_log
[dest_str_idx
]
2627 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2629 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2630 && err
!= REG_NOERROR
, 0))
2635 re_node_set dest_nodes
;
2636 err
= re_node_set_init_union (&dest_nodes
,
2637 dest_state
->entrance_nodes
,
2639 if (BE (err
!= REG_NOERROR
, 0))
2641 re_node_set_free (&dest_nodes
);
2644 mctx
->state_log
[dest_str_idx
]
2645 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2646 re_node_set_free (&dest_nodes
);
2647 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2648 && err
!= REG_NOERROR
, 0))
2651 /* We need to check recursively if the backreference can epsilon
2654 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2656 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2658 if (BE (err
!= REG_NOERROR
, 0))
2660 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2661 if (BE (err
!= REG_NOERROR
, 0))
2671 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2672 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2673 Note that we might collect inappropriate candidates here.
2674 However, the cost of checking them strictly here is too high, then we
2675 delay these checking for prune_impossible_nodes(). */
2677 static reg_errcode_t
2678 internal_function __attribute_warn_unused_result__
2679 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2681 const re_dfa_t
*const dfa
= mctx
->dfa
;
2682 int subexp_num
, sub_top_idx
;
2683 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2684 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2685 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2686 if (cache_idx
!= -1)
2688 const struct re_backref_cache_entry
*entry
2689 = mctx
->bkref_ents
+ cache_idx
;
2691 if (entry
->node
== bkref_node
)
2692 return REG_NOERROR
; /* We already checked it. */
2693 while (entry
++->more
);
2696 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2698 /* For each sub expression */
2699 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2702 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2703 re_sub_match_last_t
*sub_last
;
2704 int sub_last_idx
, sl_str
, bkref_str_off
;
2706 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2707 continue; /* It isn't related. */
2709 sl_str
= sub_top
->str_idx
;
2710 bkref_str_off
= bkref_str_idx
;
2711 /* At first, check the last node of sub expressions we already
2713 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2716 sub_last
= sub_top
->lasts
[sub_last_idx
];
2717 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2718 /* The matched string by the sub expression match with the substring
2719 at the back reference? */
2720 if (sl_str_diff
> 0)
2722 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2724 /* Not enough chars for a successful match. */
2725 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2728 err
= clean_state_log_if_needed (mctx
,
2731 if (BE (err
!= REG_NOERROR
, 0))
2733 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2735 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2736 /* We don't need to search this sub expression any more. */
2739 bkref_str_off
+= sl_str_diff
;
2740 sl_str
+= sl_str_diff
;
2741 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2744 /* Reload buf, since the preceding call might have reallocated
2746 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2748 if (err
== REG_NOMATCH
)
2750 if (BE (err
!= REG_NOERROR
, 0))
2754 if (sub_last_idx
< sub_top
->nlasts
)
2756 if (sub_last_idx
> 0)
2758 /* Then, search for the other last nodes of the sub expression. */
2759 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2761 int cls_node
, sl_str_off
;
2762 const re_node_set
*nodes
;
2763 sl_str_off
= sl_str
- sub_top
->str_idx
;
2764 /* The matched string by the sub expression match with the substring
2765 at the back reference? */
2768 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2770 /* If we are at the end of the input, we cannot match. */
2771 if (bkref_str_off
>= mctx
->input
.len
)
2774 err
= extend_buffers (mctx
);
2775 if (BE (err
!= REG_NOERROR
, 0))
2778 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2780 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2781 break; /* We don't need to search this sub expression
2784 if (mctx
->state_log
[sl_str
] == NULL
)
2786 /* Does this state have a ')' of the sub expression? */
2787 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2788 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2792 if (sub_top
->path
== NULL
)
2794 sub_top
->path
= calloc (sizeof (state_array_t
),
2795 sl_str
- sub_top
->str_idx
+ 1);
2796 if (sub_top
->path
== NULL
)
2799 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2800 in the current context? */
2801 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2802 sub_top
->str_idx
, cls_node
, sl_str
,
2804 if (err
== REG_NOMATCH
)
2806 if (BE (err
!= REG_NOERROR
, 0))
2808 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2809 if (BE (sub_last
== NULL
, 0))
2811 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2813 if (err
== REG_NOMATCH
)
2820 /* Helper functions for get_subexp(). */
2822 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2823 If it can arrive, register the sub expression expressed with SUB_TOP
2826 static reg_errcode_t
2828 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2829 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2833 /* Can the subexpression arrive the back reference? */
2834 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2835 sub_last
->str_idx
, bkref_node
, bkref_str
,
2837 if (err
!= REG_NOERROR
)
2839 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2841 if (BE (err
!= REG_NOERROR
, 0))
2843 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2844 return clean_state_log_if_needed (mctx
, to_idx
);
2847 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2848 Search '(' if FL_OPEN, or search ')' otherwise.
2849 TODO: This function isn't efficient...
2850 Because there might be more than one nodes whose types are
2851 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2858 int subexp_idx
, int type
)
2861 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2863 int cls_node
= nodes
->elems
[cls_idx
];
2864 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2865 if (node
->type
== type
2866 && node
->opr
.idx
== subexp_idx
)
2872 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2873 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2875 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2877 static reg_errcode_t
2878 internal_function __attribute_warn_unused_result__
2879 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2880 int top_str
, int last_node
, int last_str
, int type
)
2882 const re_dfa_t
*const dfa
= mctx
->dfa
;
2883 reg_errcode_t err
= REG_NOERROR
;
2884 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2885 re_dfastate_t
*cur_state
= NULL
;
2886 re_node_set
*cur_nodes
, next_nodes
;
2887 re_dfastate_t
**backup_state_log
;
2888 unsigned int context
;
2890 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2891 /* Extend the buffer if we need. */
2892 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2894 re_dfastate_t
**new_array
;
2895 int old_alloc
= path
->alloc
;
2896 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2897 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2898 if (BE (new_array
== NULL
, 0))
2900 path
->alloc
= old_alloc
;
2903 path
->array
= new_array
;
2904 memset (new_array
+ old_alloc
, '\0',
2905 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2908 str_idx
= path
->next_idx
?: top_str
;
2910 /* Temporary modify MCTX. */
2911 backup_state_log
= mctx
->state_log
;
2912 backup_cur_idx
= mctx
->input
.cur_idx
;
2913 mctx
->state_log
= path
->array
;
2914 mctx
->input
.cur_idx
= str_idx
;
2916 /* Setup initial node set. */
2917 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2918 if (str_idx
== top_str
)
2920 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2921 if (BE (err
!= REG_NOERROR
, 0))
2923 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2924 if (BE (err
!= REG_NOERROR
, 0))
2926 re_node_set_free (&next_nodes
);
2932 cur_state
= mctx
->state_log
[str_idx
];
2933 if (cur_state
&& cur_state
->has_backref
)
2935 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2936 if (BE (err
!= REG_NOERROR
, 0))
2940 re_node_set_init_empty (&next_nodes
);
2942 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2944 if (next_nodes
.nelem
)
2946 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2948 if (BE (err
!= REG_NOERROR
, 0))
2950 re_node_set_free (&next_nodes
);
2954 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2955 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2957 re_node_set_free (&next_nodes
);
2960 mctx
->state_log
[str_idx
] = cur_state
;
2963 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2965 re_node_set_empty (&next_nodes
);
2966 if (mctx
->state_log
[str_idx
+ 1])
2968 err
= re_node_set_merge (&next_nodes
,
2969 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2970 if (BE (err
!= REG_NOERROR
, 0))
2972 re_node_set_free (&next_nodes
);
2978 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2979 &cur_state
->non_eps_nodes
,
2981 if (BE (err
!= REG_NOERROR
, 0))
2983 re_node_set_free (&next_nodes
);
2988 if (next_nodes
.nelem
)
2990 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2991 if (BE (err
!= REG_NOERROR
, 0))
2993 re_node_set_free (&next_nodes
);
2996 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2998 if (BE (err
!= REG_NOERROR
, 0))
3000 re_node_set_free (&next_nodes
);
3004 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3005 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3006 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3008 re_node_set_free (&next_nodes
);
3011 mctx
->state_log
[str_idx
] = cur_state
;
3012 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3014 re_node_set_free (&next_nodes
);
3015 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3016 : &mctx
->state_log
[last_str
]->nodes
);
3017 path
->next_idx
= str_idx
;
3020 mctx
->state_log
= backup_state_log
;
3021 mctx
->input
.cur_idx
= backup_cur_idx
;
3023 /* Then check the current node set has the node LAST_NODE. */
3024 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3030 /* Helper functions for check_arrival. */
3032 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3034 TODO: This function is similar to the functions transit_state*(),
3035 however this function has many additional works.
3036 Can't we unify them? */
3038 static reg_errcode_t
3039 internal_function __attribute_warn_unused_result__
3040 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3041 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3043 const re_dfa_t
*const dfa
= mctx
->dfa
;
3046 reg_errcode_t err
= REG_NOERROR
;
3047 re_node_set union_set
;
3048 re_node_set_init_empty (&union_set
);
3049 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3052 int cur_node
= cur_nodes
->elems
[cur_idx
];
3054 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3055 assert (!IS_EPSILON_NODE (type
));
3057 #ifdef RE_ENABLE_I18N
3058 /* If the node may accept `multi byte'. */
3059 if (dfa
->nodes
[cur_node
].accept_mb
)
3061 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3065 re_dfastate_t
*dest_state
;
3066 int next_node
= dfa
->nexts
[cur_node
];
3067 int next_idx
= str_idx
+ naccepted
;
3068 dest_state
= mctx
->state_log
[next_idx
];
3069 re_node_set_empty (&union_set
);
3072 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3073 if (BE (err
!= REG_NOERROR
, 0))
3075 re_node_set_free (&union_set
);
3079 result
= re_node_set_insert (&union_set
, next_node
);
3080 if (BE (result
< 0, 0))
3082 re_node_set_free (&union_set
);
3085 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3087 if (BE (mctx
->state_log
[next_idx
] == NULL
3088 && err
!= REG_NOERROR
, 0))
3090 re_node_set_free (&union_set
);
3095 #endif /* RE_ENABLE_I18N */
3097 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3099 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3100 if (BE (result
< 0, 0))
3102 re_node_set_free (&union_set
);
3107 re_node_set_free (&union_set
);
3111 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3112 CUR_NODES, however exclude the nodes which are:
3113 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3114 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3117 static reg_errcode_t
3119 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3120 int ex_subexp
, int type
)
3123 int idx
, outside_node
;
3124 re_node_set new_nodes
;
3126 assert (cur_nodes
->nelem
);
3128 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3129 if (BE (err
!= REG_NOERROR
, 0))
3131 /* Create a new node set NEW_NODES with the nodes which are epsilon
3132 closures of the node in CUR_NODES. */
3134 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3136 int cur_node
= cur_nodes
->elems
[idx
];
3137 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3138 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3139 if (outside_node
== -1)
3141 /* There are no problematic nodes, just merge them. */
3142 err
= re_node_set_merge (&new_nodes
, eclosure
);
3143 if (BE (err
!= REG_NOERROR
, 0))
3145 re_node_set_free (&new_nodes
);
3151 /* There are problematic nodes, re-calculate incrementally. */
3152 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3154 if (BE (err
!= REG_NOERROR
, 0))
3156 re_node_set_free (&new_nodes
);
3161 re_node_set_free (cur_nodes
);
3162 *cur_nodes
= new_nodes
;
3166 /* Helper function for check_arrival_expand_ecl.
3167 Check incrementally the epsilon closure of TARGET, and if it isn't
3168 problematic append it to DST_NODES. */
3170 static reg_errcode_t
3171 internal_function __attribute_warn_unused_result__
3172 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3173 int target
, int ex_subexp
, int type
)
3176 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3180 if (dfa
->nodes
[cur_node
].type
== type
3181 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3183 if (type
== OP_CLOSE_SUBEXP
)
3185 err
= re_node_set_insert (dst_nodes
, cur_node
);
3186 if (BE (err
== -1, 0))
3191 err
= re_node_set_insert (dst_nodes
, cur_node
);
3192 if (BE (err
== -1, 0))
3194 if (dfa
->edests
[cur_node
].nelem
== 0)
3196 if (dfa
->edests
[cur_node
].nelem
== 2)
3198 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3199 dfa
->edests
[cur_node
].elems
[1],
3201 if (BE (err
!= REG_NOERROR
, 0))
3204 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3210 /* For all the back references in the current state, calculate the
3211 destination of the back references by the appropriate entry
3212 in MCTX->BKREF_ENTS. */
3214 static reg_errcode_t
3215 internal_function __attribute_warn_unused_result__
3216 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3217 int cur_str
, int subexp_num
, int type
)
3219 const re_dfa_t
*const dfa
= mctx
->dfa
;
3221 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3222 struct re_backref_cache_entry
*ent
;
3224 if (cache_idx_start
== -1)
3228 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3231 int to_idx
, next_node
;
3233 /* Is this entry ENT is appropriate? */
3234 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3237 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3238 /* Calculate the destination of the back reference, and append it
3239 to MCTX->STATE_LOG. */
3240 if (to_idx
== cur_str
)
3242 /* The backreference did epsilon transit, we must re-check all the
3243 node in the current state. */
3244 re_node_set new_dests
;
3245 reg_errcode_t err2
, err3
;
3246 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3247 if (re_node_set_contains (cur_nodes
, next_node
))
3249 err
= re_node_set_init_1 (&new_dests
, next_node
);
3250 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3251 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3252 re_node_set_free (&new_dests
);
3253 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3254 || err3
!= REG_NOERROR
, 0))
3256 err
= (err
!= REG_NOERROR
? err
3257 : (err2
!= REG_NOERROR
? err2
: err3
));
3260 /* TODO: It is still inefficient... */
3265 re_node_set union_set
;
3266 next_node
= dfa
->nexts
[ent
->node
];
3267 if (mctx
->state_log
[to_idx
])
3270 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3273 err
= re_node_set_init_copy (&union_set
,
3274 &mctx
->state_log
[to_idx
]->nodes
);
3275 ret
= re_node_set_insert (&union_set
, next_node
);
3276 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3278 re_node_set_free (&union_set
);
3279 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3285 err
= re_node_set_init_1 (&union_set
, next_node
);
3286 if (BE (err
!= REG_NOERROR
, 0))
3289 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3290 re_node_set_free (&union_set
);
3291 if (BE (mctx
->state_log
[to_idx
] == NULL
3292 && err
!= REG_NOERROR
, 0))
3296 while (ent
++->more
);
3300 /* Build transition table for the state.
3301 Return 1 if succeeded, otherwise return NULL. */
3305 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3308 int i
, j
, ch
, need_word_trtable
= 0;
3309 bitset_word_t elem
, mask
;
3310 bool dests_node_malloced
= false;
3311 bool dest_states_malloced
= false;
3312 int ndests
; /* Number of the destination states from `state'. */
3313 re_dfastate_t
**trtable
;
3314 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3315 re_node_set follows
, *dests_node
;
3317 bitset_t acceptable
;
3321 re_node_set dests_node
[SBC_MAX
];
3322 bitset_t dests_ch
[SBC_MAX
];
3325 /* We build DFA states which corresponds to the destination nodes
3326 from `state'. `dests_node[i]' represents the nodes which i-th
3327 destination state contains, and `dests_ch[i]' represents the
3328 characters which i-th destination state accepts. */
3329 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3330 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3333 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3334 if (BE (dests_alloc
== NULL
, 0))
3336 dests_node_malloced
= true;
3338 dests_node
= dests_alloc
->dests_node
;
3339 dests_ch
= dests_alloc
->dests_ch
;
3341 /* Initialize transiton table. */
3342 state
->word_trtable
= state
->trtable
= NULL
;
3344 /* At first, group all nodes belonging to `state' into several
3346 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3347 if (BE (ndests
<= 0, 0))
3349 if (dests_node_malloced
)
3351 /* Return 0 in case of an error, 1 otherwise. */
3354 state
->trtable
= (re_dfastate_t
**)
3355 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3361 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3362 if (BE (err
!= REG_NOERROR
, 0))
3365 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3366 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3367 dest_states
= (re_dfastate_t
**)
3368 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3371 dest_states
= (re_dfastate_t
**)
3372 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3373 if (BE (dest_states
== NULL
, 0))
3376 if (dest_states_malloced
)
3378 re_node_set_free (&follows
);
3379 for (i
= 0; i
< ndests
; ++i
)
3380 re_node_set_free (dests_node
+ i
);
3381 if (dests_node_malloced
)
3385 dest_states_malloced
= true;
3387 dest_states_word
= dest_states
+ ndests
;
3388 dest_states_nl
= dest_states_word
+ ndests
;
3389 bitset_empty (acceptable
);
3391 /* Then build the states for all destinations. */
3392 for (i
= 0; i
< ndests
; ++i
)
3395 re_node_set_empty (&follows
);
3396 /* Merge the follows of this destination states. */
3397 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3399 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3400 if (next_node
!= -1)
3402 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3403 if (BE (err
!= REG_NOERROR
, 0))
3407 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3408 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3410 /* If the new state has context constraint,
3411 build appropriate states for these contexts. */
3412 if (dest_states
[i
]->has_constraint
)
3414 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3416 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3419 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3420 need_word_trtable
= 1;
3422 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3424 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3429 dest_states_word
[i
] = dest_states
[i
];
3430 dest_states_nl
[i
] = dest_states
[i
];
3432 bitset_merge (acceptable
, dests_ch
[i
]);
3435 if (!BE (need_word_trtable
, 0))
3437 /* We don't care about whether the following character is a word
3438 character, or we are in a single-byte character set so we can
3439 discern by looking at the character code: allocate a
3440 256-entry transition table. */
3441 trtable
= state
->trtable
=
3442 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3443 if (BE (trtable
== NULL
, 0))
3446 /* For all characters ch...: */
3447 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3448 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3450 mask
<<= 1, elem
>>= 1, ++ch
)
3451 if (BE (elem
& 1, 0))
3453 /* There must be exactly one destination which accepts
3454 character ch. See group_nodes_into_DFAstates. */
3455 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3458 /* j-th destination accepts the word character ch. */
3459 if (dfa
->word_char
[i
] & mask
)
3460 trtable
[ch
] = dest_states_word
[j
];
3462 trtable
[ch
] = dest_states
[j
];
3467 /* We care about whether the following character is a word
3468 character, and we are in a multi-byte character set: discern
3469 by looking at the character code: build two 256-entry
3470 transition tables, one starting at trtable[0] and one
3471 starting at trtable[SBC_MAX]. */
3472 trtable
= state
->word_trtable
=
3473 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3474 if (BE (trtable
== NULL
, 0))
3477 /* For all characters ch...: */
3478 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3479 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3481 mask
<<= 1, elem
>>= 1, ++ch
)
3482 if (BE (elem
& 1, 0))
3484 /* There must be exactly one destination which accepts
3485 character ch. See group_nodes_into_DFAstates. */
3486 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3489 /* j-th destination accepts the word character ch. */
3490 trtable
[ch
] = dest_states
[j
];
3491 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3496 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3498 /* The current state accepts newline character. */
3499 for (j
= 0; j
< ndests
; ++j
)
3500 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3502 /* k-th destination accepts newline character. */
3503 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3504 if (need_word_trtable
)
3505 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3506 /* There must be only one destination which accepts
3507 newline. See group_nodes_into_DFAstates. */
3512 if (dest_states_malloced
)
3515 re_node_set_free (&follows
);
3516 for (i
= 0; i
< ndests
; ++i
)
3517 re_node_set_free (dests_node
+ i
);
3519 if (dests_node_malloced
)
3525 /* Group all nodes belonging to STATE into several destinations.
3526 Then for all destinations, set the nodes belonging to the destination
3527 to DESTS_NODE[i] and set the characters accepted by the destination
3528 to DEST_CH[i]. This function return the number of destinations. */
3532 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3533 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3538 int ndests
; /* Number of the destinations from `state'. */
3539 bitset_t accepts
; /* Characters a node can accept. */
3540 const re_node_set
*cur_nodes
= &state
->nodes
;
3541 bitset_empty (accepts
);
3544 /* For all the nodes belonging to `state', */
3545 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3547 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3548 re_token_type_t type
= node
->type
;
3549 unsigned int constraint
= node
->constraint
;
3551 /* Enumerate all single byte character this node can accept. */
3552 if (type
== CHARACTER
)
3553 bitset_set (accepts
, node
->opr
.c
);
3554 else if (type
== SIMPLE_BRACKET
)
3556 bitset_merge (accepts
, node
->opr
.sbcset
);
3558 else if (type
== OP_PERIOD
)
3560 #ifdef RE_ENABLE_I18N
3561 if (dfa
->mb_cur_max
> 1)
3562 bitset_merge (accepts
, dfa
->sb_char
);
3565 bitset_set_all (accepts
);
3566 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3567 bitset_clear (accepts
, '\n');
3568 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3569 bitset_clear (accepts
, '\0');
3571 #ifdef RE_ENABLE_I18N
3572 else if (type
== OP_UTF8_PERIOD
)
3574 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3575 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3576 bitset_clear (accepts
, '\n');
3577 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3578 bitset_clear (accepts
, '\0');
3584 /* Check the `accepts' and sift the characters which are not
3585 match it the context. */
3588 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3590 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3591 bitset_empty (accepts
);
3592 if (accepts_newline
)
3593 bitset_set (accepts
, NEWLINE_CHAR
);
3597 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3599 bitset_empty (accepts
);
3603 if (constraint
& NEXT_WORD_CONSTRAINT
)
3605 bitset_word_t any_set
= 0;
3606 if (type
== CHARACTER
&& !node
->word_char
)
3608 bitset_empty (accepts
);
3611 #ifdef RE_ENABLE_I18N
3612 if (dfa
->mb_cur_max
> 1)
3613 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3614 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3617 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3618 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3622 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3624 bitset_word_t any_set
= 0;
3625 if (type
== CHARACTER
&& node
->word_char
)
3627 bitset_empty (accepts
);
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa
->mb_cur_max
> 1)
3632 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3633 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3636 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3637 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3643 /* Then divide `accepts' into DFA states, or create a new
3644 state. Above, we make sure that accepts is not empty. */
3645 for (j
= 0; j
< ndests
; ++j
)
3647 bitset_t intersec
; /* Intersection sets, see below. */
3649 /* Flags, see below. */
3650 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3652 /* Optimization, skip if this state doesn't accept the character. */
3653 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3656 /* Enumerate the intersection set of this state and `accepts'. */
3658 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3659 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3660 /* And skip if the intersection set is empty. */
3664 /* Then check if this state is a subset of `accepts'. */
3665 not_subset
= not_consumed
= 0;
3666 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3668 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3669 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3672 /* If this state isn't a subset of `accepts', create a
3673 new group state, which has the `remains'. */
3676 bitset_copy (dests_ch
[ndests
], remains
);
3677 bitset_copy (dests_ch
[j
], intersec
);
3678 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3679 if (BE (err
!= REG_NOERROR
, 0))
3684 /* Put the position in the current group. */
3685 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3686 if (BE (result
< 0, 0))
3689 /* If all characters are consumed, go to next node. */
3693 /* Some characters remain, create a new group. */
3696 bitset_copy (dests_ch
[ndests
], accepts
);
3697 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3698 if (BE (err
!= REG_NOERROR
, 0))
3701 bitset_empty (accepts
);
3706 for (j
= 0; j
< ndests
; ++j
)
3707 re_node_set_free (dests_node
+ j
);
3711 #ifdef RE_ENABLE_I18N
3712 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3713 Return the number of the bytes the node accepts.
3714 STR_IDX is the current index of the input string.
3716 This function handles the nodes which can accept one character, or
3717 one collating element like '.', '[a-z]', opposite to the other nodes
3718 can only accept one byte. */
3722 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3723 const re_string_t
*input
, int str_idx
)
3725 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3726 int char_len
, elem_len
;
3729 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3731 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3732 if (BE (c
< 0xc2, 1))
3735 if (str_idx
+ 2 > input
->len
)
3738 d
= re_string_byte_at (input
, str_idx
+ 1);
3740 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3744 if (c
== 0xe0 && d
< 0xa0)
3750 if (c
== 0xf0 && d
< 0x90)
3756 if (c
== 0xf8 && d
< 0x88)
3762 if (c
== 0xfc && d
< 0x84)
3768 if (str_idx
+ char_len
> input
->len
)
3771 for (i
= 1; i
< char_len
; ++i
)
3773 d
= re_string_byte_at (input
, str_idx
+ i
);
3774 if (d
< 0x80 || d
> 0xbf)
3780 char_len
= re_string_char_size_at (input
, str_idx
);
3781 if (node
->type
== OP_PERIOD
)
3785 /* FIXME: I don't think this if is needed, as both '\n'
3786 and '\0' are char_len == 1. */
3787 /* '.' accepts any one character except the following two cases. */
3788 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3789 re_string_byte_at (input
, str_idx
) == '\n') ||
3790 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3791 re_string_byte_at (input
, str_idx
) == '\0'))
3796 elem_len
= re_string_elem_size_at (input
, str_idx
);
3797 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3800 if (node
->type
== COMPLEX_BRACKET
)
3802 const re_charset_t
*cset
= node
->opr
.mbcset
;
3804 const unsigned char *pin
3805 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3810 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3811 ? re_string_wchar_at (input
, str_idx
) : 0);
3813 /* match with multibyte character? */
3814 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3815 if (wc
== cset
->mbchars
[i
])
3817 match_len
= char_len
;
3818 goto check_node_accept_bytes_match
;
3820 /* match with character_class? */
3821 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3823 wctype_t wt
= cset
->char_classes
[i
];
3824 if (__iswctype (wc
, wt
))
3826 match_len
= char_len
;
3827 goto check_node_accept_bytes_match
;
3832 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3835 unsigned int in_collseq
= 0;
3836 const int32_t *table
, *indirect
;
3837 const unsigned char *weights
, *extra
;
3838 const char *collseqwc
;
3839 /* This #include defines a local function! */
3840 # include <locale/weight.h>
3842 /* match with collating_symbol? */
3843 if (cset
->ncoll_syms
)
3844 extra
= (const unsigned char *)
3845 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3846 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3848 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3849 /* Compare the length of input collating element and
3850 the length of current collating element. */
3851 if (*coll_sym
!= elem_len
)
3853 /* Compare each bytes. */
3854 for (j
= 0; j
< *coll_sym
; j
++)
3855 if (pin
[j
] != coll_sym
[1 + j
])
3859 /* Match if every bytes is equal. */
3861 goto check_node_accept_bytes_match
;
3867 if (elem_len
<= char_len
)
3869 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3870 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3873 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3875 /* match with range expression? */
3876 for (i
= 0; i
< cset
->nranges
; ++i
)
3877 if (cset
->range_starts
[i
] <= in_collseq
3878 && in_collseq
<= cset
->range_ends
[i
])
3880 match_len
= elem_len
;
3881 goto check_node_accept_bytes_match
;
3884 /* match with equivalence_class? */
3885 if (cset
->nequiv_classes
)
3887 const unsigned char *cp
= pin
;
3888 table
= (const int32_t *)
3889 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3890 weights
= (const unsigned char *)
3891 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3892 extra
= (const unsigned char *)
3893 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3894 indirect
= (const int32_t *)
3895 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3896 int32_t idx
= findidx (&cp
);
3898 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3900 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3901 size_t weight_len
= weights
[idx
& 0xffffff];
3902 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3903 && (idx
>> 24) == (equiv_class_idx
>> 24))
3908 equiv_class_idx
&= 0xffffff;
3910 while (cnt
<= weight_len
3911 && (weights
[equiv_class_idx
+ 1 + cnt
]
3912 == weights
[idx
+ 1 + cnt
]))
3914 if (cnt
> weight_len
)
3916 match_len
= elem_len
;
3917 goto check_node_accept_bytes_match
;
3926 /* match with range expression? */
3928 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3930 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3933 for (i
= 0; i
< cset
->nranges
; ++i
)
3935 cmp_buf
[0] = cset
->range_starts
[i
];
3936 cmp_buf
[4] = cset
->range_ends
[i
];
3937 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3938 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3940 match_len
= char_len
;
3941 goto check_node_accept_bytes_match
;
3945 check_node_accept_bytes_match
:
3946 if (!cset
->non_match
)
3953 return (elem_len
> char_len
) ? elem_len
: char_len
;
3962 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3964 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3969 /* No valid character. Match it as a single byte character. */
3970 const unsigned char *collseq
= (const unsigned char *)
3971 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3972 return collseq
[mbs
[0]];
3979 const unsigned char *extra
= (const unsigned char *)
3980 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3981 int32_t extrasize
= (const unsigned char *)
3982 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3984 for (idx
= 0; idx
< extrasize
;)
3986 int mbs_cnt
, found
= 0;
3987 int32_t elem_mbs_len
;
3988 /* Skip the name of collating element name. */
3989 idx
= idx
+ extra
[idx
] + 1;
3990 elem_mbs_len
= extra
[idx
++];
3991 if (mbs_len
== elem_mbs_len
)
3993 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3994 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3996 if (mbs_cnt
== elem_mbs_len
)
3997 /* Found the entry. */
4000 /* Skip the byte sequence of the collating element. */
4001 idx
+= elem_mbs_len
;
4002 /* Adjust for the alignment. */
4003 idx
= (idx
+ 3) & ~3;
4004 /* Skip the collation sequence value. */
4005 idx
+= sizeof (uint32_t);
4006 /* Skip the wide char sequence of the collating element. */
4007 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
4008 /* If we found the entry, return the sequence value. */
4010 return *(uint32_t *) (extra
+ idx
);
4011 /* Skip the collation sequence value. */
4012 idx
+= sizeof (uint32_t);
4018 #endif /* RE_ENABLE_I18N */
4020 /* Check whether the node accepts the byte which is IDX-th
4021 byte of the INPUT. */
4025 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4029 ch
= re_string_byte_at (&mctx
->input
, idx
);
4033 if (node
->opr
.c
!= ch
)
4037 case SIMPLE_BRACKET
:
4038 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4042 #ifdef RE_ENABLE_I18N
4043 case OP_UTF8_PERIOD
:
4049 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4050 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4058 if (node
->constraint
)
4060 /* The node has constraints. Check whether the current context
4061 satisfies the constraints. */
4062 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4064 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4071 /* Extend the buffers, if the buffers have run out. */
4073 static reg_errcode_t
4074 internal_function __attribute_warn_unused_result__
4075 extend_buffers (re_match_context_t
*mctx
)
4078 re_string_t
*pstr
= &mctx
->input
;
4080 /* Double the lengthes of the buffers. */
4081 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4082 if (BE (ret
!= REG_NOERROR
, 0))
4085 if (mctx
->state_log
!= NULL
)
4087 /* And double the length of state_log. */
4088 /* XXX We have no indication of the size of this buffer. If this
4089 allocation fail we have no indication that the state_log array
4090 does not have the right size. */
4091 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4092 pstr
->bufs_len
+ 1);
4093 if (BE (new_array
== NULL
, 0))
4095 mctx
->state_log
= new_array
;
4098 /* Then reconstruct the buffers. */
4101 #ifdef RE_ENABLE_I18N
4102 if (pstr
->mb_cur_max
> 1)
4104 ret
= build_wcs_upper_buffer (pstr
);
4105 if (BE (ret
!= REG_NOERROR
, 0))
4109 #endif /* RE_ENABLE_I18N */
4110 build_upper_buffer (pstr
);
4114 #ifdef RE_ENABLE_I18N
4115 if (pstr
->mb_cur_max
> 1)
4116 build_wcs_buffer (pstr
);
4118 #endif /* RE_ENABLE_I18N */
4120 if (pstr
->trans
!= NULL
)
4121 re_string_translate_buffer (pstr
);
4128 /* Functions for matching context. */
4130 /* Initialize MCTX. */
4132 static reg_errcode_t
4133 internal_function __attribute_warn_unused_result__
4134 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4136 mctx
->eflags
= eflags
;
4137 mctx
->match_last
= -1;
4140 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4141 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4142 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4145 /* Already zero-ed by the caller.
4147 mctx->bkref_ents = NULL;
4148 mctx->nbkref_ents = 0;
4149 mctx->nsub_tops = 0; */
4150 mctx
->abkref_ents
= n
;
4151 mctx
->max_mb_elem_len
= 1;
4152 mctx
->asub_tops
= n
;
4156 /* Clean the entries which depend on the current input in MCTX.
4157 This function must be invoked when the matcher changes the start index
4158 of the input, or changes the input string. */
4162 match_ctx_clean (re_match_context_t
*mctx
)
4165 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4168 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4169 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4171 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4172 re_free (last
->path
.array
);
4175 re_free (top
->lasts
);
4178 re_free (top
->path
->array
);
4179 re_free (top
->path
);
4184 mctx
->nsub_tops
= 0;
4185 mctx
->nbkref_ents
= 0;
4188 /* Free all the memory associated with MCTX. */
4192 match_ctx_free (re_match_context_t
*mctx
)
4194 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4195 match_ctx_clean (mctx
);
4196 re_free (mctx
->sub_tops
);
4197 re_free (mctx
->bkref_ents
);
4200 /* Add a new backreference entry to MCTX.
4201 Note that we assume that caller never call this function with duplicate
4202 entry, and call with STR_IDX which isn't smaller than any existing entry.
4205 static reg_errcode_t
4206 internal_function __attribute_warn_unused_result__
4207 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4210 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4212 struct re_backref_cache_entry
* new_entry
;
4213 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4214 mctx
->abkref_ents
* 2);
4215 if (BE (new_entry
== NULL
, 0))
4217 re_free (mctx
->bkref_ents
);
4220 mctx
->bkref_ents
= new_entry
;
4221 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4222 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4223 mctx
->abkref_ents
*= 2;
4225 if (mctx
->nbkref_ents
> 0
4226 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4227 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4229 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4230 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4231 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4232 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4234 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4235 If bit N is clear, means that this entry won't epsilon-transition to
4236 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4237 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4240 A backreference does not epsilon-transition unless it is empty, so set
4241 to all zeros if FROM != TO. */
4242 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4243 = (from
== to
? ~0 : 0);
4245 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4246 if (mctx
->max_mb_elem_len
< to
- from
)
4247 mctx
->max_mb_elem_len
= to
- from
;
4251 /* Search for the first entry which has the same str_idx, or -1 if none is
4252 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4256 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4258 int left
, right
, mid
, last
;
4259 last
= right
= mctx
->nbkref_ents
;
4260 for (left
= 0; left
< right
;)
4262 mid
= (left
+ right
) / 2;
4263 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4268 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4274 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4277 static reg_errcode_t
4278 internal_function __attribute_warn_unused_result__
4279 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4282 assert (mctx
->sub_tops
!= NULL
);
4283 assert (mctx
->asub_tops
> 0);
4285 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4287 int new_asub_tops
= mctx
->asub_tops
* 2;
4288 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4289 re_sub_match_top_t
*,
4291 if (BE (new_array
== NULL
, 0))
4293 mctx
->sub_tops
= new_array
;
4294 mctx
->asub_tops
= new_asub_tops
;
4296 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4297 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4299 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4300 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4304 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4305 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4307 static re_sub_match_last_t
*
4309 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4311 re_sub_match_last_t
*new_entry
;
4312 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4314 int new_alasts
= 2 * subtop
->alasts
+ 1;
4315 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4316 re_sub_match_last_t
*,
4318 if (BE (new_array
== NULL
, 0))
4320 subtop
->lasts
= new_array
;
4321 subtop
->alasts
= new_alasts
;
4323 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4324 if (BE (new_entry
!= NULL
, 1))
4326 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4327 new_entry
->node
= node
;
4328 new_entry
->str_idx
= str_idx
;
4336 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4337 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4339 sctx
->sifted_states
= sifted_sts
;
4340 sctx
->limited_states
= limited_sts
;
4341 sctx
->last_node
= last_node
;
4342 sctx
->last_str_idx
= last_str_idx
;
4343 re_node_set_init_empty (&sctx
->limits
);