1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
57 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
58 int *p_match_first
) internal_function
;
59 static int check_halt_state_context (const re_match_context_t
*mctx
,
60 const re_dfastate_t
*state
, int idx
)
62 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
63 regmatch_t
*prev_idx_match
, int cur_node
,
64 int cur_idx
, int nmatch
) internal_function
;
65 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
66 int str_idx
, int dest_node
, int nregs
,
68 re_node_set
*eps_via_nodes
)
70 static reg_errcode_t
set_regs (const regex_t
*preg
,
71 const re_match_context_t
*mctx
,
72 size_t nmatch
, regmatch_t
*pmatch
,
73 int fl_backtrack
) internal_function
;
74 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
78 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
79 re_sift_context_t
*sctx
,
80 int node_idx
, int str_idx
, int max_str_idx
)
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
84 re_sift_context_t
*sctx
)
86 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
87 re_sift_context_t
*sctx
, int str_idx
,
88 re_node_set
*cur_dest
)
90 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
,
93 re_node_set
*dest_nodes
)
95 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
96 re_node_set
*dest_nodes
,
97 const re_node_set
*candidates
)
99 static int check_dst_limits (const re_match_context_t
*mctx
,
101 int dst_node
, int dst_idx
, int src_node
,
102 int src_idx
) internal_function
;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
104 int boundaries
, int subexp_idx
,
105 int from_node
, int bkref_idx
)
107 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
108 int limit
, int subexp_idx
,
109 int node
, int str_idx
,
110 int bkref_idx
) internal_function
;
111 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
112 re_node_set
*dest_nodes
,
113 const re_node_set
*candidates
,
115 struct re_backref_cache_entry
*bkref_ents
,
116 int str_idx
) internal_function
;
117 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
118 re_sift_context_t
*sctx
,
119 int str_idx
, const re_node_set
*candidates
)
121 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
123 re_dfastate_t
**src
, int num
)
125 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
126 re_match_context_t
*mctx
) internal_function
;
127 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
128 re_match_context_t
*mctx
,
129 re_dfastate_t
*state
) internal_function
;
130 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
,
132 re_dfastate_t
*next_state
)
134 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
135 re_node_set
*cur_nodes
,
136 int str_idx
) internal_function
;
138 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
139 re_match_context_t
*mctx
,
140 re_dfastate_t
*pstate
)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
145 re_dfastate_t
*pstate
)
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
149 const re_node_set
*nodes
)
151 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
152 int bkref_node
, int bkref_str_idx
)
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
)
159 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
160 int subexp_idx
, int type
) internal_function
;
161 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
162 state_array_t
*path
, int top_node
,
163 int top_str
, int last_node
, int last_str
,
164 int type
) internal_function
;
165 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
167 re_node_set
*cur_nodes
,
168 re_node_set
*next_nodes
)
170 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
171 re_node_set
*cur_nodes
,
172 int ex_subexp
, int type
)
174 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
175 re_node_set
*dst_nodes
,
176 int target
, int ex_subexp
,
177 int type
) internal_function
;
178 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
179 re_node_set
*cur_nodes
, int cur_str
,
180 int subexp_num
, int type
)
182 static int build_trtable (const re_dfa_t
*dfa
,
183 re_dfastate_t
*state
) internal_function
;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
186 const re_string_t
*input
, int idx
)
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
195 const re_dfastate_t
*state
,
196 re_node_set
*states_node
,
197 bitset_t
*states_ch
) internal_function
;
198 static int check_node_accept (const re_match_context_t
*mctx
,
199 const re_token_t
*node
, int idx
)
201 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
222 const regex_t
*__restrict preg
;
223 const char *__restrict string
;
230 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
232 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
235 if (eflags
& REG_STARTEND
)
237 start
= pmatch
[0].rm_so
;
238 length
= pmatch
[0].rm_eo
;
243 length
= strlen (string
);
246 __libc_lock_lock (dfa
->lock
);
248 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
249 length
, 0, NULL
, eflags
);
251 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
252 length
, nmatch
, pmatch
, eflags
);
253 __libc_lock_unlock (dfa
->lock
);
254 return err
!= REG_NOERROR
;
258 # include <shlib-compat.h>
259 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec
) __compat_regexec
;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t
*__restrict preg
,
267 const char *__restrict string
, size_t nmatch
,
268 regmatch_t pmatch
[], int eflags
)
270 return regexec (preg
, string
, nmatch
, pmatch
,
271 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
273 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp
, string
, length
, start
, regs
)
308 struct re_pattern_buffer
*bufp
;
311 struct re_registers
*regs
;
313 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
316 weak_alias (__re_match
, re_match
)
320 re_search (bufp
, string
, length
, start
, range
, regs
)
321 struct re_pattern_buffer
*bufp
;
323 int length
, start
, range
;
324 struct re_registers
*regs
;
326 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
329 weak_alias (__re_search
, re_search
)
333 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
334 struct re_pattern_buffer
*bufp
;
335 const char *string1
, *string2
;
336 int length1
, length2
, start
, stop
;
337 struct re_registers
*regs
;
339 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
340 start
, 0, regs
, stop
, 1);
343 weak_alias (__re_match_2
, re_match_2
)
347 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
348 struct re_pattern_buffer
*bufp
;
349 const char *string1
, *string2
;
350 int length1
, length2
, start
, range
, stop
;
351 struct re_registers
*regs
;
353 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
354 start
, range
, regs
, stop
, 0);
357 weak_alias (__re_search_2
, re_search_2
)
361 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
363 struct re_pattern_buffer
*bufp
;
364 const char *string1
, *string2
;
365 int length1
, length2
, start
, range
, stop
, ret_len
;
366 struct re_registers
*regs
;
370 int len
= length1
+ length2
;
373 if (BE (length1
< 0 || length2
< 0 || stop
< 0 || len
< length1
, 0))
376 /* Concatenate the strings. */
380 s
= re_malloc (char, len
);
382 if (BE (s
== NULL
, 0))
385 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
387 memcpy (s
, string1
, length1
);
388 memcpy (s
+ length1
, string2
, length2
);
397 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
402 /* The parameters have the same meaning as those of re_search.
403 Additional parameters:
404 If RET_LEN is nonzero the length of the match is returned (re_match style);
405 otherwise the position of the match is returned. */
408 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
409 struct re_pattern_buffer
*bufp
;
411 int length
, start
, range
, stop
, ret_len
;
412 struct re_registers
*regs
;
414 reg_errcode_t result
;
418 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
420 /* Check for out-of-range. */
421 if (BE (start
< 0 || start
> length
, 0))
423 if (BE (start
+ range
> length
, 0))
424 range
= length
- start
;
425 else if (BE (start
+ range
< 0, 0))
428 __libc_lock_lock (dfa
->lock
);
430 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
431 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
433 /* Compile fastmap if we haven't yet. */
434 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
435 re_compile_fastmap (bufp
);
437 if (BE (bufp
->no_sub
, 0))
440 /* We need at least 1 register. */
443 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
444 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
446 nregs
= regs
->num_regs
;
447 if (BE (nregs
< 1, 0))
449 /* Nothing can be copied to regs. */
455 nregs
= bufp
->re_nsub
+ 1;
456 pmatch
= re_malloc (regmatch_t
, nregs
);
457 if (BE (pmatch
== NULL
, 0))
463 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
464 nregs
, pmatch
, eflags
);
468 /* I hope we needn't fill ther regs with -1's when no match was found. */
469 if (result
!= REG_NOERROR
)
471 else if (regs
!= NULL
)
473 /* If caller wants register contents data back, copy them. */
474 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
475 bufp
->regs_allocated
);
476 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
480 if (BE (rval
== 0, 1))
484 assert (pmatch
[0].rm_so
== start
);
485 rval
= pmatch
[0].rm_eo
- start
;
488 rval
= pmatch
[0].rm_so
;
492 __libc_lock_unlock (dfa
->lock
);
497 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
498 struct re_registers
*regs
;
500 int nregs
, regs_allocated
;
502 int rval
= REGS_REALLOCATE
;
504 int need_regs
= nregs
+ 1;
505 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
508 /* Have the register data arrays been allocated? */
509 if (regs_allocated
== REGS_UNALLOCATED
)
510 { /* No. So allocate them with malloc. */
511 regs
->start
= re_malloc (regoff_t
, need_regs
);
512 if (BE (regs
->start
== NULL
, 0))
513 return REGS_UNALLOCATED
;
514 regs
->end
= re_malloc (regoff_t
, need_regs
);
515 if (BE (regs
->end
== NULL
, 0))
517 re_free (regs
->start
);
518 return REGS_UNALLOCATED
;
520 regs
->num_regs
= need_regs
;
522 else if (regs_allocated
== REGS_REALLOCATE
)
523 { /* Yes. If we need more elements than were already
524 allocated, reallocate them. If we need fewer, just
526 if (BE (need_regs
> regs
->num_regs
, 0))
528 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
530 if (BE (new_start
== NULL
, 0))
531 return REGS_UNALLOCATED
;
532 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
533 if (BE (new_end
== NULL
, 0))
536 return REGS_UNALLOCATED
;
538 regs
->start
= new_start
;
540 regs
->num_regs
= need_regs
;
545 assert (regs_allocated
== REGS_FIXED
);
546 /* This function may not be called with REGS_FIXED and nregs too big. */
547 assert (regs
->num_regs
>= nregs
);
552 for (i
= 0; i
< nregs
; ++i
)
554 regs
->start
[i
] = pmatch
[i
].rm_so
;
555 regs
->end
[i
] = pmatch
[i
].rm_eo
;
557 for ( ; i
< regs
->num_regs
; ++i
)
558 regs
->start
[i
] = regs
->end
[i
] = -1;
563 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
564 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
565 this memory for recording register information. STARTS and ENDS
566 must be allocated using the malloc library routine, and must each
567 be at least NUM_REGS * sizeof (regoff_t) bytes long.
569 If NUM_REGS == 0, then subsequent matches should allocate their own
572 Unless this function is called, the first search or match using
573 PATTERN_BUFFER will allocate its own register data, without
574 freeing the old data. */
577 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
578 struct re_pattern_buffer
*bufp
;
579 struct re_registers
*regs
;
581 regoff_t
*starts
, *ends
;
585 bufp
->regs_allocated
= REGS_REALLOCATE
;
586 regs
->num_regs
= num_regs
;
587 regs
->start
= starts
;
592 bufp
->regs_allocated
= REGS_UNALLOCATED
;
594 regs
->start
= regs
->end
= (regoff_t
*) 0;
598 weak_alias (__re_set_registers
, re_set_registers
)
601 /* Entry points compatible with 4.2 BSD regex library. We don't define
602 them unless specifically requested. */
604 #if defined _REGEX_RE_COMP || defined _LIBC
612 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
614 #endif /* _REGEX_RE_COMP */
616 /* Internal entry point. */
618 /* Searches for a compiled pattern PREG in the string STRING, whose
619 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
620 mingings with regexec. START, and RANGE have the same meanings
622 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
623 otherwise return the error code.
624 Note: We assume front end functions already check ranges.
625 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
628 __attribute_warn_unused_result__
629 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
633 int length
, start
, range
, stop
, eflags
;
638 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
639 int left_lim
, right_lim
, incr
;
640 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644 re_match_context_t mctx
= { .dfa
= dfa
};
646 re_match_context_t mctx
;
648 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
649 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
650 RE_TRANSLATE_TYPE t
= preg
->translate
;
652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
653 memset (&mctx
, '\0', sizeof (re_match_context_t
));
657 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
658 nmatch
-= extra_nmatch
;
660 /* Check if the DFA haven't been compiled. */
661 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
662 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
663 || dfa
->init_state_begbuf
== NULL
, 0))
667 /* We assume front-end functions already check them. */
668 assert (start
+ range
>= 0 && start
+ range
<= length
);
671 /* If initial states with non-begbuf contexts have no elements,
672 the regex must be anchored. If preg->newline_anchor is set,
673 we'll never use init_state_nl, so do not check it. */
674 if (dfa
->init_state
->nodes
.nelem
== 0
675 && dfa
->init_state_word
->nodes
.nelem
== 0
676 && (dfa
->init_state_nl
->nodes
.nelem
== 0
677 || !preg
->newline_anchor
))
679 if (start
!= 0 && start
+ range
!= 0)
684 /* We must check the longest matching, if nmatch > 0. */
685 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
687 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
688 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
689 if (BE (err
!= REG_NOERROR
, 0))
691 mctx
.input
.stop
= stop
;
692 mctx
.input
.raw_stop
= stop
;
693 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
695 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
696 if (BE (err
!= REG_NOERROR
, 0))
699 /* We will log all the DFA states through which the dfa pass,
700 if nmatch > 1, or this dfa has "multibyte node", which is a
701 back-reference or a node which can accept multibyte character or
702 multi character collating element. */
703 if (nmatch
> 1 || dfa
->has_mb_node
)
705 /* Avoid overflow. */
706 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
712 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
713 if (BE (mctx
.state_log
== NULL
, 0))
720 mctx
.state_log
= NULL
;
723 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
724 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
726 /* Check incrementally whether of not the input string match. */
727 incr
= (range
< 0) ? -1 : 1;
728 left_lim
= (range
< 0) ? start
+ range
: start
;
729 right_lim
= (range
< 0) ? start
: start
+ range
;
730 sb
= dfa
->mb_cur_max
== 1;
733 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
734 | (range
>= 0 ? 2 : 0)
735 | (t
!= NULL
? 1 : 0))
738 for (;; match_first
+= incr
)
741 if (match_first
< left_lim
|| right_lim
< match_first
)
744 /* Advance as rapidly as possible through the string, until we
745 find a plausible place to start matching. This may be done
746 with varying efficiency, so there are various possibilities:
747 only the most common of them are specialized, in order to
748 save on code size. We use a switch statement for speed. */
756 /* Fastmap with single-byte translation, match forward. */
757 while (BE (match_first
< right_lim
, 1)
758 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
760 goto forward_match_found_start_or_reached_end
;
763 /* Fastmap without translation, match forward. */
764 while (BE (match_first
< right_lim
, 1)
765 && !fastmap
[(unsigned char) string
[match_first
]])
768 forward_match_found_start_or_reached_end
:
769 if (BE (match_first
== right_lim
, 0))
771 ch
= match_first
>= length
772 ? 0 : (unsigned char) string
[match_first
];
773 if (!fastmap
[t
? t
[ch
] : ch
])
780 /* Fastmap without multi-byte translation, match backwards. */
781 while (match_first
>= left_lim
)
783 ch
= match_first
>= length
784 ? 0 : (unsigned char) string
[match_first
];
785 if (fastmap
[t
? t
[ch
] : ch
])
789 if (match_first
< left_lim
)
794 /* In this case, we can't determine easily the current byte,
795 since it might be a component byte of a multibyte
796 character. Then we use the constructed buffer instead. */
799 /* If MATCH_FIRST is out of the valid range, reconstruct the
801 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
802 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
804 err
= re_string_reconstruct (&mctx
.input
, match_first
,
806 if (BE (err
!= REG_NOERROR
, 0))
809 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
811 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
812 Note that MATCH_FIRST must not be smaller than 0. */
813 ch
= (match_first
>= length
814 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
818 if (match_first
< left_lim
|| match_first
> right_lim
)
827 /* Reconstruct the buffers so that the matcher can assume that
828 the matching starts from the beginning of the buffer. */
829 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
830 if (BE (err
!= REG_NOERROR
, 0))
833 #ifdef RE_ENABLE_I18N
834 /* Don't consider this char as a possible match start if it part,
835 yet isn't the head, of a multibyte character. */
836 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
840 /* It seems to be appropriate one, then use the matcher. */
841 /* We assume that the matching starts from 0. */
842 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
843 match_last
= check_matching (&mctx
, fl_longest_match
,
844 range
>= 0 ? &match_first
: NULL
);
845 if (match_last
!= -1)
847 if (BE (match_last
== -2, 0))
854 mctx
.match_last
= match_last
;
855 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
857 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
858 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
861 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
864 err
= prune_impossible_nodes (&mctx
);
865 if (err
== REG_NOERROR
)
867 if (BE (err
!= REG_NOMATCH
, 0))
872 break; /* We found a match. */
876 match_ctx_clean (&mctx
);
880 assert (match_last
!= -1);
881 assert (err
== REG_NOERROR
);
884 /* Set pmatch[] if we need. */
889 /* Initialize registers. */
890 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
891 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
893 /* Set the points where matching start/end. */
895 pmatch
[0].rm_eo
= mctx
.match_last
;
897 if (!preg
->no_sub
&& nmatch
> 1)
899 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
900 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
901 if (BE (err
!= REG_NOERROR
, 0))
905 /* At last, add the offset to the each registers, since we slided
906 the buffers so that we could assume that the matching starts
908 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
909 if (pmatch
[reg_idx
].rm_so
!= -1)
911 #ifdef RE_ENABLE_I18N
912 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
914 pmatch
[reg_idx
].rm_so
=
915 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
916 ? mctx
.input
.valid_raw_len
917 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
918 pmatch
[reg_idx
].rm_eo
=
919 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
920 ? mctx
.input
.valid_raw_len
921 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
924 assert (mctx
.input
.offsets_needed
== 0);
926 pmatch
[reg_idx
].rm_so
+= match_first
;
927 pmatch
[reg_idx
].rm_eo
+= match_first
;
929 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
931 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
932 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
936 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
937 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
939 pmatch
[reg_idx
+ 1].rm_so
940 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
941 pmatch
[reg_idx
+ 1].rm_eo
942 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
947 re_free (mctx
.state_log
);
949 match_ctx_free (&mctx
);
950 re_string_destruct (&mctx
.input
);
955 __attribute_warn_unused_result__
956 prune_impossible_nodes (mctx
)
957 re_match_context_t
*mctx
;
959 const re_dfa_t
*const dfa
= mctx
->dfa
;
960 int halt_node
, match_last
;
962 re_dfastate_t
**sifted_states
;
963 re_dfastate_t
**lim_states
= NULL
;
964 re_sift_context_t sctx
;
966 assert (mctx
->state_log
!= NULL
);
968 match_last
= mctx
->match_last
;
969 halt_node
= mctx
->last_node
;
971 /* Avoid overflow. */
972 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
975 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
976 if (BE (sifted_states
== NULL
, 0))
983 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
984 if (BE (lim_states
== NULL
, 0))
991 memset (lim_states
, '\0',
992 sizeof (re_dfastate_t
*) * (match_last
+ 1));
993 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
995 ret
= sift_states_backward (mctx
, &sctx
);
996 re_node_set_free (&sctx
.limits
);
997 if (BE (ret
!= REG_NOERROR
, 0))
999 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
1009 } while (mctx
->state_log
[match_last
] == NULL
1010 || !mctx
->state_log
[match_last
]->halt
);
1011 halt_node
= check_halt_state_context (mctx
,
1012 mctx
->state_log
[match_last
],
1015 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1017 re_free (lim_states
);
1019 if (BE (ret
!= REG_NOERROR
, 0))
1024 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1025 ret
= sift_states_backward (mctx
, &sctx
);
1026 re_node_set_free (&sctx
.limits
);
1027 if (BE (ret
!= REG_NOERROR
, 0))
1029 if (sifted_states
[0] == NULL
)
1035 re_free (mctx
->state_log
);
1036 mctx
->state_log
= sifted_states
;
1037 sifted_states
= NULL
;
1038 mctx
->last_node
= halt_node
;
1039 mctx
->match_last
= match_last
;
1042 re_free (sifted_states
);
1043 re_free (lim_states
);
1047 /* Acquire an initial state and return it.
1048 We must select appropriate initial state depending on the context,
1049 since initial states may have constraints like "\<", "^", etc.. */
1051 static inline re_dfastate_t
*
1052 __attribute ((always_inline
)) internal_function
1053 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1056 const re_dfa_t
*const dfa
= mctx
->dfa
;
1057 if (dfa
->init_state
->has_constraint
)
1059 unsigned int context
;
1060 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1061 if (IS_WORD_CONTEXT (context
))
1062 return dfa
->init_state_word
;
1063 else if (IS_ORDINARY_CONTEXT (context
))
1064 return dfa
->init_state
;
1065 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1066 return dfa
->init_state_begbuf
;
1067 else if (IS_NEWLINE_CONTEXT (context
))
1068 return dfa
->init_state_nl
;
1069 else if (IS_BEGBUF_CONTEXT (context
))
1071 /* It is relatively rare case, then calculate on demand. */
1072 return re_acquire_state_context (err
, dfa
,
1073 dfa
->init_state
->entrance_nodes
,
1077 /* Must not happen? */
1078 return dfa
->init_state
;
1081 return dfa
->init_state
;
1084 /* Check whether the regular expression match input string INPUT or not,
1085 and return the index where the matching end, return -1 if not match,
1086 or return -2 in case of an error.
1087 FL_LONGEST_MATCH means we want the POSIX longest matching.
1088 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089 next place where we may want to try matching.
1090 Note that the matcher assume that the maching starts from the current
1091 index of the buffer. */
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1098 const re_dfa_t
*const dfa
= mctx
->dfa
;
1101 int match_last
= -1;
1102 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1103 re_dfastate_t
*cur_state
;
1104 int at_init_state
= p_match_first
!= NULL
;
1105 int next_start_idx
= cur_str_idx
;
1108 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1109 /* An initial state must not be NULL (invalid). */
1110 if (BE (cur_state
== NULL
, 0))
1112 assert (err
== REG_ESPACE
);
1116 if (mctx
->state_log
!= NULL
)
1118 mctx
->state_log
[cur_str_idx
] = cur_state
;
1120 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121 later. E.g. Processing back references. */
1122 if (BE (dfa
->nbackref
, 0))
1125 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1126 if (BE (err
!= REG_NOERROR
, 0))
1129 if (cur_state
->has_backref
)
1131 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1132 if (BE (err
!= REG_NOERROR
, 0))
1138 /* If the RE accepts NULL string. */
1139 if (BE (cur_state
->halt
, 0))
1141 if (!cur_state
->has_constraint
1142 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1144 if (!fl_longest_match
)
1148 match_last
= cur_str_idx
;
1154 while (!re_string_eoi (&mctx
->input
))
1156 re_dfastate_t
*old_state
= cur_state
;
1157 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1159 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1160 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1161 && mctx
->input
.valid_len
< mctx
->input
.len
))
1163 err
= extend_buffers (mctx
);
1164 if (BE (err
!= REG_NOERROR
, 0))
1166 assert (err
== REG_ESPACE
);
1171 cur_state
= transit_state (&err
, mctx
, cur_state
);
1172 if (mctx
->state_log
!= NULL
)
1173 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1175 if (cur_state
== NULL
)
1177 /* Reached the invalid state or an error. Try to recover a valid
1178 state using the state log, if available and if we have not
1179 already found a valid (even if not the longest) match. */
1180 if (BE (err
!= REG_NOERROR
, 0))
1183 if (mctx
->state_log
== NULL
1184 || (match
&& !fl_longest_match
)
1185 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1189 if (BE (at_init_state
, 0))
1191 if (old_state
== cur_state
)
1192 next_start_idx
= next_char_idx
;
1197 if (cur_state
->halt
)
1199 /* Reached a halt state.
1200 Check the halt state can satisfy the current context. */
1201 if (!cur_state
->has_constraint
1202 || check_halt_state_context (mctx
, cur_state
,
1203 re_string_cur_idx (&mctx
->input
)))
1205 /* We found an appropriate halt state. */
1206 match_last
= re_string_cur_idx (&mctx
->input
);
1209 /* We found a match, do not modify match_first below. */
1210 p_match_first
= NULL
;
1211 if (!fl_longest_match
)
1218 *p_match_first
+= next_start_idx
;
1223 /* Check NODE match the current context. */
1227 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1229 re_token_type_t type
= dfa
->nodes
[node
].type
;
1230 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1231 if (type
!= END_OF_RE
)
1235 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1240 /* Check the halt state STATE match the current context.
1241 Return 0 if not match, if the node, STATE has, is a halt node and
1242 match the context, return the node. */
1246 check_halt_state_context (const re_match_context_t
*mctx
,
1247 const re_dfastate_t
*state
, int idx
)
1250 unsigned int context
;
1252 assert (state
->halt
);
1254 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1255 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1256 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1257 return state
->nodes
.elems
[i
];
1261 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1262 corresponding to the DFA).
1263 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1268 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1269 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1270 struct re_fail_stack_t
*fs
)
1272 const re_dfa_t
*const dfa
= mctx
->dfa
;
1274 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1276 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1277 re_node_set
*edests
= &dfa
->edests
[node
];
1279 err
= re_node_set_insert (eps_via_nodes
, node
);
1280 if (BE (err
< 0, 0))
1282 /* Pick up a valid destination, or return -1 if none is found. */
1283 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1285 int candidate
= edests
->elems
[i
];
1286 if (!re_node_set_contains (cur_nodes
, candidate
))
1288 if (dest_node
== -1)
1289 dest_node
= candidate
;
1293 /* In order to avoid infinite loop like "(a*)*", return the second
1294 epsilon-transition if the first was already considered. */
1295 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1298 /* Otherwise, push the second epsilon-transition on the fail stack. */
1300 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1304 /* We know we are going to exit. */
1313 re_token_type_t type
= dfa
->nodes
[node
].type
;
1315 #ifdef RE_ENABLE_I18N
1316 if (dfa
->nodes
[node
].accept_mb
)
1317 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1319 #endif /* RE_ENABLE_I18N */
1320 if (type
== OP_BACK_REF
)
1322 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1323 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1326 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1330 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1331 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1340 err
= re_node_set_insert (eps_via_nodes
, node
);
1341 if (BE (err
< 0, 0))
1343 dest_node
= dfa
->edests
[node
].elems
[0];
1344 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1351 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1353 int dest_node
= dfa
->nexts
[node
];
1354 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1355 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1356 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1359 re_node_set_empty (eps_via_nodes
);
1366 static reg_errcode_t
1367 internal_function __attribute_warn_unused_result__
1368 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1369 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1372 int num
= fs
->num
++;
1373 if (fs
->num
== fs
->alloc
)
1375 struct re_fail_stack_ent_t
*new_array
;
1376 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1378 if (new_array
== NULL
)
1381 fs
->stack
= new_array
;
1383 fs
->stack
[num
].idx
= str_idx
;
1384 fs
->stack
[num
].node
= dest_node
;
1385 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1386 if (fs
->stack
[num
].regs
== NULL
)
1388 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1389 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1395 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1396 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1398 int num
= --fs
->num
;
1400 *pidx
= fs
->stack
[num
].idx
;
1401 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1402 re_node_set_free (eps_via_nodes
);
1403 re_free (fs
->stack
[num
].regs
);
1404 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1405 return fs
->stack
[num
].node
;
1408 /* Set the positions where the subexpressions are starts/ends to registers
1410 Note: We assume that pmatch[0] is already set, and
1411 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1413 static reg_errcode_t
1414 internal_function __attribute_warn_unused_result__
1415 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1416 regmatch_t
*pmatch
, int fl_backtrack
)
1418 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1420 re_node_set eps_via_nodes
;
1421 struct re_fail_stack_t
*fs
;
1422 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1423 regmatch_t
*prev_idx_match
;
1424 int prev_idx_match_malloced
= 0;
1427 assert (nmatch
> 1);
1428 assert (mctx
->state_log
!= NULL
);
1433 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1434 if (fs
->stack
== NULL
)
1440 cur_node
= dfa
->init_node
;
1441 re_node_set_init_empty (&eps_via_nodes
);
1443 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1444 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1447 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1448 if (prev_idx_match
== NULL
)
1450 free_fail_stack_return (fs
);
1453 prev_idx_match_malloced
= 1;
1455 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1457 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1459 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1461 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1466 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1467 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1469 if (reg_idx
== nmatch
)
1471 re_node_set_free (&eps_via_nodes
);
1472 if (prev_idx_match_malloced
)
1473 re_free (prev_idx_match
);
1474 return free_fail_stack_return (fs
);
1476 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1481 re_node_set_free (&eps_via_nodes
);
1482 if (prev_idx_match_malloced
)
1483 re_free (prev_idx_match
);
1488 /* Proceed to next node. */
1489 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1490 &eps_via_nodes
, fs
);
1492 if (BE (cur_node
< 0, 0))
1494 if (BE (cur_node
== -2, 0))
1496 re_node_set_free (&eps_via_nodes
);
1497 if (prev_idx_match_malloced
)
1498 re_free (prev_idx_match
);
1499 free_fail_stack_return (fs
);
1503 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1507 re_node_set_free (&eps_via_nodes
);
1508 if (prev_idx_match_malloced
)
1509 re_free (prev_idx_match
);
1514 re_node_set_free (&eps_via_nodes
);
1515 if (prev_idx_match_malloced
)
1516 re_free (prev_idx_match
);
1517 return free_fail_stack_return (fs
);
1520 static reg_errcode_t
1522 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1527 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1529 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1530 re_free (fs
->stack
[fs_idx
].regs
);
1532 re_free (fs
->stack
);
1539 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1540 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1542 int type
= dfa
->nodes
[cur_node
].type
;
1543 if (type
== OP_OPEN_SUBEXP
)
1545 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1547 /* We are at the first node of this sub expression. */
1548 if (reg_num
< nmatch
)
1550 pmatch
[reg_num
].rm_so
= cur_idx
;
1551 pmatch
[reg_num
].rm_eo
= -1;
1554 else if (type
== OP_CLOSE_SUBEXP
)
1556 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1557 if (reg_num
< nmatch
)
1559 /* We are at the last node of this sub expression. */
1560 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1562 pmatch
[reg_num
].rm_eo
= cur_idx
;
1563 /* This is a non-empty match or we are not inside an optional
1564 subexpression. Accept this right away. */
1565 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1569 if (dfa
->nodes
[cur_node
].opt_subexp
1570 && prev_idx_match
[reg_num
].rm_so
!= -1)
1571 /* We transited through an empty match for an optional
1572 subexpression, like (a?)*, and this is not the subexp's
1573 first match. Copy back the old content of the registers
1574 so that matches of an inner subexpression are undone as
1575 well, like in ((a?))*. */
1576 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1578 /* We completed a subexpression, but it may be part of
1579 an optional one, so do not update PREV_IDX_MATCH. */
1580 pmatch
[reg_num
].rm_eo
= cur_idx
;
1586 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1587 and sift the nodes in each states according to the following rules.
1588 Updated state_log will be wrote to STATE_LOG.
1590 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1591 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1592 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1593 the LAST_NODE, we throw away the node `a'.
1594 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1595 string `s' and transit to `b':
1596 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1598 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1599 thrown away, we throw away the node `a'.
1600 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1601 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1603 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1604 we throw away the node `a'. */
1606 #define STATE_NODE_CONTAINS(state,node) \
1607 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1609 static reg_errcode_t
1611 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1615 int str_idx
= sctx
->last_str_idx
;
1616 re_node_set cur_dest
;
1619 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1622 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1623 transit to the last_node and the last_node itself. */
1624 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1625 if (BE (err
!= REG_NOERROR
, 0))
1627 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1628 if (BE (err
!= REG_NOERROR
, 0))
1631 /* Then check each states in the state_log. */
1634 /* Update counters. */
1635 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1636 if (null_cnt
> mctx
->max_mb_elem_len
)
1638 memset (sctx
->sifted_states
, '\0',
1639 sizeof (re_dfastate_t
*) * str_idx
);
1640 re_node_set_free (&cur_dest
);
1643 re_node_set_empty (&cur_dest
);
1646 if (mctx
->state_log
[str_idx
])
1648 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1649 if (BE (err
!= REG_NOERROR
, 0))
1653 /* Add all the nodes which satisfy the following conditions:
1654 - It can epsilon transit to a node in CUR_DEST.
1656 And update state_log. */
1657 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1658 if (BE (err
!= REG_NOERROR
, 0))
1663 re_node_set_free (&cur_dest
);
1667 static reg_errcode_t
1668 internal_function __attribute_warn_unused_result__
1669 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1670 int str_idx
, re_node_set
*cur_dest
)
1672 const re_dfa_t
*const dfa
= mctx
->dfa
;
1673 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1676 /* Then build the next sifted state.
1677 We build the next sifted state on `cur_dest', and update
1678 `sifted_states[str_idx]' with `cur_dest'.
1680 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1681 `cur_src' points the node_set of the old `state_log[str_idx]'
1682 (with the epsilon nodes pre-filtered out). */
1683 for (i
= 0; i
< cur_src
->nelem
; i
++)
1685 int prev_node
= cur_src
->elems
[i
];
1690 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1691 assert (!IS_EPSILON_NODE (type
));
1693 #ifdef RE_ENABLE_I18N
1694 /* If the node may accept `multi byte'. */
1695 if (dfa
->nodes
[prev_node
].accept_mb
)
1696 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1697 str_idx
, sctx
->last_str_idx
);
1698 #endif /* RE_ENABLE_I18N */
1700 /* We don't check backreferences here.
1701 See update_cur_sifted_state(). */
1703 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1704 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1705 dfa
->nexts
[prev_node
]))
1711 if (sctx
->limits
.nelem
)
1713 int to_idx
= str_idx
+ naccepted
;
1714 if (check_dst_limits (mctx
, &sctx
->limits
,
1715 dfa
->nexts
[prev_node
], to_idx
,
1716 prev_node
, str_idx
))
1719 ret
= re_node_set_insert (cur_dest
, prev_node
);
1720 if (BE (ret
== -1, 0))
1727 /* Helper functions. */
1729 static reg_errcode_t
1731 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1733 int top
= mctx
->state_log_top
;
1735 if (next_state_log_idx
>= mctx
->input
.bufs_len
1736 || (next_state_log_idx
>= mctx
->input
.valid_len
1737 && mctx
->input
.valid_len
< mctx
->input
.len
))
1740 err
= extend_buffers (mctx
);
1741 if (BE (err
!= REG_NOERROR
, 0))
1745 if (top
< next_state_log_idx
)
1747 memset (mctx
->state_log
+ top
+ 1, '\0',
1748 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1749 mctx
->state_log_top
= next_state_log_idx
;
1754 static reg_errcode_t
1756 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1757 re_dfastate_t
**src
, int num
)
1761 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1763 if (dst
[st_idx
] == NULL
)
1764 dst
[st_idx
] = src
[st_idx
];
1765 else if (src
[st_idx
] != NULL
)
1767 re_node_set merged_set
;
1768 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1769 &src
[st_idx
]->nodes
);
1770 if (BE (err
!= REG_NOERROR
, 0))
1772 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1773 re_node_set_free (&merged_set
);
1774 if (BE (err
!= REG_NOERROR
, 0))
1781 static reg_errcode_t
1783 update_cur_sifted_state (const re_match_context_t
*mctx
,
1784 re_sift_context_t
*sctx
, int str_idx
,
1785 re_node_set
*dest_nodes
)
1787 const re_dfa_t
*const dfa
= mctx
->dfa
;
1788 reg_errcode_t err
= REG_NOERROR
;
1789 const re_node_set
*candidates
;
1790 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1791 : &mctx
->state_log
[str_idx
]->nodes
);
1793 if (dest_nodes
->nelem
== 0)
1794 sctx
->sifted_states
[str_idx
] = NULL
;
1799 /* At first, add the nodes which can epsilon transit to a node in
1801 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1802 if (BE (err
!= REG_NOERROR
, 0))
1805 /* Then, check the limitations in the current sift_context. */
1806 if (sctx
->limits
.nelem
)
1808 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1809 mctx
->bkref_ents
, str_idx
);
1810 if (BE (err
!= REG_NOERROR
, 0))
1815 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1816 if (BE (err
!= REG_NOERROR
, 0))
1820 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1822 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1823 if (BE (err
!= REG_NOERROR
, 0))
1829 static reg_errcode_t
1830 internal_function __attribute_warn_unused_result__
1831 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1832 const re_node_set
*candidates
)
1834 reg_errcode_t err
= REG_NOERROR
;
1837 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1838 if (BE (err
!= REG_NOERROR
, 0))
1841 if (!state
->inveclosure
.alloc
)
1843 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1844 if (BE (err
!= REG_NOERROR
, 0))
1846 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1848 err
= re_node_set_merge (&state
->inveclosure
,
1849 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1850 if (BE (err
!= REG_NOERROR
, 0))
1854 return re_node_set_add_intersect (dest_nodes
, candidates
,
1855 &state
->inveclosure
);
1858 static reg_errcode_t
1860 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1861 const re_node_set
*candidates
)
1865 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1866 re_node_set except_nodes
;
1867 re_node_set_init_empty (&except_nodes
);
1868 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1870 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1871 if (cur_node
== node
)
1873 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1875 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1876 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1877 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1878 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1879 && re_node_set_contains (dest_nodes
, edst1
))
1881 && !re_node_set_contains (inv_eclosure
, edst2
)
1882 && re_node_set_contains (dest_nodes
, edst2
)))
1884 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1885 dfa
->inveclosures
+ cur_node
);
1886 if (BE (err
!= REG_NOERROR
, 0))
1888 re_node_set_free (&except_nodes
);
1894 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1896 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1897 if (!re_node_set_contains (&except_nodes
, cur_node
))
1899 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1900 re_node_set_remove_at (dest_nodes
, idx
);
1903 re_node_set_free (&except_nodes
);
1909 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1910 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1912 const re_dfa_t
*const dfa
= mctx
->dfa
;
1913 int lim_idx
, src_pos
, dst_pos
;
1915 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1916 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1917 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1920 struct re_backref_cache_entry
*ent
;
1921 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1922 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1924 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1925 subexp_idx
, dst_node
, dst_idx
,
1927 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1928 subexp_idx
, src_node
, src_idx
,
1932 <src> <dst> ( <subexp> )
1933 ( <subexp> ) <src> <dst>
1934 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1935 if (src_pos
== dst_pos
)
1936 continue; /* This is unrelated limitation. */
1945 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1946 int subexp_idx
, int from_node
, int bkref_idx
)
1948 const re_dfa_t
*const dfa
= mctx
->dfa
;
1949 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1952 /* Else, we are on the boundary: examine the nodes on the epsilon
1954 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1956 int node
= eclosures
->elems
[node_idx
];
1957 switch (dfa
->nodes
[node
].type
)
1960 if (bkref_idx
!= -1)
1962 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1967 if (ent
->node
!= node
)
1970 if (subexp_idx
< BITSET_WORD_BITS
1971 && !(ent
->eps_reachable_subexps_map
1972 & ((bitset_word_t
) 1 << subexp_idx
)))
1975 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1976 OP_CLOSE_SUBEXP cases below. But, if the
1977 destination node is the same node as the source
1978 node, don't recurse because it would cause an
1979 infinite loop: a regex that exhibits this behavior
1981 dst
= dfa
->edests
[node
].elems
[0];
1982 if (dst
== from_node
)
1986 else /* if (boundaries & 2) */
1991 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1993 if (cpos
== -1 /* && (boundaries & 1) */)
1995 if (cpos
== 0 && (boundaries
& 2))
1998 if (subexp_idx
< BITSET_WORD_BITS
)
1999 ent
->eps_reachable_subexps_map
2000 &= ~((bitset_word_t
) 1 << subexp_idx
);
2002 while (ent
++->more
);
2006 case OP_OPEN_SUBEXP
:
2007 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2011 case OP_CLOSE_SUBEXP
:
2012 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2021 return (boundaries
& 2) ? 1 : 0;
2026 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2027 int subexp_idx
, int from_node
, int str_idx
,
2030 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2033 /* If we are outside the range of the subexpression, return -1 or 1. */
2034 if (str_idx
< lim
->subexp_from
)
2037 if (lim
->subexp_to
< str_idx
)
2040 /* If we are within the subexpression, return 0. */
2041 boundaries
= (str_idx
== lim
->subexp_from
);
2042 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2043 if (boundaries
== 0)
2046 /* Else, examine epsilon closure. */
2047 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2048 from_node
, bkref_idx
);
2051 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2052 which are against limitations from DEST_NODES. */
2054 static reg_errcode_t
2056 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2057 const re_node_set
*candidates
, re_node_set
*limits
,
2058 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2061 int node_idx
, lim_idx
;
2063 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2066 struct re_backref_cache_entry
*ent
;
2067 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2069 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2070 continue; /* This is unrelated limitation. */
2072 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2073 if (ent
->subexp_to
== str_idx
)
2077 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2079 int node
= dest_nodes
->elems
[node_idx
];
2080 re_token_type_t type
= dfa
->nodes
[node
].type
;
2081 if (type
== OP_OPEN_SUBEXP
2082 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2084 else if (type
== OP_CLOSE_SUBEXP
2085 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2089 /* Check the limitation of the open subexpression. */
2090 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2093 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2095 if (BE (err
!= REG_NOERROR
, 0))
2099 /* Check the limitation of the close subexpression. */
2101 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2103 int node
= dest_nodes
->elems
[node_idx
];
2104 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2106 && !re_node_set_contains (dfa
->eclosures
+ node
,
2109 /* It is against this limitation.
2110 Remove it form the current sifted state. */
2111 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2113 if (BE (err
!= REG_NOERROR
, 0))
2119 else /* (ent->subexp_to != str_idx) */
2121 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2123 int node
= dest_nodes
->elems
[node_idx
];
2124 re_token_type_t type
= dfa
->nodes
[node
].type
;
2125 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2127 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2129 /* It is against this limitation.
2130 Remove it form the current sifted state. */
2131 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2133 if (BE (err
!= REG_NOERROR
, 0))
2142 static reg_errcode_t
2143 internal_function __attribute_warn_unused_result__
2144 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2145 int str_idx
, const re_node_set
*candidates
)
2147 const re_dfa_t
*const dfa
= mctx
->dfa
;
2150 re_sift_context_t local_sctx
;
2151 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2153 if (first_idx
== -1)
2156 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2158 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2161 re_token_type_t type
;
2162 struct re_backref_cache_entry
*entry
;
2163 node
= candidates
->elems
[node_idx
];
2164 type
= dfa
->nodes
[node
].type
;
2165 /* Avoid infinite loop for the REs like "()\1+". */
2166 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2168 if (type
!= OP_BACK_REF
)
2171 entry
= mctx
->bkref_ents
+ first_idx
;
2172 enabled_idx
= first_idx
;
2179 re_dfastate_t
*cur_state
;
2181 if (entry
->node
!= node
)
2183 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2184 to_idx
= str_idx
+ subexp_len
;
2185 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2186 : dfa
->edests
[node
].elems
[0]);
2188 if (to_idx
> sctx
->last_str_idx
2189 || sctx
->sifted_states
[to_idx
] == NULL
2190 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2191 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2192 str_idx
, dst_node
, to_idx
))
2195 if (local_sctx
.sifted_states
== NULL
)
2198 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2199 if (BE (err
!= REG_NOERROR
, 0))
2202 local_sctx
.last_node
= node
;
2203 local_sctx
.last_str_idx
= str_idx
;
2204 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2205 if (BE (ret
< 0, 0))
2210 cur_state
= local_sctx
.sifted_states
[str_idx
];
2211 err
= sift_states_backward (mctx
, &local_sctx
);
2212 if (BE (err
!= REG_NOERROR
, 0))
2214 if (sctx
->limited_states
!= NULL
)
2216 err
= merge_state_array (dfa
, sctx
->limited_states
,
2217 local_sctx
.sifted_states
,
2219 if (BE (err
!= REG_NOERROR
, 0))
2222 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2223 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2225 /* mctx->bkref_ents may have changed, reload the pointer. */
2226 entry
= mctx
->bkref_ents
+ enabled_idx
;
2228 while (enabled_idx
++, entry
++->more
);
2232 if (local_sctx
.sifted_states
!= NULL
)
2234 re_node_set_free (&local_sctx
.limits
);
2241 #ifdef RE_ENABLE_I18N
2244 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2245 int node_idx
, int str_idx
, int max_str_idx
)
2247 const re_dfa_t
*const dfa
= mctx
->dfa
;
2249 /* Check the node can accept `multi byte'. */
2250 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2251 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2252 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2253 dfa
->nexts
[node_idx
]))
2254 /* The node can't accept the `multi byte', or the
2255 destination was already thrown away, then the node
2256 could't accept the current input `multi byte'. */
2258 /* Otherwise, it is sure that the node could accept
2259 `naccepted' bytes input. */
2262 #endif /* RE_ENABLE_I18N */
2265 /* Functions for state transition. */
2267 /* Return the next state to which the current state STATE will transit by
2268 accepting the current input byte, and update STATE_LOG if necessary.
2269 If STATE can accept a multibyte char/collating element/back reference
2270 update the destination of STATE_LOG. */
2272 static re_dfastate_t
*
2273 internal_function __attribute_warn_unused_result__
2274 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2275 re_dfastate_t
*state
)
2277 re_dfastate_t
**trtable
;
2280 #ifdef RE_ENABLE_I18N
2281 /* If the current state can accept multibyte. */
2282 if (BE (state
->accept_mb
, 0))
2284 *err
= transit_state_mb (mctx
, state
);
2285 if (BE (*err
!= REG_NOERROR
, 0))
2288 #endif /* RE_ENABLE_I18N */
2290 /* Then decide the next state with the single byte. */
2293 /* don't use transition table */
2294 return transit_state_sb (err
, mctx
, state
);
2297 /* Use transition table */
2298 ch
= re_string_fetch_byte (&mctx
->input
);
2301 trtable
= state
->trtable
;
2302 if (BE (trtable
!= NULL
, 1))
2305 trtable
= state
->word_trtable
;
2306 if (BE (trtable
!= NULL
, 1))
2308 unsigned int context
;
2310 = re_string_context_at (&mctx
->input
,
2311 re_string_cur_idx (&mctx
->input
) - 1,
2313 if (IS_WORD_CONTEXT (context
))
2314 return trtable
[ch
+ SBC_MAX
];
2319 if (!build_trtable (mctx
->dfa
, state
))
2325 /* Retry, we now have a transition table. */
2329 /* Update the state_log if we need */
2332 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2333 re_dfastate_t
*next_state
)
2335 const re_dfa_t
*const dfa
= mctx
->dfa
;
2336 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2338 if (cur_idx
> mctx
->state_log_top
)
2340 mctx
->state_log
[cur_idx
] = next_state
;
2341 mctx
->state_log_top
= cur_idx
;
2343 else if (mctx
->state_log
[cur_idx
] == 0)
2345 mctx
->state_log
[cur_idx
] = next_state
;
2349 re_dfastate_t
*pstate
;
2350 unsigned int context
;
2351 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2352 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2353 the destination of a multibyte char/collating element/
2354 back reference. Then the next state is the union set of
2355 these destinations and the results of the transition table. */
2356 pstate
= mctx
->state_log
[cur_idx
];
2357 log_nodes
= pstate
->entrance_nodes
;
2358 if (next_state
!= NULL
)
2360 table_nodes
= next_state
->entrance_nodes
;
2361 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2363 if (BE (*err
!= REG_NOERROR
, 0))
2367 next_nodes
= *log_nodes
;
2368 /* Note: We already add the nodes of the initial state,
2369 then we don't need to add them here. */
2371 context
= re_string_context_at (&mctx
->input
,
2372 re_string_cur_idx (&mctx
->input
) - 1,
2374 next_state
= mctx
->state_log
[cur_idx
]
2375 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2376 /* We don't need to check errors here, since the return value of
2377 this function is next_state and ERR is already set. */
2379 if (table_nodes
!= NULL
)
2380 re_node_set_free (&next_nodes
);
2383 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2385 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2386 later. We must check them here, since the back references in the
2387 next state might use them. */
2388 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2390 if (BE (*err
!= REG_NOERROR
, 0))
2393 /* If the next state has back references. */
2394 if (next_state
->has_backref
)
2396 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2397 if (BE (*err
!= REG_NOERROR
, 0))
2399 next_state
= mctx
->state_log
[cur_idx
];
2406 /* Skip bytes in the input that correspond to part of a
2407 multi-byte match, then look in the log for a state
2408 from which to restart matching. */
2411 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2413 re_dfastate_t
*cur_state
;
2416 int max
= mctx
->state_log_top
;
2417 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2421 if (++cur_str_idx
> max
)
2423 re_string_skip_bytes (&mctx
->input
, 1);
2425 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2427 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2429 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2433 /* Helper functions for transit_state. */
2435 /* From the node set CUR_NODES, pick up the nodes whose types are
2436 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2437 expression. And register them to use them later for evaluating the
2438 correspoding back references. */
2440 static reg_errcode_t
2442 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2445 const re_dfa_t
*const dfa
= mctx
->dfa
;
2449 /* TODO: This isn't efficient.
2450 Because there might be more than one nodes whose types are
2451 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2454 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2456 int node
= cur_nodes
->elems
[node_idx
];
2457 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2458 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2459 && (dfa
->used_bkref_map
2460 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2462 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2463 if (BE (err
!= REG_NOERROR
, 0))
2471 /* Return the next state to which the current state STATE will transit by
2472 accepting the current input byte. */
2474 static re_dfastate_t
*
2475 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2476 re_dfastate_t
*state
)
2478 const re_dfa_t
*const dfa
= mctx
->dfa
;
2479 re_node_set next_nodes
;
2480 re_dfastate_t
*next_state
;
2481 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2482 unsigned int context
;
2484 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2485 if (BE (*err
!= REG_NOERROR
, 0))
2487 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2489 int cur_node
= state
->nodes
.elems
[node_cnt
];
2490 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2492 *err
= re_node_set_merge (&next_nodes
,
2493 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2494 if (BE (*err
!= REG_NOERROR
, 0))
2496 re_node_set_free (&next_nodes
);
2501 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2502 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2503 /* We don't need to check errors here, since the return value of
2504 this function is next_state and ERR is already set. */
2506 re_node_set_free (&next_nodes
);
2507 re_string_skip_bytes (&mctx
->input
, 1);
2512 #ifdef RE_ENABLE_I18N
2513 static reg_errcode_t
2515 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2517 const re_dfa_t
*const dfa
= mctx
->dfa
;
2521 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2523 re_node_set dest_nodes
, *new_nodes
;
2524 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2525 int naccepted
, dest_idx
;
2526 unsigned int context
;
2527 re_dfastate_t
*dest_state
;
2529 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2532 if (dfa
->nodes
[cur_node_idx
].constraint
)
2534 context
= re_string_context_at (&mctx
->input
,
2535 re_string_cur_idx (&mctx
->input
),
2537 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2542 /* How many bytes the node can accept? */
2543 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2544 re_string_cur_idx (&mctx
->input
));
2548 /* The node can accepts `naccepted' bytes. */
2549 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2550 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2551 : mctx
->max_mb_elem_len
);
2552 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2553 if (BE (err
!= REG_NOERROR
, 0))
2556 assert (dfa
->nexts
[cur_node_idx
] != -1);
2558 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2560 dest_state
= mctx
->state_log
[dest_idx
];
2561 if (dest_state
== NULL
)
2562 dest_nodes
= *new_nodes
;
2565 err
= re_node_set_init_union (&dest_nodes
,
2566 dest_state
->entrance_nodes
, new_nodes
);
2567 if (BE (err
!= REG_NOERROR
, 0))
2570 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2572 mctx
->state_log
[dest_idx
]
2573 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2574 if (dest_state
!= NULL
)
2575 re_node_set_free (&dest_nodes
);
2576 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2581 #endif /* RE_ENABLE_I18N */
2583 static reg_errcode_t
2585 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2587 const re_dfa_t
*const dfa
= mctx
->dfa
;
2590 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2592 for (i
= 0; i
< nodes
->nelem
; ++i
)
2594 int dest_str_idx
, prev_nelem
, bkc_idx
;
2595 int node_idx
= nodes
->elems
[i
];
2596 unsigned int context
;
2597 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2598 re_node_set
*new_dest_nodes
;
2600 /* Check whether `node' is a backreference or not. */
2601 if (node
->type
!= OP_BACK_REF
)
2604 if (node
->constraint
)
2606 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2608 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2612 /* `node' is a backreference.
2613 Check the substring which the substring matched. */
2614 bkc_idx
= mctx
->nbkref_ents
;
2615 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2616 if (BE (err
!= REG_NOERROR
, 0))
2619 /* And add the epsilon closures (which is `new_dest_nodes') of
2620 the backreference to appropriate state_log. */
2622 assert (dfa
->nexts
[node_idx
] != -1);
2624 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2627 re_dfastate_t
*dest_state
;
2628 struct re_backref_cache_entry
*bkref_ent
;
2629 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2630 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2632 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2633 new_dest_nodes
= (subexp_len
== 0
2634 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2635 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2636 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2637 - bkref_ent
->subexp_from
);
2638 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2640 dest_state
= mctx
->state_log
[dest_str_idx
];
2641 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2642 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2643 /* Add `new_dest_node' to state_log. */
2644 if (dest_state
== NULL
)
2646 mctx
->state_log
[dest_str_idx
]
2647 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2649 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2650 && err
!= REG_NOERROR
, 0))
2655 re_node_set dest_nodes
;
2656 err
= re_node_set_init_union (&dest_nodes
,
2657 dest_state
->entrance_nodes
,
2659 if (BE (err
!= REG_NOERROR
, 0))
2661 re_node_set_free (&dest_nodes
);
2664 mctx
->state_log
[dest_str_idx
]
2665 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2666 re_node_set_free (&dest_nodes
);
2667 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2668 && err
!= REG_NOERROR
, 0))
2671 /* We need to check recursively if the backreference can epsilon
2674 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2676 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2678 if (BE (err
!= REG_NOERROR
, 0))
2680 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2681 if (BE (err
!= REG_NOERROR
, 0))
2691 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2692 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2693 Note that we might collect inappropriate candidates here.
2694 However, the cost of checking them strictly here is too high, then we
2695 delay these checking for prune_impossible_nodes(). */
2697 static reg_errcode_t
2698 internal_function __attribute_warn_unused_result__
2699 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2701 const re_dfa_t
*const dfa
= mctx
->dfa
;
2702 int subexp_num
, sub_top_idx
;
2703 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2704 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2705 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2706 if (cache_idx
!= -1)
2708 const struct re_backref_cache_entry
*entry
2709 = mctx
->bkref_ents
+ cache_idx
;
2711 if (entry
->node
== bkref_node
)
2712 return REG_NOERROR
; /* We already checked it. */
2713 while (entry
++->more
);
2716 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2718 /* For each sub expression */
2719 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2722 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2723 re_sub_match_last_t
*sub_last
;
2724 int sub_last_idx
, sl_str
, bkref_str_off
;
2726 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2727 continue; /* It isn't related. */
2729 sl_str
= sub_top
->str_idx
;
2730 bkref_str_off
= bkref_str_idx
;
2731 /* At first, check the last node of sub expressions we already
2733 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2736 sub_last
= sub_top
->lasts
[sub_last_idx
];
2737 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2738 /* The matched string by the sub expression match with the substring
2739 at the back reference? */
2740 if (sl_str_diff
> 0)
2742 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2744 /* Not enough chars for a successful match. */
2745 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2748 err
= clean_state_log_if_needed (mctx
,
2751 if (BE (err
!= REG_NOERROR
, 0))
2753 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2755 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2756 /* We don't need to search this sub expression any more. */
2759 bkref_str_off
+= sl_str_diff
;
2760 sl_str
+= sl_str_diff
;
2761 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2764 /* Reload buf, since the preceding call might have reallocated
2766 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2768 if (err
== REG_NOMATCH
)
2770 if (BE (err
!= REG_NOERROR
, 0))
2774 if (sub_last_idx
< sub_top
->nlasts
)
2776 if (sub_last_idx
> 0)
2778 /* Then, search for the other last nodes of the sub expression. */
2779 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2781 int cls_node
, sl_str_off
;
2782 const re_node_set
*nodes
;
2783 sl_str_off
= sl_str
- sub_top
->str_idx
;
2784 /* The matched string by the sub expression match with the substring
2785 at the back reference? */
2788 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2790 /* If we are at the end of the input, we cannot match. */
2791 if (bkref_str_off
>= mctx
->input
.len
)
2794 err
= extend_buffers (mctx
);
2795 if (BE (err
!= REG_NOERROR
, 0))
2798 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2800 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2801 break; /* We don't need to search this sub expression
2804 if (mctx
->state_log
[sl_str
] == NULL
)
2806 /* Does this state have a ')' of the sub expression? */
2807 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2808 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2812 if (sub_top
->path
== NULL
)
2814 sub_top
->path
= calloc (sizeof (state_array_t
),
2815 sl_str
- sub_top
->str_idx
+ 1);
2816 if (sub_top
->path
== NULL
)
2819 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2820 in the current context? */
2821 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2822 sub_top
->str_idx
, cls_node
, sl_str
,
2824 if (err
== REG_NOMATCH
)
2826 if (BE (err
!= REG_NOERROR
, 0))
2828 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2829 if (BE (sub_last
== NULL
, 0))
2831 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2833 if (err
== REG_NOMATCH
)
2840 /* Helper functions for get_subexp(). */
2842 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2843 If it can arrive, register the sub expression expressed with SUB_TOP
2846 static reg_errcode_t
2848 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2849 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2853 /* Can the subexpression arrive the back reference? */
2854 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2855 sub_last
->str_idx
, bkref_node
, bkref_str
,
2857 if (err
!= REG_NOERROR
)
2859 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2861 if (BE (err
!= REG_NOERROR
, 0))
2863 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2864 return clean_state_log_if_needed (mctx
, to_idx
);
2867 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2868 Search '(' if FL_OPEN, or search ')' otherwise.
2869 TODO: This function isn't efficient...
2870 Because there might be more than one nodes whose types are
2871 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2877 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2878 int subexp_idx
, int type
)
2881 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2883 int cls_node
= nodes
->elems
[cls_idx
];
2884 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2885 if (node
->type
== type
2886 && node
->opr
.idx
== subexp_idx
)
2892 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2893 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2895 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2897 static reg_errcode_t
2898 internal_function __attribute_warn_unused_result__
2899 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2900 int top_str
, int last_node
, int last_str
, int type
)
2902 const re_dfa_t
*const dfa
= mctx
->dfa
;
2903 reg_errcode_t err
= REG_NOERROR
;
2904 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2905 re_dfastate_t
*cur_state
= NULL
;
2906 re_node_set
*cur_nodes
, next_nodes
;
2907 re_dfastate_t
**backup_state_log
;
2908 unsigned int context
;
2910 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2911 /* Extend the buffer if we need. */
2912 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2914 re_dfastate_t
**new_array
;
2915 int old_alloc
= path
->alloc
;
2916 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2917 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2918 if (BE (new_array
== NULL
, 0))
2920 path
->alloc
= old_alloc
;
2923 path
->array
= new_array
;
2924 memset (new_array
+ old_alloc
, '\0',
2925 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2928 str_idx
= path
->next_idx
?: top_str
;
2930 /* Temporary modify MCTX. */
2931 backup_state_log
= mctx
->state_log
;
2932 backup_cur_idx
= mctx
->input
.cur_idx
;
2933 mctx
->state_log
= path
->array
;
2934 mctx
->input
.cur_idx
= str_idx
;
2936 /* Setup initial node set. */
2937 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2938 if (str_idx
== top_str
)
2940 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2941 if (BE (err
!= REG_NOERROR
, 0))
2943 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2944 if (BE (err
!= REG_NOERROR
, 0))
2946 re_node_set_free (&next_nodes
);
2952 cur_state
= mctx
->state_log
[str_idx
];
2953 if (cur_state
&& cur_state
->has_backref
)
2955 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2956 if (BE (err
!= REG_NOERROR
, 0))
2960 re_node_set_init_empty (&next_nodes
);
2962 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2964 if (next_nodes
.nelem
)
2966 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2968 if (BE (err
!= REG_NOERROR
, 0))
2970 re_node_set_free (&next_nodes
);
2974 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2975 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2977 re_node_set_free (&next_nodes
);
2980 mctx
->state_log
[str_idx
] = cur_state
;
2983 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2985 re_node_set_empty (&next_nodes
);
2986 if (mctx
->state_log
[str_idx
+ 1])
2988 err
= re_node_set_merge (&next_nodes
,
2989 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2990 if (BE (err
!= REG_NOERROR
, 0))
2992 re_node_set_free (&next_nodes
);
2998 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2999 &cur_state
->non_eps_nodes
,
3001 if (BE (err
!= REG_NOERROR
, 0))
3003 re_node_set_free (&next_nodes
);
3008 if (next_nodes
.nelem
)
3010 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
3011 if (BE (err
!= REG_NOERROR
, 0))
3013 re_node_set_free (&next_nodes
);
3016 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3018 if (BE (err
!= REG_NOERROR
, 0))
3020 re_node_set_free (&next_nodes
);
3024 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3025 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3026 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3028 re_node_set_free (&next_nodes
);
3031 mctx
->state_log
[str_idx
] = cur_state
;
3032 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3034 re_node_set_free (&next_nodes
);
3035 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3036 : &mctx
->state_log
[last_str
]->nodes
);
3037 path
->next_idx
= str_idx
;
3040 mctx
->state_log
= backup_state_log
;
3041 mctx
->input
.cur_idx
= backup_cur_idx
;
3043 /* Then check the current node set has the node LAST_NODE. */
3044 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3050 /* Helper functions for check_arrival. */
3052 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3054 TODO: This function is similar to the functions transit_state*(),
3055 however this function has many additional works.
3056 Can't we unify them? */
3058 static reg_errcode_t
3059 internal_function __attribute_warn_unused_result__
3060 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3061 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3063 const re_dfa_t
*const dfa
= mctx
->dfa
;
3066 reg_errcode_t err
= REG_NOERROR
;
3067 re_node_set union_set
;
3068 re_node_set_init_empty (&union_set
);
3069 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3072 int cur_node
= cur_nodes
->elems
[cur_idx
];
3074 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3075 assert (!IS_EPSILON_NODE (type
));
3077 #ifdef RE_ENABLE_I18N
3078 /* If the node may accept `multi byte'. */
3079 if (dfa
->nodes
[cur_node
].accept_mb
)
3081 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3085 re_dfastate_t
*dest_state
;
3086 int next_node
= dfa
->nexts
[cur_node
];
3087 int next_idx
= str_idx
+ naccepted
;
3088 dest_state
= mctx
->state_log
[next_idx
];
3089 re_node_set_empty (&union_set
);
3092 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3093 if (BE (err
!= REG_NOERROR
, 0))
3095 re_node_set_free (&union_set
);
3099 result
= re_node_set_insert (&union_set
, next_node
);
3100 if (BE (result
< 0, 0))
3102 re_node_set_free (&union_set
);
3105 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3107 if (BE (mctx
->state_log
[next_idx
] == NULL
3108 && err
!= REG_NOERROR
, 0))
3110 re_node_set_free (&union_set
);
3115 #endif /* RE_ENABLE_I18N */
3117 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3119 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3120 if (BE (result
< 0, 0))
3122 re_node_set_free (&union_set
);
3127 re_node_set_free (&union_set
);
3131 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3132 CUR_NODES, however exclude the nodes which are:
3133 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3134 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3137 static reg_errcode_t
3139 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3140 int ex_subexp
, int type
)
3143 int idx
, outside_node
;
3144 re_node_set new_nodes
;
3146 assert (cur_nodes
->nelem
);
3148 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3149 if (BE (err
!= REG_NOERROR
, 0))
3151 /* Create a new node set NEW_NODES with the nodes which are epsilon
3152 closures of the node in CUR_NODES. */
3154 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3156 int cur_node
= cur_nodes
->elems
[idx
];
3157 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3158 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3159 if (outside_node
== -1)
3161 /* There are no problematic nodes, just merge them. */
3162 err
= re_node_set_merge (&new_nodes
, eclosure
);
3163 if (BE (err
!= REG_NOERROR
, 0))
3165 re_node_set_free (&new_nodes
);
3171 /* There are problematic nodes, re-calculate incrementally. */
3172 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3174 if (BE (err
!= REG_NOERROR
, 0))
3176 re_node_set_free (&new_nodes
);
3181 re_node_set_free (cur_nodes
);
3182 *cur_nodes
= new_nodes
;
3186 /* Helper function for check_arrival_expand_ecl.
3187 Check incrementally the epsilon closure of TARGET, and if it isn't
3188 problematic append it to DST_NODES. */
3190 static reg_errcode_t
3191 internal_function __attribute_warn_unused_result__
3192 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3193 int target
, int ex_subexp
, int type
)
3196 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3200 if (dfa
->nodes
[cur_node
].type
== type
3201 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3203 if (type
== OP_CLOSE_SUBEXP
)
3205 err
= re_node_set_insert (dst_nodes
, cur_node
);
3206 if (BE (err
== -1, 0))
3211 err
= re_node_set_insert (dst_nodes
, cur_node
);
3212 if (BE (err
== -1, 0))
3214 if (dfa
->edests
[cur_node
].nelem
== 0)
3216 if (dfa
->edests
[cur_node
].nelem
== 2)
3218 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3219 dfa
->edests
[cur_node
].elems
[1],
3221 if (BE (err
!= REG_NOERROR
, 0))
3224 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3230 /* For all the back references in the current state, calculate the
3231 destination of the back references by the appropriate entry
3232 in MCTX->BKREF_ENTS. */
3234 static reg_errcode_t
3235 internal_function __attribute_warn_unused_result__
3236 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3237 int cur_str
, int subexp_num
, int type
)
3239 const re_dfa_t
*const dfa
= mctx
->dfa
;
3241 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3242 struct re_backref_cache_entry
*ent
;
3244 if (cache_idx_start
== -1)
3248 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3251 int to_idx
, next_node
;
3253 /* Is this entry ENT is appropriate? */
3254 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3257 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3258 /* Calculate the destination of the back reference, and append it
3259 to MCTX->STATE_LOG. */
3260 if (to_idx
== cur_str
)
3262 /* The backreference did epsilon transit, we must re-check all the
3263 node in the current state. */
3264 re_node_set new_dests
;
3265 reg_errcode_t err2
, err3
;
3266 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3267 if (re_node_set_contains (cur_nodes
, next_node
))
3269 err
= re_node_set_init_1 (&new_dests
, next_node
);
3270 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3271 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3272 re_node_set_free (&new_dests
);
3273 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3274 || err3
!= REG_NOERROR
, 0))
3276 err
= (err
!= REG_NOERROR
? err
3277 : (err2
!= REG_NOERROR
? err2
: err3
));
3280 /* TODO: It is still inefficient... */
3285 re_node_set union_set
;
3286 next_node
= dfa
->nexts
[ent
->node
];
3287 if (mctx
->state_log
[to_idx
])
3290 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3293 err
= re_node_set_init_copy (&union_set
,
3294 &mctx
->state_log
[to_idx
]->nodes
);
3295 ret
= re_node_set_insert (&union_set
, next_node
);
3296 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3298 re_node_set_free (&union_set
);
3299 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3305 err
= re_node_set_init_1 (&union_set
, next_node
);
3306 if (BE (err
!= REG_NOERROR
, 0))
3309 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3310 re_node_set_free (&union_set
);
3311 if (BE (mctx
->state_log
[to_idx
] == NULL
3312 && err
!= REG_NOERROR
, 0))
3316 while (ent
++->more
);
3320 /* Build transition table for the state.
3321 Return 1 if succeeded, otherwise return NULL. */
3325 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3328 int i
, j
, ch
, need_word_trtable
= 0;
3329 bitset_word_t elem
, mask
;
3330 bool dests_node_malloced
= false;
3331 bool dest_states_malloced
= false;
3332 int ndests
; /* Number of the destination states from `state'. */
3333 re_dfastate_t
**trtable
;
3334 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3335 re_node_set follows
, *dests_node
;
3337 bitset_t acceptable
;
3341 re_node_set dests_node
[SBC_MAX
];
3342 bitset_t dests_ch
[SBC_MAX
];
3345 /* We build DFA states which corresponds to the destination nodes
3346 from `state'. `dests_node[i]' represents the nodes which i-th
3347 destination state contains, and `dests_ch[i]' represents the
3348 characters which i-th destination state accepts. */
3349 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3350 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3353 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3354 if (BE (dests_alloc
== NULL
, 0))
3356 dests_node_malloced
= true;
3358 dests_node
= dests_alloc
->dests_node
;
3359 dests_ch
= dests_alloc
->dests_ch
;
3361 /* Initialize transiton table. */
3362 state
->word_trtable
= state
->trtable
= NULL
;
3364 /* At first, group all nodes belonging to `state' into several
3366 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3367 if (BE (ndests
<= 0, 0))
3369 if (dests_node_malloced
)
3371 /* Return 0 in case of an error, 1 otherwise. */
3374 state
->trtable
= (re_dfastate_t
**)
3375 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3381 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3382 if (BE (err
!= REG_NOERROR
, 0))
3385 /* Avoid arithmetic overflow in size calculation. */
3386 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3387 / (3 * sizeof (re_dfastate_t
*)))
3392 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3393 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3394 dest_states
= (re_dfastate_t
**)
3395 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3398 dest_states
= (re_dfastate_t
**)
3399 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3400 if (BE (dest_states
== NULL
, 0))
3403 if (dest_states_malloced
)
3405 re_node_set_free (&follows
);
3406 for (i
= 0; i
< ndests
; ++i
)
3407 re_node_set_free (dests_node
+ i
);
3408 if (dests_node_malloced
)
3412 dest_states_malloced
= true;
3414 dest_states_word
= dest_states
+ ndests
;
3415 dest_states_nl
= dest_states_word
+ ndests
;
3416 bitset_empty (acceptable
);
3418 /* Then build the states for all destinations. */
3419 for (i
= 0; i
< ndests
; ++i
)
3422 re_node_set_empty (&follows
);
3423 /* Merge the follows of this destination states. */
3424 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3426 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3427 if (next_node
!= -1)
3429 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3430 if (BE (err
!= REG_NOERROR
, 0))
3434 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3435 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3437 /* If the new state has context constraint,
3438 build appropriate states for these contexts. */
3439 if (dest_states
[i
]->has_constraint
)
3441 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3443 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3446 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3447 need_word_trtable
= 1;
3449 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3451 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3456 dest_states_word
[i
] = dest_states
[i
];
3457 dest_states_nl
[i
] = dest_states
[i
];
3459 bitset_merge (acceptable
, dests_ch
[i
]);
3462 if (!BE (need_word_trtable
, 0))
3464 /* We don't care about whether the following character is a word
3465 character, or we are in a single-byte character set so we can
3466 discern by looking at the character code: allocate a
3467 256-entry transition table. */
3468 trtable
= state
->trtable
=
3469 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3470 if (BE (trtable
== NULL
, 0))
3473 /* For all characters ch...: */
3474 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3475 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3477 mask
<<= 1, elem
>>= 1, ++ch
)
3478 if (BE (elem
& 1, 0))
3480 /* There must be exactly one destination which accepts
3481 character ch. See group_nodes_into_DFAstates. */
3482 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3485 /* j-th destination accepts the word character ch. */
3486 if (dfa
->word_char
[i
] & mask
)
3487 trtable
[ch
] = dest_states_word
[j
];
3489 trtable
[ch
] = dest_states
[j
];
3494 /* We care about whether the following character is a word
3495 character, and we are in a multi-byte character set: discern
3496 by looking at the character code: build two 256-entry
3497 transition tables, one starting at trtable[0] and one
3498 starting at trtable[SBC_MAX]. */
3499 trtable
= state
->word_trtable
=
3500 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3501 if (BE (trtable
== NULL
, 0))
3504 /* For all characters ch...: */
3505 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3506 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3508 mask
<<= 1, elem
>>= 1, ++ch
)
3509 if (BE (elem
& 1, 0))
3511 /* There must be exactly one destination which accepts
3512 character ch. See group_nodes_into_DFAstates. */
3513 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3516 /* j-th destination accepts the word character ch. */
3517 trtable
[ch
] = dest_states
[j
];
3518 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3523 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3525 /* The current state accepts newline character. */
3526 for (j
= 0; j
< ndests
; ++j
)
3527 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3529 /* k-th destination accepts newline character. */
3530 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3531 if (need_word_trtable
)
3532 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3533 /* There must be only one destination which accepts
3534 newline. See group_nodes_into_DFAstates. */
3539 if (dest_states_malloced
)
3542 re_node_set_free (&follows
);
3543 for (i
= 0; i
< ndests
; ++i
)
3544 re_node_set_free (dests_node
+ i
);
3546 if (dests_node_malloced
)
3552 /* Group all nodes belonging to STATE into several destinations.
3553 Then for all destinations, set the nodes belonging to the destination
3554 to DESTS_NODE[i] and set the characters accepted by the destination
3555 to DEST_CH[i]. This function return the number of destinations. */
3559 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3560 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3565 int ndests
; /* Number of the destinations from `state'. */
3566 bitset_t accepts
; /* Characters a node can accept. */
3567 const re_node_set
*cur_nodes
= &state
->nodes
;
3568 bitset_empty (accepts
);
3571 /* For all the nodes belonging to `state', */
3572 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3574 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3575 re_token_type_t type
= node
->type
;
3576 unsigned int constraint
= node
->constraint
;
3578 /* Enumerate all single byte character this node can accept. */
3579 if (type
== CHARACTER
)
3580 bitset_set (accepts
, node
->opr
.c
);
3581 else if (type
== SIMPLE_BRACKET
)
3583 bitset_merge (accepts
, node
->opr
.sbcset
);
3585 else if (type
== OP_PERIOD
)
3587 #ifdef RE_ENABLE_I18N
3588 if (dfa
->mb_cur_max
> 1)
3589 bitset_merge (accepts
, dfa
->sb_char
);
3592 bitset_set_all (accepts
);
3593 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3594 bitset_clear (accepts
, '\n');
3595 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3596 bitset_clear (accepts
, '\0');
3598 #ifdef RE_ENABLE_I18N
3599 else if (type
== OP_UTF8_PERIOD
)
3601 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3602 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3603 bitset_clear (accepts
, '\n');
3604 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3605 bitset_clear (accepts
, '\0');
3611 /* Check the `accepts' and sift the characters which are not
3612 match it the context. */
3615 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3617 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3618 bitset_empty (accepts
);
3619 if (accepts_newline
)
3620 bitset_set (accepts
, NEWLINE_CHAR
);
3624 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3626 bitset_empty (accepts
);
3630 if (constraint
& NEXT_WORD_CONSTRAINT
)
3632 bitset_word_t any_set
= 0;
3633 if (type
== CHARACTER
&& !node
->word_char
)
3635 bitset_empty (accepts
);
3638 #ifdef RE_ENABLE_I18N
3639 if (dfa
->mb_cur_max
> 1)
3640 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3641 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3644 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3645 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3649 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3651 bitset_word_t any_set
= 0;
3652 if (type
== CHARACTER
&& node
->word_char
)
3654 bitset_empty (accepts
);
3657 #ifdef RE_ENABLE_I18N
3658 if (dfa
->mb_cur_max
> 1)
3659 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3660 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3663 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3664 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3670 /* Then divide `accepts' into DFA states, or create a new
3671 state. Above, we make sure that accepts is not empty. */
3672 for (j
= 0; j
< ndests
; ++j
)
3674 bitset_t intersec
; /* Intersection sets, see below. */
3676 /* Flags, see below. */
3677 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3679 /* Optimization, skip if this state doesn't accept the character. */
3680 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3683 /* Enumerate the intersection set of this state and `accepts'. */
3685 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3686 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3687 /* And skip if the intersection set is empty. */
3691 /* Then check if this state is a subset of `accepts'. */
3692 not_subset
= not_consumed
= 0;
3693 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3695 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3696 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3699 /* If this state isn't a subset of `accepts', create a
3700 new group state, which has the `remains'. */
3703 bitset_copy (dests_ch
[ndests
], remains
);
3704 bitset_copy (dests_ch
[j
], intersec
);
3705 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3706 if (BE (err
!= REG_NOERROR
, 0))
3711 /* Put the position in the current group. */
3712 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3713 if (BE (result
< 0, 0))
3716 /* If all characters are consumed, go to next node. */
3720 /* Some characters remain, create a new group. */
3723 bitset_copy (dests_ch
[ndests
], accepts
);
3724 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3725 if (BE (err
!= REG_NOERROR
, 0))
3728 bitset_empty (accepts
);
3733 for (j
= 0; j
< ndests
; ++j
)
3734 re_node_set_free (dests_node
+ j
);
3738 #ifdef RE_ENABLE_I18N
3739 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740 Return the number of the bytes the node accepts.
3741 STR_IDX is the current index of the input string.
3743 This function handles the nodes which can accept one character, or
3744 one collating element like '.', '[a-z]', opposite to the other nodes
3745 can only accept one byte. */
3749 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3750 const re_string_t
*input
, int str_idx
)
3752 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3753 int char_len
, elem_len
;
3756 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3758 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3759 if (BE (c
< 0xc2, 1))
3762 if (str_idx
+ 2 > input
->len
)
3765 d
= re_string_byte_at (input
, str_idx
+ 1);
3767 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3771 if (c
== 0xe0 && d
< 0xa0)
3777 if (c
== 0xf0 && d
< 0x90)
3783 if (c
== 0xf8 && d
< 0x88)
3789 if (c
== 0xfc && d
< 0x84)
3795 if (str_idx
+ char_len
> input
->len
)
3798 for (i
= 1; i
< char_len
; ++i
)
3800 d
= re_string_byte_at (input
, str_idx
+ i
);
3801 if (d
< 0x80 || d
> 0xbf)
3807 char_len
= re_string_char_size_at (input
, str_idx
);
3808 if (node
->type
== OP_PERIOD
)
3812 /* FIXME: I don't think this if is needed, as both '\n'
3813 and '\0' are char_len == 1. */
3814 /* '.' accepts any one character except the following two cases. */
3815 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3816 re_string_byte_at (input
, str_idx
) == '\n') ||
3817 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3818 re_string_byte_at (input
, str_idx
) == '\0'))
3823 elem_len
= re_string_elem_size_at (input
, str_idx
);
3824 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3827 if (node
->type
== COMPLEX_BRACKET
)
3829 const re_charset_t
*cset
= node
->opr
.mbcset
;
3831 const unsigned char *pin
3832 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3837 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3838 ? re_string_wchar_at (input
, str_idx
) : 0);
3840 /* match with multibyte character? */
3841 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3842 if (wc
== cset
->mbchars
[i
])
3844 match_len
= char_len
;
3845 goto check_node_accept_bytes_match
;
3847 /* match with character_class? */
3848 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3850 wctype_t wt
= cset
->char_classes
[i
];
3851 if (__iswctype (wc
, wt
))
3853 match_len
= char_len
;
3854 goto check_node_accept_bytes_match
;
3859 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3862 unsigned int in_collseq
= 0;
3863 const int32_t *table
, *indirect
;
3864 const unsigned char *weights
, *extra
;
3865 const char *collseqwc
;
3866 /* This #include defines a local function! */
3867 # include <locale/weight.h>
3869 /* match with collating_symbol? */
3870 if (cset
->ncoll_syms
)
3871 extra
= (const unsigned char *)
3872 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3873 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3875 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3876 /* Compare the length of input collating element and
3877 the length of current collating element. */
3878 if (*coll_sym
!= elem_len
)
3880 /* Compare each bytes. */
3881 for (j
= 0; j
< *coll_sym
; j
++)
3882 if (pin
[j
] != coll_sym
[1 + j
])
3886 /* Match if every bytes is equal. */
3888 goto check_node_accept_bytes_match
;
3894 if (elem_len
<= char_len
)
3896 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3897 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3900 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3902 /* match with range expression? */
3903 for (i
= 0; i
< cset
->nranges
; ++i
)
3904 if (cset
->range_starts
[i
] <= in_collseq
3905 && in_collseq
<= cset
->range_ends
[i
])
3907 match_len
= elem_len
;
3908 goto check_node_accept_bytes_match
;
3911 /* match with equivalence_class? */
3912 if (cset
->nequiv_classes
)
3914 const unsigned char *cp
= pin
;
3915 table
= (const int32_t *)
3916 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3917 weights
= (const unsigned char *)
3918 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3919 extra
= (const unsigned char *)
3920 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3921 indirect
= (const int32_t *)
3922 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3923 int32_t idx
= findidx (&cp
);
3925 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3927 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3928 size_t weight_len
= weights
[idx
& 0xffffff];
3929 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3930 && (idx
>> 24) == (equiv_class_idx
>> 24))
3935 equiv_class_idx
&= 0xffffff;
3937 while (cnt
<= weight_len
3938 && (weights
[equiv_class_idx
+ 1 + cnt
]
3939 == weights
[idx
+ 1 + cnt
]))
3941 if (cnt
> weight_len
)
3943 match_len
= elem_len
;
3944 goto check_node_accept_bytes_match
;
3953 /* match with range expression? */
3955 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3957 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3960 for (i
= 0; i
< cset
->nranges
; ++i
)
3962 cmp_buf
[0] = cset
->range_starts
[i
];
3963 cmp_buf
[4] = cset
->range_ends
[i
];
3964 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3965 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3967 match_len
= char_len
;
3968 goto check_node_accept_bytes_match
;
3972 check_node_accept_bytes_match
:
3973 if (!cset
->non_match
)
3980 return (elem_len
> char_len
) ? elem_len
: char_len
;
3989 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3991 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3996 /* No valid character. Match it as a single byte character. */
3997 const unsigned char *collseq
= (const unsigned char *)
3998 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3999 return collseq
[mbs
[0]];
4006 const unsigned char *extra
= (const unsigned char *)
4007 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4008 int32_t extrasize
= (const unsigned char *)
4009 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4011 for (idx
= 0; idx
< extrasize
;)
4013 int mbs_cnt
, found
= 0;
4014 int32_t elem_mbs_len
;
4015 /* Skip the name of collating element name. */
4016 idx
= idx
+ extra
[idx
] + 1;
4017 elem_mbs_len
= extra
[idx
++];
4018 if (mbs_len
== elem_mbs_len
)
4020 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4021 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4023 if (mbs_cnt
== elem_mbs_len
)
4024 /* Found the entry. */
4027 /* Skip the byte sequence of the collating element. */
4028 idx
+= elem_mbs_len
;
4029 /* Adjust for the alignment. */
4030 idx
= (idx
+ 3) & ~3;
4031 /* Skip the collation sequence value. */
4032 idx
+= sizeof (uint32_t);
4033 /* Skip the wide char sequence of the collating element. */
4034 idx
= idx
+ sizeof (uint32_t) * (*(int32_t *) (extra
+ idx
) + 1);
4035 /* If we found the entry, return the sequence value. */
4037 return *(uint32_t *) (extra
+ idx
);
4038 /* Skip the collation sequence value. */
4039 idx
+= sizeof (uint32_t);
4045 #endif /* RE_ENABLE_I18N */
4047 /* Check whether the node accepts the byte which is IDX-th
4048 byte of the INPUT. */
4052 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4056 ch
= re_string_byte_at (&mctx
->input
, idx
);
4060 if (node
->opr
.c
!= ch
)
4064 case SIMPLE_BRACKET
:
4065 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4069 #ifdef RE_ENABLE_I18N
4070 case OP_UTF8_PERIOD
:
4076 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4077 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4085 if (node
->constraint
)
4087 /* The node has constraints. Check whether the current context
4088 satisfies the constraints. */
4089 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4091 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4098 /* Extend the buffers, if the buffers have run out. */
4100 static reg_errcode_t
4101 internal_function __attribute_warn_unused_result__
4102 extend_buffers (re_match_context_t
*mctx
)
4105 re_string_t
*pstr
= &mctx
->input
;
4107 /* Avoid overflow. */
4108 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4111 /* Double the lengthes of the buffers. */
4112 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4113 if (BE (ret
!= REG_NOERROR
, 0))
4116 if (mctx
->state_log
!= NULL
)
4118 /* And double the length of state_log. */
4119 /* XXX We have no indication of the size of this buffer. If this
4120 allocation fail we have no indication that the state_log array
4121 does not have the right size. */
4122 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4123 pstr
->bufs_len
+ 1);
4124 if (BE (new_array
== NULL
, 0))
4126 mctx
->state_log
= new_array
;
4129 /* Then reconstruct the buffers. */
4132 #ifdef RE_ENABLE_I18N
4133 if (pstr
->mb_cur_max
> 1)
4135 ret
= build_wcs_upper_buffer (pstr
);
4136 if (BE (ret
!= REG_NOERROR
, 0))
4140 #endif /* RE_ENABLE_I18N */
4141 build_upper_buffer (pstr
);
4145 #ifdef RE_ENABLE_I18N
4146 if (pstr
->mb_cur_max
> 1)
4147 build_wcs_buffer (pstr
);
4149 #endif /* RE_ENABLE_I18N */
4151 if (pstr
->trans
!= NULL
)
4152 re_string_translate_buffer (pstr
);
4159 /* Functions for matching context. */
4161 /* Initialize MCTX. */
4163 static reg_errcode_t
4164 internal_function __attribute_warn_unused_result__
4165 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4167 mctx
->eflags
= eflags
;
4168 mctx
->match_last
= -1;
4171 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4172 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4173 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4176 /* Already zero-ed by the caller.
4178 mctx->bkref_ents = NULL;
4179 mctx->nbkref_ents = 0;
4180 mctx->nsub_tops = 0; */
4181 mctx
->abkref_ents
= n
;
4182 mctx
->max_mb_elem_len
= 1;
4183 mctx
->asub_tops
= n
;
4187 /* Clean the entries which depend on the current input in MCTX.
4188 This function must be invoked when the matcher changes the start index
4189 of the input, or changes the input string. */
4193 match_ctx_clean (re_match_context_t
*mctx
)
4196 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4199 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4200 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4202 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4203 re_free (last
->path
.array
);
4206 re_free (top
->lasts
);
4209 re_free (top
->path
->array
);
4210 re_free (top
->path
);
4215 mctx
->nsub_tops
= 0;
4216 mctx
->nbkref_ents
= 0;
4219 /* Free all the memory associated with MCTX. */
4223 match_ctx_free (re_match_context_t
*mctx
)
4225 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4226 match_ctx_clean (mctx
);
4227 re_free (mctx
->sub_tops
);
4228 re_free (mctx
->bkref_ents
);
4231 /* Add a new backreference entry to MCTX.
4232 Note that we assume that caller never call this function with duplicate
4233 entry, and call with STR_IDX which isn't smaller than any existing entry.
4236 static reg_errcode_t
4237 internal_function __attribute_warn_unused_result__
4238 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4241 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4243 struct re_backref_cache_entry
* new_entry
;
4244 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4245 mctx
->abkref_ents
* 2);
4246 if (BE (new_entry
== NULL
, 0))
4248 re_free (mctx
->bkref_ents
);
4251 mctx
->bkref_ents
= new_entry
;
4252 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4253 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4254 mctx
->abkref_ents
*= 2;
4256 if (mctx
->nbkref_ents
> 0
4257 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4258 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4260 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4261 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4262 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4263 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4265 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4266 If bit N is clear, means that this entry won't epsilon-transition to
4267 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4268 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4271 A backreference does not epsilon-transition unless it is empty, so set
4272 to all zeros if FROM != TO. */
4273 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4274 = (from
== to
? ~0 : 0);
4276 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4277 if (mctx
->max_mb_elem_len
< to
- from
)
4278 mctx
->max_mb_elem_len
= to
- from
;
4282 /* Search for the first entry which has the same str_idx, or -1 if none is
4283 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4287 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4289 int left
, right
, mid
, last
;
4290 last
= right
= mctx
->nbkref_ents
;
4291 for (left
= 0; left
< right
;)
4293 mid
= (left
+ right
) / 2;
4294 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4299 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4305 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4308 static reg_errcode_t
4309 internal_function __attribute_warn_unused_result__
4310 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4313 assert (mctx
->sub_tops
!= NULL
);
4314 assert (mctx
->asub_tops
> 0);
4316 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4318 int new_asub_tops
= mctx
->asub_tops
* 2;
4319 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4320 re_sub_match_top_t
*,
4322 if (BE (new_array
== NULL
, 0))
4324 mctx
->sub_tops
= new_array
;
4325 mctx
->asub_tops
= new_asub_tops
;
4327 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4328 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4330 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4331 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4335 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4336 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4338 static re_sub_match_last_t
*
4340 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4342 re_sub_match_last_t
*new_entry
;
4343 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4345 int new_alasts
= 2 * subtop
->alasts
+ 1;
4346 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4347 re_sub_match_last_t
*,
4349 if (BE (new_array
== NULL
, 0))
4351 subtop
->lasts
= new_array
;
4352 subtop
->alasts
= new_alasts
;
4354 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4355 if (BE (new_entry
!= NULL
, 1))
4357 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4358 new_entry
->node
= node
;
4359 new_entry
->str_idx
= str_idx
;
4367 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4368 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4370 sctx
->sifted_states
= sifted_sts
;
4371 sctx
->limited_states
= limited_sts
;
4372 sctx
->last_node
= last_node
;
4373 sctx
->last_str_idx
= last_str_idx
;
4374 re_node_set_init_empty (&sctx
->limits
);