1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
21 int n
) internal_function
;
22 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
23 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
24 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
25 int str_idx
, int from
, int to
)
27 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
29 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
30 int str_idx
) internal_function
;
31 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
32 int node
, int str_idx
)
34 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
35 re_dfastate_t
**limited_sts
, int last_node
,
38 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
39 const char *string
, int length
,
40 int start
, int range
, int stop
,
41 size_t nmatch
, regmatch_t pmatch
[],
43 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
44 const char *string1
, int length1
,
45 const char *string2
, int length2
,
46 int start
, int range
, struct re_registers
*regs
,
47 int stop
, int ret_len
);
48 static int re_search_stub (struct re_pattern_buffer
*bufp
,
49 const char *string
, int length
, int start
,
50 int range
, int stop
, struct re_registers
*regs
,
52 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
53 int nregs
, int regs_allocated
);
54 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
);
55 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
56 int *p_match_first
) internal_function
;
57 static int check_halt_state_context (const re_match_context_t
*mctx
,
58 const re_dfastate_t
*state
, int idx
)
60 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
61 regmatch_t
*prev_idx_match
, int cur_node
,
62 int cur_idx
, int nmatch
) internal_function
;
63 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
64 int str_idx
, int dest_node
, int nregs
,
66 re_node_set
*eps_via_nodes
)
68 static reg_errcode_t
set_regs (const regex_t
*preg
,
69 const re_match_context_t
*mctx
,
70 size_t nmatch
, regmatch_t
*pmatch
,
71 int fl_backtrack
) internal_function
;
72 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
76 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
77 re_sift_context_t
*sctx
,
78 int node_idx
, int str_idx
, int max_str_idx
)
80 #endif /* RE_ENABLE_I18N */
81 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
82 re_sift_context_t
*sctx
)
84 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
85 re_sift_context_t
*sctx
, int str_idx
,
86 re_node_set
*cur_dest
)
88 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
89 re_sift_context_t
*sctx
,
91 re_node_set
*dest_nodes
)
93 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
94 re_node_set
*dest_nodes
,
95 const re_node_set
*candidates
)
97 static int check_dst_limits (const re_match_context_t
*mctx
,
99 int dst_node
, int dst_idx
, int src_node
,
100 int src_idx
) internal_function
;
101 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
102 int boundaries
, int subexp_idx
,
103 int from_node
, int bkref_idx
)
105 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
106 int limit
, int subexp_idx
,
107 int node
, int str_idx
,
108 int bkref_idx
) internal_function
;
109 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
110 re_node_set
*dest_nodes
,
111 const re_node_set
*candidates
,
113 struct re_backref_cache_entry
*bkref_ents
,
114 int str_idx
) internal_function
;
115 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
116 re_sift_context_t
*sctx
,
117 int str_idx
, const re_node_set
*candidates
)
119 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
121 re_dfastate_t
**src
, int num
)
123 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
124 re_match_context_t
*mctx
) internal_function
;
125 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
126 re_match_context_t
*mctx
,
127 re_dfastate_t
*state
) internal_function
;
128 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
129 re_match_context_t
*mctx
,
130 re_dfastate_t
*next_state
)
132 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
133 re_node_set
*cur_nodes
,
134 int str_idx
) internal_function
;
136 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
137 re_match_context_t
*mctx
,
138 re_dfastate_t
*pstate
)
141 #ifdef RE_ENABLE_I18N
142 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
143 re_dfastate_t
*pstate
)
145 #endif /* RE_ENABLE_I18N */
146 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
147 const re_node_set
*nodes
)
149 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
150 int bkref_node
, int bkref_str_idx
)
152 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
153 const re_sub_match_top_t
*sub_top
,
154 re_sub_match_last_t
*sub_last
,
155 int bkref_node
, int bkref_str
)
157 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
158 int subexp_idx
, int type
) internal_function
;
159 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
160 state_array_t
*path
, int top_node
,
161 int top_str
, int last_node
, int last_str
,
162 int type
) internal_function
;
163 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
165 re_node_set
*cur_nodes
,
166 re_node_set
*next_nodes
)
168 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
169 re_node_set
*cur_nodes
,
170 int ex_subexp
, int type
)
172 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
173 re_node_set
*dst_nodes
,
174 int target
, int ex_subexp
,
175 int type
) internal_function
;
176 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
177 re_node_set
*cur_nodes
, int cur_str
,
178 int subexp_num
, int type
)
180 static int build_trtable (const re_dfa_t
*dfa
,
181 re_dfastate_t
*state
) internal_function
;
182 #ifdef RE_ENABLE_I18N
183 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
184 const re_string_t
*input
, int idx
)
187 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
191 #endif /* RE_ENABLE_I18N */
192 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
193 const re_dfastate_t
*state
,
194 re_node_set
*states_node
,
195 bitset_t
*states_ch
) internal_function
;
196 static int check_node_accept (const re_match_context_t
*mctx
,
197 const re_token_t
*node
, int idx
)
199 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
202 /* Entry point for POSIX code. */
204 /* regexec searches for a given pattern, specified by PREG, in the
207 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
208 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
209 least NMATCH elements, and we set them to the offsets of the
210 corresponding matched substrings.
212 EFLAGS specifies `execution flags' which affect matching: if
213 REG_NOTBOL is set, then ^ does not match at the beginning of the
214 string; if REG_NOTEOL is set, then $ does not match at the end.
216 We return 0 if we find a match and REG_NOMATCH if not. */
220 const regex_t
*__restrict preg
,
221 const char *__restrict string
,
229 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
232 if (eflags
& REG_STARTEND
)
234 start
= pmatch
[0].rm_so
;
235 length
= pmatch
[0].rm_eo
;
240 length
= strlen (string
);
243 __libc_lock_lock (dfa
->lock
);
245 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
246 length
, 0, NULL
, eflags
);
248 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
249 length
, nmatch
, pmatch
, eflags
);
250 __libc_lock_unlock (dfa
->lock
);
251 return err
!= REG_NOERROR
;
255 # include <shlib-compat.h>
256 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec
) __compat_regexec
;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t
*__restrict preg
,
264 const char *__restrict string
, size_t nmatch
,
265 regmatch_t pmatch
[], int eflags
)
267 return regexec (preg
, string
, nmatch
, pmatch
,
268 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
270 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
304 re_match (struct re_pattern_buffer
*bufp
,
308 struct re_registers
*regs
)
310 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
313 weak_alias (__re_match
, re_match
)
317 re_search (struct re_pattern_buffer
*bufp
,
319 int length
, int start
, int range
,
320 struct re_registers
*regs
)
322 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
325 weak_alias (__re_search
, re_search
)
329 re_match_2 (struct re_pattern_buffer
*bufp
,
330 const char *string1
, int length1
,
331 const char *string2
, int length2
, int start
,
332 struct re_registers
*regs
, int stop
)
334 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
335 start
, 0, regs
, stop
, 1);
338 weak_alias (__re_match_2
, re_match_2
)
342 re_search_2 (struct re_pattern_buffer
*bufp
,
343 const char *string1
, int length1
,
344 const char *string2
, int length2
, int start
,
345 int range
, struct re_registers
*regs
, int stop
)
347 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
348 start
, range
, regs
, stop
, 0);
351 weak_alias (__re_search_2
, re_search_2
)
355 re_search_2_stub (struct re_pattern_buffer
*bufp
,
356 const char *string1
, int length1
,
357 const char *string2
, int length2
, int start
,
358 int range
, struct re_registers
*regs
,
359 int stop
, int ret_len
)
363 int len
= length1
+ length2
;
366 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
369 /* Concatenate the strings. */
373 char *s
= re_malloc (char, len
);
375 if (BE (s
== NULL
, 0))
377 memcpy (s
, string1
, length1
);
378 memcpy (s
+ length1
, string2
, length2
);
387 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
389 re_free ((char *) str
);
393 /* The parameters have the same meaning as those of re_search.
394 Additional parameters:
395 If RET_LEN is nonzero the length of the match is returned (re_match style);
396 otherwise the position of the match is returned. */
399 re_search_stub (struct re_pattern_buffer
*bufp
,
400 const char *string
, int length
, int start
,
402 struct re_registers
*regs
, int ret_len
)
404 reg_errcode_t result
;
409 /* Check for out-of-range. */
410 if (BE (start
< 0 || start
> length
, 0))
412 if (BE (start
+ range
> length
, 0))
413 range
= length
- start
;
414 else if (BE (start
+ range
< 0, 0))
417 __libc_lock_lock (dfa
->lock
);
419 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
420 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
422 /* Compile fastmap if we haven't yet. */
423 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
424 re_compile_fastmap (bufp
);
426 if (BE (bufp
->no_sub
, 0))
429 /* We need at least 1 register. */
432 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
433 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
435 nregs
= regs
->num_regs
;
436 if (BE (nregs
< 1, 0))
438 /* Nothing can be copied to regs. */
444 nregs
= bufp
->re_nsub
+ 1;
445 pmatch
= re_malloc (regmatch_t
, nregs
);
446 if (BE (pmatch
== NULL
, 0))
452 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
453 nregs
, pmatch
, eflags
);
457 /* I hope we needn't fill their regs with -1's when no match was found. */
458 if (result
!= REG_NOERROR
)
460 else if (regs
!= NULL
)
462 /* If caller wants register contents data back, copy them. */
463 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
464 bufp
->regs_allocated
);
465 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
469 if (BE (rval
== 0, 1))
473 assert (pmatch
[0].rm_so
== start
);
474 rval
= pmatch
[0].rm_eo
- start
;
477 rval
= pmatch
[0].rm_so
;
481 __libc_lock_unlock (dfa
->lock
);
486 re_copy_regs (struct re_registers
*regs
,
488 int nregs
, int regs_allocated
)
490 int rval
= REGS_REALLOCATE
;
492 int need_regs
= nregs
+ 1;
493 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
496 /* Have the register data arrays been allocated? */
497 if (regs_allocated
== REGS_UNALLOCATED
)
498 { /* No. So allocate them with malloc. */
499 regs
->start
= re_malloc (regoff_t
, need_regs
);
500 if (BE (regs
->start
== NULL
, 0))
501 return REGS_UNALLOCATED
;
502 regs
->end
= re_malloc (regoff_t
, need_regs
);
503 if (BE (regs
->end
== NULL
, 0))
505 re_free (regs
->start
);
506 return REGS_UNALLOCATED
;
508 regs
->num_regs
= need_regs
;
510 else if (regs_allocated
== REGS_REALLOCATE
)
511 { /* Yes. If we need more elements than were already
512 allocated, reallocate them. If we need fewer, just
514 if (BE (need_regs
> regs
->num_regs
, 0))
516 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
518 if (BE (new_start
== NULL
, 0))
519 return REGS_UNALLOCATED
;
520 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
521 if (BE (new_end
== NULL
, 0))
524 return REGS_UNALLOCATED
;
526 regs
->start
= new_start
;
528 regs
->num_regs
= need_regs
;
533 assert (regs_allocated
== REGS_FIXED
);
534 /* This function may not be called with REGS_FIXED and nregs too big. */
535 assert (regs
->num_regs
>= nregs
);
540 for (i
= 0; i
< nregs
; ++i
)
542 regs
->start
[i
] = pmatch
[i
].rm_so
;
543 regs
->end
[i
] = pmatch
[i
].rm_eo
;
545 for ( ; i
< regs
->num_regs
; ++i
)
546 regs
->start
[i
] = regs
->end
[i
] = -1;
551 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
552 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
553 this memory for recording register information. STARTS and ENDS
554 must be allocated using the malloc library routine, and must each
555 be at least NUM_REGS * sizeof (regoff_t) bytes long.
557 If NUM_REGS == 0, then subsequent matches should allocate their own
560 Unless this function is called, the first search or match using
561 PATTERN_BUFFER will allocate its own register data, without
562 freeing the old data. */
565 re_set_registers (struct re_pattern_buffer
*bufp
,
566 struct re_registers
*regs
,
573 bufp
->regs_allocated
= REGS_REALLOCATE
;
574 regs
->num_regs
= num_regs
;
575 regs
->start
= starts
;
580 bufp
->regs_allocated
= REGS_UNALLOCATED
;
582 regs
->start
= regs
->end
= (regoff_t
*) 0;
586 weak_alias (__re_set_registers
, re_set_registers
)
589 /* Entry points compatible with 4.2 BSD regex library. We don't define
590 them unless specifically requested. */
592 #if defined _REGEX_RE_COMP || defined _LIBC
600 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
602 #endif /* _REGEX_RE_COMP */
604 /* Internal entry point. */
606 /* Searches for a compiled pattern PREG in the string STRING, whose
607 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
608 mingings with regexec. START, and RANGE have the same meanings
610 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
611 otherwise return the error code.
612 Note: We assume front end functions already check ranges.
613 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
616 re_search_internal (const regex_t
*preg
,
618 int length
, int start
, int range
, int stop
,
619 size_t nmatch
, regmatch_t pmatch
[],
623 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
624 int left_lim
, right_lim
, incr
;
625 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
628 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
629 re_match_context_t mctx
= { .dfa
= dfa
};
631 re_match_context_t mctx
;
633 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
634 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
635 RE_TRANSLATE_TYPE t
= preg
->translate
;
637 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638 memset (&mctx
, '\0', sizeof (re_match_context_t
));
642 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
643 nmatch
-= extra_nmatch
;
645 /* Check if the DFA haven't been compiled. */
646 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
647 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
648 || dfa
->init_state_begbuf
== NULL
, 0))
652 /* We assume front-end functions already check them. */
653 assert (start
+ range
>= 0 && start
+ range
<= length
);
656 /* If initial states with non-begbuf contexts have no elements,
657 the regex must be anchored. If preg->newline_anchor is set,
658 we'll never use init_state_nl, so do not check it. */
659 if (dfa
->init_state
->nodes
.nelem
== 0
660 && dfa
->init_state_word
->nodes
.nelem
== 0
661 && (dfa
->init_state_nl
->nodes
.nelem
== 0
662 || !preg
->newline_anchor
))
664 if (start
!= 0 && start
+ range
!= 0)
669 /* We must check the longest matching, if nmatch > 0. */
670 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
672 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
673 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
674 if (BE (err
!= REG_NOERROR
, 0))
676 mctx
.input
.stop
= stop
;
677 mctx
.input
.raw_stop
= stop
;
678 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
680 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
681 if (BE (err
!= REG_NOERROR
, 0))
684 /* We will log all the DFA states through which the dfa pass,
685 if nmatch > 1, or this dfa has "multibyte node", which is a
686 back-reference or a node which can accept multibyte character or
687 multi character collating element. */
688 if (nmatch
> 1 || dfa
->has_mb_node
)
690 /* Avoid overflow. */
691 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
697 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
698 if (BE (mctx
.state_log
== NULL
, 0))
705 mctx
.state_log
= NULL
;
708 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
709 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
711 /* Check incrementally whether of not the input string match. */
712 incr
= (range
< 0) ? -1 : 1;
713 left_lim
= (range
< 0) ? start
+ range
: start
;
714 right_lim
= (range
< 0) ? start
: start
+ range
;
715 sb
= dfa
->mb_cur_max
== 1;
718 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
719 | (range
>= 0 ? 2 : 0)
720 | (t
!= NULL
? 1 : 0))
723 for (;; match_first
+= incr
)
726 if (match_first
< left_lim
|| right_lim
< match_first
)
729 /* Advance as rapidly as possible through the string, until we
730 find a plausible place to start matching. This may be done
731 with varying efficiency, so there are various possibilities:
732 only the most common of them are specialized, in order to
733 save on code size. We use a switch statement for speed. */
741 /* Fastmap with single-byte translation, match forward. */
742 while (BE (match_first
< right_lim
, 1)
743 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
745 goto forward_match_found_start_or_reached_end
;
748 /* Fastmap without translation, match forward. */
749 while (BE (match_first
< right_lim
, 1)
750 && !fastmap
[(unsigned char) string
[match_first
]])
753 forward_match_found_start_or_reached_end
:
754 if (BE (match_first
== right_lim
, 0))
756 ch
= match_first
>= length
757 ? 0 : (unsigned char) string
[match_first
];
758 if (!fastmap
[t
? t
[ch
] : ch
])
765 /* Fastmap without multi-byte translation, match backwards. */
766 while (match_first
>= left_lim
)
768 ch
= match_first
>= length
769 ? 0 : (unsigned char) string
[match_first
];
770 if (fastmap
[t
? t
[ch
] : ch
])
774 if (match_first
< left_lim
)
779 /* In this case, we can't determine easily the current byte,
780 since it might be a component byte of a multibyte
781 character. Then we use the constructed buffer instead. */
784 /* If MATCH_FIRST is out of the valid range, reconstruct the
786 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
787 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
789 err
= re_string_reconstruct (&mctx
.input
, match_first
,
791 if (BE (err
!= REG_NOERROR
, 0))
794 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 Note that MATCH_FIRST must not be smaller than 0. */
798 ch
= (match_first
>= length
799 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
803 if (match_first
< left_lim
|| match_first
> right_lim
)
812 /* Reconstruct the buffers so that the matcher can assume that
813 the matching starts from the beginning of the buffer. */
814 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
815 if (BE (err
!= REG_NOERROR
, 0))
818 #ifdef RE_ENABLE_I18N
819 /* Don't consider this char as a possible match start if it part,
820 yet isn't the head, of a multibyte character. */
821 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
825 /* It seems to be appropriate one, then use the matcher. */
826 /* We assume that the matching starts from 0. */
827 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
828 match_last
= check_matching (&mctx
, fl_longest_match
,
829 range
>= 0 ? &match_first
: NULL
);
830 if (match_last
!= -1)
832 if (BE (match_last
== -2, 0))
839 mctx
.match_last
= match_last
;
840 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
842 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
843 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
846 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
849 err
= prune_impossible_nodes (&mctx
);
850 if (err
== REG_NOERROR
)
852 if (BE (err
!= REG_NOMATCH
, 0))
857 break; /* We found a match. */
861 match_ctx_clean (&mctx
);
865 assert (match_last
!= -1);
866 assert (err
== REG_NOERROR
);
869 /* Set pmatch[] if we need. */
874 /* Initialize registers. */
875 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
876 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
878 /* Set the points where matching start/end. */
880 pmatch
[0].rm_eo
= mctx
.match_last
;
882 if (!preg
->no_sub
&& nmatch
> 1)
884 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
885 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
886 if (BE (err
!= REG_NOERROR
, 0))
890 /* At last, add the offset to the each registers, since we slided
891 the buffers so that we could assume that the matching starts
893 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
894 if (pmatch
[reg_idx
].rm_so
!= -1)
896 #ifdef RE_ENABLE_I18N
897 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
899 pmatch
[reg_idx
].rm_so
=
900 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
901 ? mctx
.input
.valid_raw_len
902 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
903 pmatch
[reg_idx
].rm_eo
=
904 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
905 ? mctx
.input
.valid_raw_len
906 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
909 assert (mctx
.input
.offsets_needed
== 0);
911 pmatch
[reg_idx
].rm_so
+= match_first
;
912 pmatch
[reg_idx
].rm_eo
+= match_first
;
914 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
916 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
917 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
921 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
922 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
924 pmatch
[reg_idx
+ 1].rm_so
925 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
926 pmatch
[reg_idx
+ 1].rm_eo
927 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
932 re_free (mctx
.state_log
);
934 match_ctx_free (&mctx
);
935 re_string_destruct (&mctx
.input
);
940 prune_impossible_nodes (re_match_context_t
*mctx
)
942 const re_dfa_t
*const dfa
= mctx
->dfa
;
943 int halt_node
, match_last
;
945 re_dfastate_t
**sifted_states
;
946 re_dfastate_t
**lim_states
= NULL
;
947 re_sift_context_t sctx
;
949 assert (mctx
->state_log
!= NULL
);
951 match_last
= mctx
->match_last
;
952 halt_node
= mctx
->last_node
;
954 /* Avoid overflow. */
955 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
958 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
959 if (BE (sifted_states
== NULL
, 0))
966 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
967 if (BE (lim_states
== NULL
, 0))
974 memset (lim_states
, '\0',
975 sizeof (re_dfastate_t
*) * (match_last
+ 1));
976 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
978 ret
= sift_states_backward (mctx
, &sctx
);
979 re_node_set_free (&sctx
.limits
);
980 if (BE (ret
!= REG_NOERROR
, 0))
982 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
992 } while (mctx
->state_log
[match_last
] == NULL
993 || !mctx
->state_log
[match_last
]->halt
);
994 halt_node
= check_halt_state_context (mctx
,
995 mctx
->state_log
[match_last
],
998 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1000 re_free (lim_states
);
1002 if (BE (ret
!= REG_NOERROR
, 0))
1007 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1008 ret
= sift_states_backward (mctx
, &sctx
);
1009 re_node_set_free (&sctx
.limits
);
1010 if (BE (ret
!= REG_NOERROR
, 0))
1012 if (sifted_states
[0] == NULL
)
1018 re_free (mctx
->state_log
);
1019 mctx
->state_log
= sifted_states
;
1020 sifted_states
= NULL
;
1021 mctx
->last_node
= halt_node
;
1022 mctx
->match_last
= match_last
;
1025 re_free (sifted_states
);
1026 re_free (lim_states
);
1030 /* Acquire an initial state and return it.
1031 We must select appropriate initial state depending on the context,
1032 since initial states may have constraints like "\<", "^", etc.. */
1034 static inline re_dfastate_t
*
1035 __attribute ((always_inline
)) internal_function
1036 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1039 const re_dfa_t
*const dfa
= mctx
->dfa
;
1040 if (dfa
->init_state
->has_constraint
)
1042 unsigned int context
;
1043 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1044 if (IS_WORD_CONTEXT (context
))
1045 return dfa
->init_state_word
;
1046 else if (IS_ORDINARY_CONTEXT (context
))
1047 return dfa
->init_state
;
1048 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1049 return dfa
->init_state_begbuf
;
1050 else if (IS_NEWLINE_CONTEXT (context
))
1051 return dfa
->init_state_nl
;
1052 else if (IS_BEGBUF_CONTEXT (context
))
1054 /* It is relatively rare case, then calculate on demand. */
1055 return re_acquire_state_context (err
, dfa
,
1056 dfa
->init_state
->entrance_nodes
,
1060 /* Must not happen? */
1061 return dfa
->init_state
;
1064 return dfa
->init_state
;
1067 /* Check whether the regular expression match input string INPUT or not,
1068 and return the index where the matching end, return -1 if not match,
1069 or return -2 in case of an error.
1070 FL_LONGEST_MATCH means we want the POSIX longest matching.
1071 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1072 next place where we may want to try matching.
1073 Note that the matcher assume that the matching starts from the current
1074 index of the buffer. */
1078 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1081 const re_dfa_t
*const dfa
= mctx
->dfa
;
1084 int match_last
= -1;
1085 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1086 re_dfastate_t
*cur_state
;
1087 int at_init_state
= p_match_first
!= NULL
;
1088 int next_start_idx
= cur_str_idx
;
1091 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1092 /* An initial state must not be NULL (invalid). */
1093 if (BE (cur_state
== NULL
, 0))
1095 assert (err
== REG_ESPACE
);
1099 if (mctx
->state_log
!= NULL
)
1101 mctx
->state_log
[cur_str_idx
] = cur_state
;
1103 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1104 later. E.g. Processing back references. */
1105 if (BE (dfa
->nbackref
, 0))
1108 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1109 if (BE (err
!= REG_NOERROR
, 0))
1112 if (cur_state
->has_backref
)
1114 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1115 if (BE (err
!= REG_NOERROR
, 0))
1121 /* If the RE accepts NULL string. */
1122 if (BE (cur_state
->halt
, 0))
1124 if (!cur_state
->has_constraint
1125 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1127 if (!fl_longest_match
)
1131 match_last
= cur_str_idx
;
1137 while (!re_string_eoi (&mctx
->input
))
1139 re_dfastate_t
*old_state
= cur_state
;
1140 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1142 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1143 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1144 && mctx
->input
.valid_len
< mctx
->input
.len
))
1146 err
= extend_buffers (mctx
);
1147 if (BE (err
!= REG_NOERROR
, 0))
1149 assert (err
== REG_ESPACE
);
1154 cur_state
= transit_state (&err
, mctx
, cur_state
);
1155 if (mctx
->state_log
!= NULL
)
1156 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1158 if (cur_state
== NULL
)
1160 /* Reached the invalid state or an error. Try to recover a valid
1161 state using the state log, if available and if we have not
1162 already found a valid (even if not the longest) match. */
1163 if (BE (err
!= REG_NOERROR
, 0))
1166 if (mctx
->state_log
== NULL
1167 || (match
&& !fl_longest_match
)
1168 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1172 if (BE (at_init_state
, 0))
1174 if (old_state
== cur_state
)
1175 next_start_idx
= next_char_idx
;
1180 if (cur_state
->halt
)
1182 /* Reached a halt state.
1183 Check the halt state can satisfy the current context. */
1184 if (!cur_state
->has_constraint
1185 || check_halt_state_context (mctx
, cur_state
,
1186 re_string_cur_idx (&mctx
->input
)))
1188 /* We found an appropriate halt state. */
1189 match_last
= re_string_cur_idx (&mctx
->input
);
1192 /* We found a match, do not modify match_first below. */
1193 p_match_first
= NULL
;
1194 if (!fl_longest_match
)
1201 *p_match_first
+= next_start_idx
;
1206 /* Check NODE match the current context. */
1210 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1212 re_token_type_t type
= dfa
->nodes
[node
].type
;
1213 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1214 if (type
!= END_OF_RE
)
1218 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1223 /* Check the halt state STATE match the current context.
1224 Return 0 if not match, if the node, STATE has, is a halt node and
1225 match the context, return the node. */
1229 check_halt_state_context (const re_match_context_t
*mctx
,
1230 const re_dfastate_t
*state
, int idx
)
1233 unsigned int context
;
1235 assert (state
->halt
);
1237 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1238 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1239 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1240 return state
->nodes
.elems
[i
];
1244 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1245 corresponding to the DFA).
1246 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1251 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1252 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1253 struct re_fail_stack_t
*fs
)
1255 const re_dfa_t
*const dfa
= mctx
->dfa
;
1257 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1259 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1260 re_node_set
*edests
= &dfa
->edests
[node
];
1262 err
= re_node_set_insert (eps_via_nodes
, node
);
1263 if (BE (err
< 0, 0))
1265 /* Pick up a valid destination, or return -1 if none is found. */
1266 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1268 int candidate
= edests
->elems
[i
];
1269 if (!re_node_set_contains (cur_nodes
, candidate
))
1271 if (dest_node
== -1)
1272 dest_node
= candidate
;
1276 /* In order to avoid infinite loop like "(a*)*", return the second
1277 epsilon-transition if the first was already considered. */
1278 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1281 /* Otherwise, push the second epsilon-transition on the fail stack. */
1283 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1287 /* We know we are going to exit. */
1296 re_token_type_t type
= dfa
->nodes
[node
].type
;
1298 #ifdef RE_ENABLE_I18N
1299 if (dfa
->nodes
[node
].accept_mb
)
1300 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1302 #endif /* RE_ENABLE_I18N */
1303 if (type
== OP_BACK_REF
)
1305 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1306 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1309 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1313 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1314 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1323 err
= re_node_set_insert (eps_via_nodes
, node
);
1324 if (BE (err
< 0, 0))
1326 dest_node
= dfa
->edests
[node
].elems
[0];
1327 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1334 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1336 int dest_node
= dfa
->nexts
[node
];
1337 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1338 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1339 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1342 re_node_set_empty (eps_via_nodes
);
1349 static reg_errcode_t
1351 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1352 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1355 int num
= fs
->num
++;
1356 if (fs
->num
== fs
->alloc
)
1358 struct re_fail_stack_ent_t
*new_array
;
1359 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1361 if (new_array
== NULL
)
1364 fs
->stack
= new_array
;
1366 fs
->stack
[num
].idx
= str_idx
;
1367 fs
->stack
[num
].node
= dest_node
;
1368 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1369 if (fs
->stack
[num
].regs
== NULL
)
1371 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1372 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1378 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1379 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1381 int num
= --fs
->num
;
1383 *pidx
= fs
->stack
[num
].idx
;
1384 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1385 re_node_set_free (eps_via_nodes
);
1386 re_free (fs
->stack
[num
].regs
);
1387 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1388 return fs
->stack
[num
].node
;
1391 /* Set the positions where the subexpressions are starts/ends to registers
1393 Note: We assume that pmatch[0] is already set, and
1394 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1396 static reg_errcode_t
1398 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1399 regmatch_t
*pmatch
, int fl_backtrack
)
1401 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1403 re_node_set eps_via_nodes
;
1404 struct re_fail_stack_t
*fs
;
1405 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1406 regmatch_t
*prev_idx_match
;
1407 int prev_idx_match_malloced
= 0;
1410 assert (nmatch
> 1);
1411 assert (mctx
->state_log
!= NULL
);
1416 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1417 if (fs
->stack
== NULL
)
1423 cur_node
= dfa
->init_node
;
1424 re_node_set_init_empty (&eps_via_nodes
);
1427 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1428 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1432 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1433 if (prev_idx_match
== NULL
)
1435 free_fail_stack_return (fs
);
1438 prev_idx_match_malloced
= 1;
1440 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1442 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1444 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1446 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1451 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1452 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1454 if (reg_idx
== nmatch
)
1456 re_node_set_free (&eps_via_nodes
);
1457 if (prev_idx_match_malloced
)
1458 re_free (prev_idx_match
);
1459 return free_fail_stack_return (fs
);
1461 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1466 re_node_set_free (&eps_via_nodes
);
1467 if (prev_idx_match_malloced
)
1468 re_free (prev_idx_match
);
1473 /* Proceed to next node. */
1474 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1475 &eps_via_nodes
, fs
);
1477 if (BE (cur_node
< 0, 0))
1479 if (BE (cur_node
== -2, 0))
1481 re_node_set_free (&eps_via_nodes
);
1482 if (prev_idx_match_malloced
)
1483 re_free (prev_idx_match
);
1484 free_fail_stack_return (fs
);
1488 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1492 re_node_set_free (&eps_via_nodes
);
1493 if (prev_idx_match_malloced
)
1494 re_free (prev_idx_match
);
1499 re_node_set_free (&eps_via_nodes
);
1500 if (prev_idx_match_malloced
)
1501 re_free (prev_idx_match
);
1502 return free_fail_stack_return (fs
);
1505 static reg_errcode_t
1507 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1512 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1514 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1515 re_free (fs
->stack
[fs_idx
].regs
);
1517 re_free (fs
->stack
);
1524 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1525 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1527 int type
= dfa
->nodes
[cur_node
].type
;
1528 if (type
== OP_OPEN_SUBEXP
)
1530 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1532 /* We are at the first node of this sub expression. */
1533 if (reg_num
< nmatch
)
1535 pmatch
[reg_num
].rm_so
= cur_idx
;
1536 pmatch
[reg_num
].rm_eo
= -1;
1539 else if (type
== OP_CLOSE_SUBEXP
)
1541 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1542 if (reg_num
< nmatch
)
1544 /* We are at the last node of this sub expression. */
1545 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1547 pmatch
[reg_num
].rm_eo
= cur_idx
;
1548 /* This is a non-empty match or we are not inside an optional
1549 subexpression. Accept this right away. */
1550 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1554 if (dfa
->nodes
[cur_node
].opt_subexp
1555 && prev_idx_match
[reg_num
].rm_so
!= -1)
1556 /* We transited through an empty match for an optional
1557 subexpression, like (a?)*, and this is not the subexp's
1558 first match. Copy back the old content of the registers
1559 so that matches of an inner subexpression are undone as
1560 well, like in ((a?))*. */
1561 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1563 /* We completed a subexpression, but it may be part of
1564 an optional one, so do not update PREV_IDX_MATCH. */
1565 pmatch
[reg_num
].rm_eo
= cur_idx
;
1571 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1572 and sift the nodes in each states according to the following rules.
1573 Updated state_log will be wrote to STATE_LOG.
1575 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1576 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1577 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1578 the LAST_NODE, we throw away the node `a'.
1579 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1580 string `s' and transit to `b':
1581 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1583 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1584 thrown away, we throw away the node `a'.
1585 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1586 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1588 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1589 we throw away the node `a'. */
1591 #define STATE_NODE_CONTAINS(state,node) \
1592 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1594 static reg_errcode_t
1596 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1600 int str_idx
= sctx
->last_str_idx
;
1601 re_node_set cur_dest
;
1604 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1607 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1608 transit to the last_node and the last_node itself. */
1609 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1610 if (BE (err
!= REG_NOERROR
, 0))
1612 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1613 if (BE (err
!= REG_NOERROR
, 0))
1616 /* Then check each states in the state_log. */
1619 /* Update counters. */
1620 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1621 if (null_cnt
> mctx
->max_mb_elem_len
)
1623 memset (sctx
->sifted_states
, '\0',
1624 sizeof (re_dfastate_t
*) * str_idx
);
1625 re_node_set_free (&cur_dest
);
1628 re_node_set_empty (&cur_dest
);
1631 if (mctx
->state_log
[str_idx
])
1633 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1634 if (BE (err
!= REG_NOERROR
, 0))
1638 /* Add all the nodes which satisfy the following conditions:
1639 - It can epsilon transit to a node in CUR_DEST.
1641 And update state_log. */
1642 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1643 if (BE (err
!= REG_NOERROR
, 0))
1648 re_node_set_free (&cur_dest
);
1652 static reg_errcode_t
1654 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1655 int str_idx
, re_node_set
*cur_dest
)
1657 const re_dfa_t
*const dfa
= mctx
->dfa
;
1658 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1661 /* Then build the next sifted state.
1662 We build the next sifted state on `cur_dest', and update
1663 `sifted_states[str_idx]' with `cur_dest'.
1665 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1666 `cur_src' points the node_set of the old `state_log[str_idx]'
1667 (with the epsilon nodes pre-filtered out). */
1668 for (i
= 0; i
< cur_src
->nelem
; i
++)
1670 int prev_node
= cur_src
->elems
[i
];
1675 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1676 assert (!IS_EPSILON_NODE (type
));
1678 #ifdef RE_ENABLE_I18N
1679 /* If the node may accept `multi byte'. */
1680 if (dfa
->nodes
[prev_node
].accept_mb
)
1681 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1682 str_idx
, sctx
->last_str_idx
);
1683 #endif /* RE_ENABLE_I18N */
1685 /* We don't check backreferences here.
1686 See update_cur_sifted_state(). */
1688 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1689 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1690 dfa
->nexts
[prev_node
]))
1696 if (sctx
->limits
.nelem
)
1698 int to_idx
= str_idx
+ naccepted
;
1699 if (check_dst_limits (mctx
, &sctx
->limits
,
1700 dfa
->nexts
[prev_node
], to_idx
,
1701 prev_node
, str_idx
))
1704 ret
= re_node_set_insert (cur_dest
, prev_node
);
1705 if (BE (ret
== -1, 0))
1712 /* Helper functions. */
1714 static reg_errcode_t
1716 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1718 int top
= mctx
->state_log_top
;
1720 if (next_state_log_idx
>= mctx
->input
.bufs_len
1721 || (next_state_log_idx
>= mctx
->input
.valid_len
1722 && mctx
->input
.valid_len
< mctx
->input
.len
))
1725 err
= extend_buffers (mctx
);
1726 if (BE (err
!= REG_NOERROR
, 0))
1730 if (top
< next_state_log_idx
)
1732 memset (mctx
->state_log
+ top
+ 1, '\0',
1733 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1734 mctx
->state_log_top
= next_state_log_idx
;
1739 static reg_errcode_t
1741 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1742 re_dfastate_t
**src
, int num
)
1746 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1748 if (dst
[st_idx
] == NULL
)
1749 dst
[st_idx
] = src
[st_idx
];
1750 else if (src
[st_idx
] != NULL
)
1752 re_node_set merged_set
;
1753 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1754 &src
[st_idx
]->nodes
);
1755 if (BE (err
!= REG_NOERROR
, 0))
1757 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1758 re_node_set_free (&merged_set
);
1759 if (BE (err
!= REG_NOERROR
, 0))
1766 static reg_errcode_t
1768 update_cur_sifted_state (const re_match_context_t
*mctx
,
1769 re_sift_context_t
*sctx
, int str_idx
,
1770 re_node_set
*dest_nodes
)
1772 const re_dfa_t
*const dfa
= mctx
->dfa
;
1773 reg_errcode_t err
= REG_NOERROR
;
1774 const re_node_set
*candidates
;
1775 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1776 : &mctx
->state_log
[str_idx
]->nodes
);
1778 if (dest_nodes
->nelem
== 0)
1779 sctx
->sifted_states
[str_idx
] = NULL
;
1784 /* At first, add the nodes which can epsilon transit to a node in
1786 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1787 if (BE (err
!= REG_NOERROR
, 0))
1790 /* Then, check the limitations in the current sift_context. */
1791 if (sctx
->limits
.nelem
)
1793 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1794 mctx
->bkref_ents
, str_idx
);
1795 if (BE (err
!= REG_NOERROR
, 0))
1800 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1801 if (BE (err
!= REG_NOERROR
, 0))
1805 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1807 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1808 if (BE (err
!= REG_NOERROR
, 0))
1814 static reg_errcode_t
1816 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1817 const re_node_set
*candidates
)
1819 reg_errcode_t err
= REG_NOERROR
;
1822 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1823 if (BE (err
!= REG_NOERROR
, 0))
1826 if (!state
->inveclosure
.alloc
)
1828 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1829 if (BE (err
!= REG_NOERROR
, 0))
1831 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1833 err
= re_node_set_merge (&state
->inveclosure
,
1834 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1835 if (BE (err
!= REG_NOERROR
, 0))
1839 return re_node_set_add_intersect (dest_nodes
, candidates
,
1840 &state
->inveclosure
);
1843 static reg_errcode_t
1845 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1846 const re_node_set
*candidates
)
1850 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1851 re_node_set except_nodes
;
1852 re_node_set_init_empty (&except_nodes
);
1853 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1855 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1856 if (cur_node
== node
)
1858 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1860 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1861 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1862 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1863 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1864 && re_node_set_contains (dest_nodes
, edst1
))
1866 && !re_node_set_contains (inv_eclosure
, edst2
)
1867 && re_node_set_contains (dest_nodes
, edst2
)))
1869 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1870 dfa
->inveclosures
+ cur_node
);
1871 if (BE (err
!= REG_NOERROR
, 0))
1873 re_node_set_free (&except_nodes
);
1879 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1881 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1882 if (!re_node_set_contains (&except_nodes
, cur_node
))
1884 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1885 re_node_set_remove_at (dest_nodes
, idx
);
1888 re_node_set_free (&except_nodes
);
1894 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1895 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1897 const re_dfa_t
*const dfa
= mctx
->dfa
;
1898 int lim_idx
, src_pos
, dst_pos
;
1900 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1901 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1902 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1905 struct re_backref_cache_entry
*ent
;
1906 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1907 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1909 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1910 subexp_idx
, dst_node
, dst_idx
,
1912 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1913 subexp_idx
, src_node
, src_idx
,
1917 <src> <dst> ( <subexp> )
1918 ( <subexp> ) <src> <dst>
1919 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1920 if (src_pos
== dst_pos
)
1921 continue; /* This is unrelated limitation. */
1930 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1931 int subexp_idx
, int from_node
, int bkref_idx
)
1933 const re_dfa_t
*const dfa
= mctx
->dfa
;
1934 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1937 /* Else, we are on the boundary: examine the nodes on the epsilon
1939 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1941 int node
= eclosures
->elems
[node_idx
];
1942 switch (dfa
->nodes
[node
].type
)
1945 if (bkref_idx
!= -1)
1947 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1952 if (ent
->node
!= node
)
1955 if (subexp_idx
< BITSET_WORD_BITS
1956 && !(ent
->eps_reachable_subexps_map
1957 & ((bitset_word_t
) 1 << subexp_idx
)))
1960 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 OP_CLOSE_SUBEXP cases below. But, if the
1962 destination node is the same node as the source
1963 node, don't recurse because it would cause an
1964 infinite loop: a regex that exhibits this behavior
1966 dst
= dfa
->edests
[node
].elems
[0];
1967 if (dst
== from_node
)
1971 else /* if (boundaries & 2) */
1976 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1978 if (cpos
== -1 /* && (boundaries & 1) */)
1980 if (cpos
== 0 && (boundaries
& 2))
1983 if (subexp_idx
< BITSET_WORD_BITS
)
1984 ent
->eps_reachable_subexps_map
1985 &= ~((bitset_word_t
) 1 << subexp_idx
);
1987 while (ent
++->more
);
1991 case OP_OPEN_SUBEXP
:
1992 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1996 case OP_CLOSE_SUBEXP
:
1997 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2006 return (boundaries
& 2) ? 1 : 0;
2011 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2012 int subexp_idx
, int from_node
, int str_idx
,
2015 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2018 /* If we are outside the range of the subexpression, return -1 or 1. */
2019 if (str_idx
< lim
->subexp_from
)
2022 if (lim
->subexp_to
< str_idx
)
2025 /* If we are within the subexpression, return 0. */
2026 boundaries
= (str_idx
== lim
->subexp_from
);
2027 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2028 if (boundaries
== 0)
2031 /* Else, examine epsilon closure. */
2032 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2033 from_node
, bkref_idx
);
2036 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2037 which are against limitations from DEST_NODES. */
2039 static reg_errcode_t
2041 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2042 const re_node_set
*candidates
, re_node_set
*limits
,
2043 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2046 int node_idx
, lim_idx
;
2048 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2051 struct re_backref_cache_entry
*ent
;
2052 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2054 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2055 continue; /* This is unrelated limitation. */
2057 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2058 if (ent
->subexp_to
== str_idx
)
2062 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2064 int node
= dest_nodes
->elems
[node_idx
];
2065 re_token_type_t type
= dfa
->nodes
[node
].type
;
2066 if (type
== OP_OPEN_SUBEXP
2067 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2069 else if (type
== OP_CLOSE_SUBEXP
2070 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2074 /* Check the limitation of the open subexpression. */
2075 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2078 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2080 if (BE (err
!= REG_NOERROR
, 0))
2084 /* Check the limitation of the close subexpression. */
2086 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2088 int node
= dest_nodes
->elems
[node_idx
];
2089 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2091 && !re_node_set_contains (dfa
->eclosures
+ node
,
2094 /* It is against this limitation.
2095 Remove it form the current sifted state. */
2096 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2098 if (BE (err
!= REG_NOERROR
, 0))
2104 else /* (ent->subexp_to != str_idx) */
2106 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2108 int node
= dest_nodes
->elems
[node_idx
];
2109 re_token_type_t type
= dfa
->nodes
[node
].type
;
2110 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2112 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2114 /* It is against this limitation.
2115 Remove it form the current sifted state. */
2116 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2118 if (BE (err
!= REG_NOERROR
, 0))
2127 static reg_errcode_t
2129 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2130 int str_idx
, const re_node_set
*candidates
)
2132 const re_dfa_t
*const dfa
= mctx
->dfa
;
2135 re_sift_context_t local_sctx
;
2136 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2138 if (first_idx
== -1)
2141 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2143 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2146 re_token_type_t type
;
2147 struct re_backref_cache_entry
*entry
;
2148 node
= candidates
->elems
[node_idx
];
2149 type
= dfa
->nodes
[node
].type
;
2150 /* Avoid infinite loop for the REs like "()\1+". */
2151 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2153 if (type
!= OP_BACK_REF
)
2156 entry
= mctx
->bkref_ents
+ first_idx
;
2157 enabled_idx
= first_idx
;
2164 re_dfastate_t
*cur_state
;
2166 if (entry
->node
!= node
)
2168 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2169 to_idx
= str_idx
+ subexp_len
;
2170 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2171 : dfa
->edests
[node
].elems
[0]);
2173 if (to_idx
> sctx
->last_str_idx
2174 || sctx
->sifted_states
[to_idx
] == NULL
2175 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2176 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2177 str_idx
, dst_node
, to_idx
))
2180 if (local_sctx
.sifted_states
== NULL
)
2183 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2184 if (BE (err
!= REG_NOERROR
, 0))
2187 local_sctx
.last_node
= node
;
2188 local_sctx
.last_str_idx
= str_idx
;
2189 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2190 if (BE (ret
< 0, 0))
2195 cur_state
= local_sctx
.sifted_states
[str_idx
];
2196 err
= sift_states_backward (mctx
, &local_sctx
);
2197 if (BE (err
!= REG_NOERROR
, 0))
2199 if (sctx
->limited_states
!= NULL
)
2201 err
= merge_state_array (dfa
, sctx
->limited_states
,
2202 local_sctx
.sifted_states
,
2204 if (BE (err
!= REG_NOERROR
, 0))
2207 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2208 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2210 /* mctx->bkref_ents may have changed, reload the pointer. */
2211 entry
= mctx
->bkref_ents
+ enabled_idx
;
2213 while (enabled_idx
++, entry
++->more
);
2217 if (local_sctx
.sifted_states
!= NULL
)
2219 re_node_set_free (&local_sctx
.limits
);
2226 #ifdef RE_ENABLE_I18N
2229 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2230 int node_idx
, int str_idx
, int max_str_idx
)
2232 const re_dfa_t
*const dfa
= mctx
->dfa
;
2234 /* Check the node can accept `multi byte'. */
2235 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2236 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2237 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2238 dfa
->nexts
[node_idx
]))
2239 /* The node can't accept the `multi byte', or the
2240 destination was already thrown away, then the node
2241 couldn't accept the current input `multi byte'. */
2243 /* Otherwise, it is sure that the node could accept
2244 `naccepted' bytes input. */
2247 #endif /* RE_ENABLE_I18N */
2250 /* Functions for state transition. */
2252 /* Return the next state to which the current state STATE will transit by
2253 accepting the current input byte, and update STATE_LOG if necessary.
2254 If STATE can accept a multibyte char/collating element/back reference
2255 update the destination of STATE_LOG. */
2257 static re_dfastate_t
*
2259 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2260 re_dfastate_t
*state
)
2262 re_dfastate_t
**trtable
;
2265 #ifdef RE_ENABLE_I18N
2266 /* If the current state can accept multibyte. */
2267 if (BE (state
->accept_mb
, 0))
2269 *err
= transit_state_mb (mctx
, state
);
2270 if (BE (*err
!= REG_NOERROR
, 0))
2273 #endif /* RE_ENABLE_I18N */
2275 /* Then decide the next state with the single byte. */
2278 /* don't use transition table */
2279 return transit_state_sb (err
, mctx
, state
);
2282 /* Use transition table */
2283 ch
= re_string_fetch_byte (&mctx
->input
);
2286 trtable
= state
->trtable
;
2287 if (BE (trtable
!= NULL
, 1))
2290 trtable
= state
->word_trtable
;
2291 if (BE (trtable
!= NULL
, 1))
2293 unsigned int context
;
2295 = re_string_context_at (&mctx
->input
,
2296 re_string_cur_idx (&mctx
->input
) - 1,
2298 if (IS_WORD_CONTEXT (context
))
2299 return trtable
[ch
+ SBC_MAX
];
2304 if (!build_trtable (mctx
->dfa
, state
))
2310 /* Retry, we now have a transition table. */
2314 /* Update the state_log if we need */
2315 static re_dfastate_t
*
2317 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2318 re_dfastate_t
*next_state
)
2320 const re_dfa_t
*const dfa
= mctx
->dfa
;
2321 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2323 if (cur_idx
> mctx
->state_log_top
)
2325 mctx
->state_log
[cur_idx
] = next_state
;
2326 mctx
->state_log_top
= cur_idx
;
2328 else if (mctx
->state_log
[cur_idx
] == NULL
)
2330 mctx
->state_log
[cur_idx
] = next_state
;
2334 re_dfastate_t
*pstate
;
2335 unsigned int context
;
2336 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2337 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338 the destination of a multibyte char/collating element/
2339 back reference. Then the next state is the union set of
2340 these destinations and the results of the transition table. */
2341 pstate
= mctx
->state_log
[cur_idx
];
2342 log_nodes
= pstate
->entrance_nodes
;
2343 if (next_state
!= NULL
)
2345 table_nodes
= next_state
->entrance_nodes
;
2346 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2348 if (BE (*err
!= REG_NOERROR
, 0))
2352 next_nodes
= *log_nodes
;
2353 /* Note: We already add the nodes of the initial state,
2354 then we don't need to add them here. */
2356 context
= re_string_context_at (&mctx
->input
,
2357 re_string_cur_idx (&mctx
->input
) - 1,
2359 next_state
= mctx
->state_log
[cur_idx
]
2360 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2361 /* We don't need to check errors here, since the return value of
2362 this function is next_state and ERR is already set. */
2364 if (table_nodes
!= NULL
)
2365 re_node_set_free (&next_nodes
);
2368 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2370 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 later. We must check them here, since the back references in the
2372 next state might use them. */
2373 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2375 if (BE (*err
!= REG_NOERROR
, 0))
2378 /* If the next state has back references. */
2379 if (next_state
->has_backref
)
2381 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2382 if (BE (*err
!= REG_NOERROR
, 0))
2384 next_state
= mctx
->state_log
[cur_idx
];
2391 /* Skip bytes in the input that correspond to part of a
2392 multi-byte match, then look in the log for a state
2393 from which to restart matching. */
2394 static re_dfastate_t
*
2396 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2398 re_dfastate_t
*cur_state
;
2401 int max
= mctx
->state_log_top
;
2402 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2406 if (++cur_str_idx
> max
)
2408 re_string_skip_bytes (&mctx
->input
, 1);
2410 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2412 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2414 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2418 /* Helper functions for transit_state. */
2420 /* From the node set CUR_NODES, pick up the nodes whose types are
2421 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422 expression. And register them to use them later for evaluating the
2423 correspoding back references. */
2425 static reg_errcode_t
2427 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2430 const re_dfa_t
*const dfa
= mctx
->dfa
;
2434 /* TODO: This isn't efficient.
2435 Because there might be more than one nodes whose types are
2436 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2439 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2441 int node
= cur_nodes
->elems
[node_idx
];
2442 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2443 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2444 && (dfa
->used_bkref_map
2445 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2447 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2448 if (BE (err
!= REG_NOERROR
, 0))
2456 /* Return the next state to which the current state STATE will transit by
2457 accepting the current input byte. */
2459 static re_dfastate_t
*
2460 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2461 re_dfastate_t
*state
)
2463 const re_dfa_t
*const dfa
= mctx
->dfa
;
2464 re_node_set next_nodes
;
2465 re_dfastate_t
*next_state
;
2466 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2467 unsigned int context
;
2469 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2470 if (BE (*err
!= REG_NOERROR
, 0))
2472 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2474 int cur_node
= state
->nodes
.elems
[node_cnt
];
2475 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2477 *err
= re_node_set_merge (&next_nodes
,
2478 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2479 if (BE (*err
!= REG_NOERROR
, 0))
2481 re_node_set_free (&next_nodes
);
2486 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2487 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2488 /* We don't need to check errors here, since the return value of
2489 this function is next_state and ERR is already set. */
2491 re_node_set_free (&next_nodes
);
2492 re_string_skip_bytes (&mctx
->input
, 1);
2497 #ifdef RE_ENABLE_I18N
2498 static reg_errcode_t
2500 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2502 const re_dfa_t
*const dfa
= mctx
->dfa
;
2506 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2508 re_node_set dest_nodes
, *new_nodes
;
2509 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2510 int naccepted
, dest_idx
;
2511 unsigned int context
;
2512 re_dfastate_t
*dest_state
;
2514 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2517 if (dfa
->nodes
[cur_node_idx
].constraint
)
2519 context
= re_string_context_at (&mctx
->input
,
2520 re_string_cur_idx (&mctx
->input
),
2522 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2527 /* How many bytes the node can accept? */
2528 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2529 re_string_cur_idx (&mctx
->input
));
2533 /* The node can accepts `naccepted' bytes. */
2534 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2535 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2536 : mctx
->max_mb_elem_len
);
2537 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2538 if (BE (err
!= REG_NOERROR
, 0))
2541 assert (dfa
->nexts
[cur_node_idx
] != -1);
2543 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2545 dest_state
= mctx
->state_log
[dest_idx
];
2546 if (dest_state
== NULL
)
2547 dest_nodes
= *new_nodes
;
2550 err
= re_node_set_init_union (&dest_nodes
,
2551 dest_state
->entrance_nodes
, new_nodes
);
2552 if (BE (err
!= REG_NOERROR
, 0))
2555 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2557 mctx
->state_log
[dest_idx
]
2558 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2559 if (dest_state
!= NULL
)
2560 re_node_set_free (&dest_nodes
);
2561 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2566 #endif /* RE_ENABLE_I18N */
2568 static reg_errcode_t
2570 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2572 const re_dfa_t
*const dfa
= mctx
->dfa
;
2575 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2577 for (i
= 0; i
< nodes
->nelem
; ++i
)
2579 int dest_str_idx
, prev_nelem
, bkc_idx
;
2580 int node_idx
= nodes
->elems
[i
];
2581 unsigned int context
;
2582 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2583 re_node_set
*new_dest_nodes
;
2585 /* Check whether `node' is a backreference or not. */
2586 if (node
->type
!= OP_BACK_REF
)
2589 if (node
->constraint
)
2591 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2593 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2597 /* `node' is a backreference.
2598 Check the substring which the substring matched. */
2599 bkc_idx
= mctx
->nbkref_ents
;
2600 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2601 if (BE (err
!= REG_NOERROR
, 0))
2604 /* And add the epsilon closures (which is `new_dest_nodes') of
2605 the backreference to appropriate state_log. */
2607 assert (dfa
->nexts
[node_idx
] != -1);
2609 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2612 re_dfastate_t
*dest_state
;
2613 struct re_backref_cache_entry
*bkref_ent
;
2614 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2615 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2617 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2618 new_dest_nodes
= (subexp_len
== 0
2619 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2620 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2621 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2622 - bkref_ent
->subexp_from
);
2623 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2625 dest_state
= mctx
->state_log
[dest_str_idx
];
2626 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2627 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2628 /* Add `new_dest_node' to state_log. */
2629 if (dest_state
== NULL
)
2631 mctx
->state_log
[dest_str_idx
]
2632 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2634 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2635 && err
!= REG_NOERROR
, 0))
2640 re_node_set dest_nodes
;
2641 err
= re_node_set_init_union (&dest_nodes
,
2642 dest_state
->entrance_nodes
,
2644 if (BE (err
!= REG_NOERROR
, 0))
2646 re_node_set_free (&dest_nodes
);
2649 mctx
->state_log
[dest_str_idx
]
2650 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2651 re_node_set_free (&dest_nodes
);
2652 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2653 && err
!= REG_NOERROR
, 0))
2656 /* We need to check recursively if the backreference can epsilon
2659 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2661 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2663 if (BE (err
!= REG_NOERROR
, 0))
2665 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2666 if (BE (err
!= REG_NOERROR
, 0))
2676 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2677 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678 Note that we might collect inappropriate candidates here.
2679 However, the cost of checking them strictly here is too high, then we
2680 delay these checking for prune_impossible_nodes(). */
2682 static reg_errcode_t
2684 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2686 const re_dfa_t
*const dfa
= mctx
->dfa
;
2687 int subexp_num
, sub_top_idx
;
2688 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2689 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2690 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2691 if (cache_idx
!= -1)
2693 const struct re_backref_cache_entry
*entry
2694 = mctx
->bkref_ents
+ cache_idx
;
2696 if (entry
->node
== bkref_node
)
2697 return REG_NOERROR
; /* We already checked it. */
2698 while (entry
++->more
);
2701 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2703 /* For each sub expression */
2704 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2707 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2708 re_sub_match_last_t
*sub_last
;
2709 int sub_last_idx
, sl_str
, bkref_str_off
;
2711 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2712 continue; /* It isn't related. */
2714 sl_str
= sub_top
->str_idx
;
2715 bkref_str_off
= bkref_str_idx
;
2716 /* At first, check the last node of sub expressions we already
2718 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2721 sub_last
= sub_top
->lasts
[sub_last_idx
];
2722 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2723 /* The matched string by the sub expression match with the substring
2724 at the back reference? */
2725 if (sl_str_diff
> 0)
2727 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2729 /* Not enough chars for a successful match. */
2730 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2733 err
= clean_state_log_if_needed (mctx
,
2736 if (BE (err
!= REG_NOERROR
, 0))
2738 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2740 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2741 /* We don't need to search this sub expression any more. */
2744 bkref_str_off
+= sl_str_diff
;
2745 sl_str
+= sl_str_diff
;
2746 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2749 /* Reload buf, since the preceding call might have reallocated
2751 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2753 if (err
== REG_NOMATCH
)
2755 if (BE (err
!= REG_NOERROR
, 0))
2759 if (sub_last_idx
< sub_top
->nlasts
)
2761 if (sub_last_idx
> 0)
2763 /* Then, search for the other last nodes of the sub expression. */
2764 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2766 int cls_node
, sl_str_off
;
2767 const re_node_set
*nodes
;
2768 sl_str_off
= sl_str
- sub_top
->str_idx
;
2769 /* The matched string by the sub expression match with the substring
2770 at the back reference? */
2773 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2775 /* If we are at the end of the input, we cannot match. */
2776 if (bkref_str_off
>= mctx
->input
.len
)
2779 err
= extend_buffers (mctx
);
2780 if (BE (err
!= REG_NOERROR
, 0))
2783 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2785 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2786 break; /* We don't need to search this sub expression
2789 if (mctx
->state_log
[sl_str
] == NULL
)
2791 /* Does this state have a ')' of the sub expression? */
2792 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2793 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2797 if (sub_top
->path
== NULL
)
2799 sub_top
->path
= calloc (sizeof (state_array_t
),
2800 sl_str
- sub_top
->str_idx
+ 1);
2801 if (sub_top
->path
== NULL
)
2804 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 in the current context? */
2806 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2807 sub_top
->str_idx
, cls_node
, sl_str
,
2809 if (err
== REG_NOMATCH
)
2811 if (BE (err
!= REG_NOERROR
, 0))
2813 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2814 if (BE (sub_last
== NULL
, 0))
2816 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2818 if (err
== REG_NOMATCH
)
2825 /* Helper functions for get_subexp(). */
2827 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828 If it can arrive, register the sub expression expressed with SUB_TOP
2831 static reg_errcode_t
2833 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2834 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2838 /* Can the subexpression arrive the back reference? */
2839 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2840 sub_last
->str_idx
, bkref_node
, bkref_str
,
2842 if (err
!= REG_NOERROR
)
2844 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2846 if (BE (err
!= REG_NOERROR
, 0))
2848 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2849 return clean_state_log_if_needed (mctx
, to_idx
);
2852 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853 Search '(' if FL_OPEN, or search ')' otherwise.
2854 TODO: This function isn't efficient...
2855 Because there might be more than one nodes whose types are
2856 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2862 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2863 int subexp_idx
, int type
)
2866 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2868 int cls_node
= nodes
->elems
[cls_idx
];
2869 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2870 if (node
->type
== type
2871 && node
->opr
.idx
== subexp_idx
)
2877 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2880 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2882 static reg_errcode_t
2884 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2885 int top_str
, int last_node
, int last_str
, int type
)
2887 const re_dfa_t
*const dfa
= mctx
->dfa
;
2888 reg_errcode_t err
= REG_NOERROR
;
2889 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2890 re_dfastate_t
*cur_state
= NULL
;
2891 re_node_set
*cur_nodes
, next_nodes
;
2892 re_dfastate_t
**backup_state_log
;
2893 unsigned int context
;
2895 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2896 /* Extend the buffer if we need. */
2897 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2899 re_dfastate_t
**new_array
;
2900 int old_alloc
= path
->alloc
;
2901 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2902 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2903 if (BE (new_array
== NULL
, 0))
2905 path
->alloc
= old_alloc
;
2908 path
->array
= new_array
;
2909 memset (new_array
+ old_alloc
, '\0',
2910 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2913 str_idx
= path
->next_idx
? path
->next_idx
: top_str
;
2915 /* Temporary modify MCTX. */
2916 backup_state_log
= mctx
->state_log
;
2917 backup_cur_idx
= mctx
->input
.cur_idx
;
2918 mctx
->state_log
= path
->array
;
2919 mctx
->input
.cur_idx
= str_idx
;
2921 /* Setup initial node set. */
2922 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2923 if (str_idx
== top_str
)
2925 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2926 if (BE (err
!= REG_NOERROR
, 0))
2928 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2929 if (BE (err
!= REG_NOERROR
, 0))
2931 re_node_set_free (&next_nodes
);
2937 cur_state
= mctx
->state_log
[str_idx
];
2938 if (cur_state
&& cur_state
->has_backref
)
2940 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2941 if (BE (err
!= REG_NOERROR
, 0))
2945 re_node_set_init_empty (&next_nodes
);
2947 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2949 if (next_nodes
.nelem
)
2951 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2953 if (BE (err
!= REG_NOERROR
, 0))
2955 re_node_set_free (&next_nodes
);
2959 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2960 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2962 re_node_set_free (&next_nodes
);
2965 mctx
->state_log
[str_idx
] = cur_state
;
2968 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2970 re_node_set_empty (&next_nodes
);
2971 if (mctx
->state_log
[str_idx
+ 1])
2973 err
= re_node_set_merge (&next_nodes
,
2974 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2975 if (BE (err
!= REG_NOERROR
, 0))
2977 re_node_set_free (&next_nodes
);
2983 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2984 &cur_state
->non_eps_nodes
,
2986 if (BE (err
!= REG_NOERROR
, 0))
2988 re_node_set_free (&next_nodes
);
2993 if (next_nodes
.nelem
)
2995 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2996 if (BE (err
!= REG_NOERROR
, 0))
2998 re_node_set_free (&next_nodes
);
3001 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3003 if (BE (err
!= REG_NOERROR
, 0))
3005 re_node_set_free (&next_nodes
);
3009 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3010 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3011 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3013 re_node_set_free (&next_nodes
);
3016 mctx
->state_log
[str_idx
] = cur_state
;
3017 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3019 re_node_set_free (&next_nodes
);
3020 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3021 : &mctx
->state_log
[last_str
]->nodes
);
3022 path
->next_idx
= str_idx
;
3025 mctx
->state_log
= backup_state_log
;
3026 mctx
->input
.cur_idx
= backup_cur_idx
;
3028 /* Then check the current node set has the node LAST_NODE. */
3029 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3035 /* Helper functions for check_arrival. */
3037 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3039 TODO: This function is similar to the functions transit_state*(),
3040 however this function has many additional works.
3041 Can't we unify them? */
3043 static reg_errcode_t
3045 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3046 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3048 const re_dfa_t
*const dfa
= mctx
->dfa
;
3051 #ifdef RE_ENABLE_I18N
3052 reg_errcode_t err
= REG_NOERROR
;
3054 re_node_set union_set
;
3055 re_node_set_init_empty (&union_set
);
3056 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3059 int cur_node
= cur_nodes
->elems
[cur_idx
];
3061 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3062 assert (!IS_EPSILON_NODE (type
));
3064 #ifdef RE_ENABLE_I18N
3065 /* If the node may accept `multi byte'. */
3066 if (dfa
->nodes
[cur_node
].accept_mb
)
3068 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3072 re_dfastate_t
*dest_state
;
3073 int next_node
= dfa
->nexts
[cur_node
];
3074 int next_idx
= str_idx
+ naccepted
;
3075 dest_state
= mctx
->state_log
[next_idx
];
3076 re_node_set_empty (&union_set
);
3079 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3080 if (BE (err
!= REG_NOERROR
, 0))
3082 re_node_set_free (&union_set
);
3086 result
= re_node_set_insert (&union_set
, next_node
);
3087 if (BE (result
< 0, 0))
3089 re_node_set_free (&union_set
);
3092 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3094 if (BE (mctx
->state_log
[next_idx
] == NULL
3095 && err
!= REG_NOERROR
, 0))
3097 re_node_set_free (&union_set
);
3102 #endif /* RE_ENABLE_I18N */
3104 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3106 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3107 if (BE (result
< 0, 0))
3109 re_node_set_free (&union_set
);
3114 re_node_set_free (&union_set
);
3118 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3119 CUR_NODES, however exclude the nodes which are:
3120 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3121 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3124 static reg_errcode_t
3126 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3127 int ex_subexp
, int type
)
3130 int idx
, outside_node
;
3131 re_node_set new_nodes
;
3133 assert (cur_nodes
->nelem
);
3135 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3136 if (BE (err
!= REG_NOERROR
, 0))
3138 /* Create a new node set NEW_NODES with the nodes which are epsilon
3139 closures of the node in CUR_NODES. */
3141 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3143 int cur_node
= cur_nodes
->elems
[idx
];
3144 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3145 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3146 if (outside_node
== -1)
3148 /* There are no problematic nodes, just merge them. */
3149 err
= re_node_set_merge (&new_nodes
, eclosure
);
3150 if (BE (err
!= REG_NOERROR
, 0))
3152 re_node_set_free (&new_nodes
);
3158 /* There are problematic nodes, re-calculate incrementally. */
3159 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3161 if (BE (err
!= REG_NOERROR
, 0))
3163 re_node_set_free (&new_nodes
);
3168 re_node_set_free (cur_nodes
);
3169 *cur_nodes
= new_nodes
;
3173 /* Helper function for check_arrival_expand_ecl.
3174 Check incrementally the epsilon closure of TARGET, and if it isn't
3175 problematic append it to DST_NODES. */
3177 static reg_errcode_t
3179 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3180 int target
, int ex_subexp
, int type
)
3183 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3187 if (dfa
->nodes
[cur_node
].type
== type
3188 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3190 if (type
== OP_CLOSE_SUBEXP
)
3192 err
= re_node_set_insert (dst_nodes
, cur_node
);
3193 if (BE (err
== -1, 0))
3198 err
= re_node_set_insert (dst_nodes
, cur_node
);
3199 if (BE (err
== -1, 0))
3201 if (dfa
->edests
[cur_node
].nelem
== 0)
3203 if (dfa
->edests
[cur_node
].nelem
== 2)
3205 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3206 dfa
->edests
[cur_node
].elems
[1],
3208 if (BE (err
!= REG_NOERROR
, 0))
3211 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3217 /* For all the back references in the current state, calculate the
3218 destination of the back references by the appropriate entry
3219 in MCTX->BKREF_ENTS. */
3221 static reg_errcode_t
3223 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3224 int cur_str
, int subexp_num
, int type
)
3226 const re_dfa_t
*const dfa
= mctx
->dfa
;
3228 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3229 struct re_backref_cache_entry
*ent
;
3231 if (cache_idx_start
== -1)
3235 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3238 int to_idx
, next_node
;
3240 /* Is this entry ENT is appropriate? */
3241 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3244 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3245 /* Calculate the destination of the back reference, and append it
3246 to MCTX->STATE_LOG. */
3247 if (to_idx
== cur_str
)
3249 /* The backreference did epsilon transit, we must re-check all the
3250 node in the current state. */
3251 re_node_set new_dests
;
3252 reg_errcode_t err2
, err3
;
3253 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3254 if (re_node_set_contains (cur_nodes
, next_node
))
3256 err
= re_node_set_init_1 (&new_dests
, next_node
);
3257 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3258 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3259 re_node_set_free (&new_dests
);
3260 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3261 || err3
!= REG_NOERROR
, 0))
3263 err
= (err
!= REG_NOERROR
? err
3264 : (err2
!= REG_NOERROR
? err2
: err3
));
3267 /* TODO: It is still inefficient... */
3272 re_node_set union_set
;
3273 next_node
= dfa
->nexts
[ent
->node
];
3274 if (mctx
->state_log
[to_idx
])
3277 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3280 err
= re_node_set_init_copy (&union_set
,
3281 &mctx
->state_log
[to_idx
]->nodes
);
3282 ret
= re_node_set_insert (&union_set
, next_node
);
3283 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3285 re_node_set_free (&union_set
);
3286 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3292 err
= re_node_set_init_1 (&union_set
, next_node
);
3293 if (BE (err
!= REG_NOERROR
, 0))
3296 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3297 re_node_set_free (&union_set
);
3298 if (BE (mctx
->state_log
[to_idx
] == NULL
3299 && err
!= REG_NOERROR
, 0))
3303 while (ent
++->more
);
3307 /* Build transition table for the state.
3308 Return 1 if succeeded, otherwise return NULL. */
3312 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3315 int i
, j
, ch
, need_word_trtable
= 0;
3316 bitset_word_t elem
, mask
;
3317 bool dests_node_malloced
= false;
3318 bool dest_states_malloced
= false;
3319 int ndests
; /* Number of the destination states from `state'. */
3320 re_dfastate_t
**trtable
;
3321 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3322 re_node_set follows
, *dests_node
;
3324 bitset_t acceptable
;
3328 re_node_set dests_node
[SBC_MAX
];
3329 bitset_t dests_ch
[SBC_MAX
];
3332 /* We build DFA states which corresponds to the destination nodes
3333 from `state'. `dests_node[i]' represents the nodes which i-th
3334 destination state contains, and `dests_ch[i]' represents the
3335 characters which i-th destination state accepts. */
3337 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3338 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3342 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3343 if (BE (dests_alloc
== NULL
, 0))
3345 dests_node_malloced
= true;
3347 dests_node
= dests_alloc
->dests_node
;
3348 dests_ch
= dests_alloc
->dests_ch
;
3350 /* Initialize transiton table. */
3351 state
->word_trtable
= state
->trtable
= NULL
;
3353 /* At first, group all nodes belonging to `state' into several
3355 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3356 if (BE (ndests
<= 0, 0))
3358 if (dests_node_malloced
)
3360 /* Return 0 in case of an error, 1 otherwise. */
3363 state
->trtable
= (re_dfastate_t
**)
3364 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3370 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3371 if (BE (err
!= REG_NOERROR
, 0))
3374 /* Avoid arithmetic overflow in size calculation. */
3375 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3376 / (3 * sizeof (re_dfastate_t
*)))
3382 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3383 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3384 dest_states
= (re_dfastate_t
**)
3385 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3389 dest_states
= (re_dfastate_t
**)
3390 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3391 if (BE (dest_states
== NULL
, 0))
3394 if (dest_states_malloced
)
3396 re_node_set_free (&follows
);
3397 for (i
= 0; i
< ndests
; ++i
)
3398 re_node_set_free (dests_node
+ i
);
3399 if (dests_node_malloced
)
3403 dest_states_malloced
= true;
3405 dest_states_word
= dest_states
+ ndests
;
3406 dest_states_nl
= dest_states_word
+ ndests
;
3407 bitset_empty (acceptable
);
3409 /* Then build the states for all destinations. */
3410 for (i
= 0; i
< ndests
; ++i
)
3413 re_node_set_empty (&follows
);
3414 /* Merge the follows of this destination states. */
3415 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3417 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3418 if (next_node
!= -1)
3420 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3421 if (BE (err
!= REG_NOERROR
, 0))
3425 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3426 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3428 /* If the new state has context constraint,
3429 build appropriate states for these contexts. */
3430 if (dest_states
[i
]->has_constraint
)
3432 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3434 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3437 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3438 need_word_trtable
= 1;
3440 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3442 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3447 dest_states_word
[i
] = dest_states
[i
];
3448 dest_states_nl
[i
] = dest_states
[i
];
3450 bitset_merge (acceptable
, dests_ch
[i
]);
3453 if (!BE (need_word_trtable
, 0))
3455 /* We don't care about whether the following character is a word
3456 character, or we are in a single-byte character set so we can
3457 discern by looking at the character code: allocate a
3458 256-entry transition table. */
3459 trtable
= state
->trtable
=
3460 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3461 if (BE (trtable
== NULL
, 0))
3464 /* For all characters ch...: */
3465 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3466 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3468 mask
<<= 1, elem
>>= 1, ++ch
)
3469 if (BE (elem
& 1, 0))
3471 /* There must be exactly one destination which accepts
3472 character ch. See group_nodes_into_DFAstates. */
3473 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3476 /* j-th destination accepts the word character ch. */
3477 if (dfa
->word_char
[i
] & mask
)
3478 trtable
[ch
] = dest_states_word
[j
];
3480 trtable
[ch
] = dest_states
[j
];
3485 /* We care about whether the following character is a word
3486 character, and we are in a multi-byte character set: discern
3487 by looking at the character code: build two 256-entry
3488 transition tables, one starting at trtable[0] and one
3489 starting at trtable[SBC_MAX]. */
3490 trtable
= state
->word_trtable
=
3491 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3492 if (BE (trtable
== NULL
, 0))
3495 /* For all characters ch...: */
3496 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3497 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3499 mask
<<= 1, elem
>>= 1, ++ch
)
3500 if (BE (elem
& 1, 0))
3502 /* There must be exactly one destination which accepts
3503 character ch. See group_nodes_into_DFAstates. */
3504 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3507 /* j-th destination accepts the word character ch. */
3508 trtable
[ch
] = dest_states
[j
];
3509 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3514 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3516 /* The current state accepts newline character. */
3517 for (j
= 0; j
< ndests
; ++j
)
3518 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3520 /* k-th destination accepts newline character. */
3521 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3522 if (need_word_trtable
)
3523 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3524 /* There must be only one destination which accepts
3525 newline. See group_nodes_into_DFAstates. */
3530 if (dest_states_malloced
)
3533 re_node_set_free (&follows
);
3534 for (i
= 0; i
< ndests
; ++i
)
3535 re_node_set_free (dests_node
+ i
);
3537 if (dests_node_malloced
)
3543 /* Group all nodes belonging to STATE into several destinations.
3544 Then for all destinations, set the nodes belonging to the destination
3545 to DESTS_NODE[i] and set the characters accepted by the destination
3546 to DEST_CH[i]. This function return the number of destinations. */
3550 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3551 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3556 int ndests
; /* Number of the destinations from `state'. */
3557 bitset_t accepts
; /* Characters a node can accept. */
3558 const re_node_set
*cur_nodes
= &state
->nodes
;
3559 bitset_empty (accepts
);
3562 /* For all the nodes belonging to `state', */
3563 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3565 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3566 re_token_type_t type
= node
->type
;
3567 unsigned int constraint
= node
->constraint
;
3569 /* Enumerate all single byte character this node can accept. */
3570 if (type
== CHARACTER
)
3571 bitset_set (accepts
, node
->opr
.c
);
3572 else if (type
== SIMPLE_BRACKET
)
3574 bitset_merge (accepts
, node
->opr
.sbcset
);
3576 else if (type
== OP_PERIOD
)
3578 #ifdef RE_ENABLE_I18N
3579 if (dfa
->mb_cur_max
> 1)
3580 bitset_merge (accepts
, dfa
->sb_char
);
3583 bitset_set_all (accepts
);
3584 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3585 bitset_clear (accepts
, '\n');
3586 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3587 bitset_clear (accepts
, '\0');
3589 #ifdef RE_ENABLE_I18N
3590 else if (type
== OP_UTF8_PERIOD
)
3592 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3593 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3594 bitset_clear (accepts
, '\n');
3595 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3596 bitset_clear (accepts
, '\0');
3602 /* Check the `accepts' and sift the characters which are not
3603 match it the context. */
3606 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3608 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3609 bitset_empty (accepts
);
3610 if (accepts_newline
)
3611 bitset_set (accepts
, NEWLINE_CHAR
);
3615 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3617 bitset_empty (accepts
);
3621 if (constraint
& NEXT_WORD_CONSTRAINT
)
3623 bitset_word_t any_set
= 0;
3624 if (type
== CHARACTER
&& !node
->word_char
)
3626 bitset_empty (accepts
);
3629 #ifdef RE_ENABLE_I18N
3630 if (dfa
->mb_cur_max
> 1)
3631 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3632 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3635 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3636 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3640 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3642 bitset_word_t any_set
= 0;
3643 if (type
== CHARACTER
&& node
->word_char
)
3645 bitset_empty (accepts
);
3648 #ifdef RE_ENABLE_I18N
3649 if (dfa
->mb_cur_max
> 1)
3650 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3651 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3654 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3655 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3661 /* Then divide `accepts' into DFA states, or create a new
3662 state. Above, we make sure that accepts is not empty. */
3663 for (j
= 0; j
< ndests
; ++j
)
3665 bitset_t intersec
; /* Intersection sets, see below. */
3667 /* Flags, see below. */
3668 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3670 /* Optimization, skip if this state doesn't accept the character. */
3671 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3674 /* Enumerate the intersection set of this state and `accepts'. */
3676 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3677 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3678 /* And skip if the intersection set is empty. */
3682 /* Then check if this state is a subset of `accepts'. */
3683 not_subset
= not_consumed
= 0;
3684 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3686 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3687 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3690 /* If this state isn't a subset of `accepts', create a
3691 new group state, which has the `remains'. */
3694 bitset_copy (dests_ch
[ndests
], remains
);
3695 bitset_copy (dests_ch
[j
], intersec
);
3696 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3697 if (BE (err
!= REG_NOERROR
, 0))
3702 /* Put the position in the current group. */
3703 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3704 if (BE (result
< 0, 0))
3707 /* If all characters are consumed, go to next node. */
3711 /* Some characters remain, create a new group. */
3714 bitset_copy (dests_ch
[ndests
], accepts
);
3715 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3716 if (BE (err
!= REG_NOERROR
, 0))
3719 bitset_empty (accepts
);
3724 for (j
= 0; j
< ndests
; ++j
)
3725 re_node_set_free (dests_node
+ j
);
3729 #ifdef RE_ENABLE_I18N
3730 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3731 Return the number of the bytes the node accepts.
3732 STR_IDX is the current index of the input string.
3734 This function handles the nodes which can accept one character, or
3735 one collating element like '.', '[a-z]', opposite to the other nodes
3736 can only accept one byte. */
3740 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3741 const re_string_t
*input
, int str_idx
)
3743 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3744 int char_len
, elem_len
;
3748 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3750 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3751 if (BE (c
< 0xc2, 1))
3754 if (str_idx
+ 2 > input
->len
)
3757 d
= re_string_byte_at (input
, str_idx
+ 1);
3759 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3763 if (c
== 0xe0 && d
< 0xa0)
3769 if (c
== 0xf0 && d
< 0x90)
3775 if (c
== 0xf8 && d
< 0x88)
3781 if (c
== 0xfc && d
< 0x84)
3787 if (str_idx
+ char_len
> input
->len
)
3790 for (i
= 1; i
< char_len
; ++i
)
3792 d
= re_string_byte_at (input
, str_idx
+ i
);
3793 if (d
< 0x80 || d
> 0xbf)
3799 char_len
= re_string_char_size_at (input
, str_idx
);
3800 if (node
->type
== OP_PERIOD
)
3804 /* FIXME: I don't think this if is needed, as both '\n'
3805 and '\0' are char_len == 1. */
3806 /* '.' accepts any one character except the following two cases. */
3807 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3808 re_string_byte_at (input
, str_idx
) == '\n') ||
3809 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3810 re_string_byte_at (input
, str_idx
) == '\0'))
3815 elem_len
= re_string_elem_size_at (input
, str_idx
);
3816 wc
= __btowc(*(input
->mbs
+str_idx
));
3817 if (((elem_len
<= 1 && char_len
<= 1) || char_len
== 0) && (wc
!= WEOF
&& wc
< SBC_MAX
))
3820 if (node
->type
== COMPLEX_BRACKET
)
3822 const re_charset_t
*cset
= node
->opr
.mbcset
;
3824 const unsigned char *pin
3825 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3830 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3831 ? re_string_wchar_at (input
, str_idx
) : 0);
3833 /* match with multibyte character? */
3834 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3835 if (wc
== cset
->mbchars
[i
])
3837 match_len
= char_len
;
3838 goto check_node_accept_bytes_match
;
3840 /* match with character_class? */
3841 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3843 wctype_t wt
= cset
->char_classes
[i
];
3844 if (__iswctype (wc
, wt
))
3846 match_len
= char_len
;
3847 goto check_node_accept_bytes_match
;
3852 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3855 unsigned int in_collseq
= 0;
3856 const int32_t *table
, *indirect
;
3857 const unsigned char *weights
, *extra
;
3858 const char *collseqwc
;
3859 /* This #include defines a local function! */
3860 # include <locale/weight.h>
3862 /* match with collating_symbol? */
3863 if (cset
->ncoll_syms
)
3864 extra
= (const unsigned char *)
3865 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3866 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3868 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3869 /* Compare the length of input collating element and
3870 the length of current collating element. */
3871 if (*coll_sym
!= elem_len
)
3873 /* Compare each bytes. */
3874 for (j
= 0; j
< *coll_sym
; j
++)
3875 if (pin
[j
] != coll_sym
[1 + j
])
3879 /* Match if every bytes is equal. */
3881 goto check_node_accept_bytes_match
;
3887 if (elem_len
<= char_len
)
3889 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3890 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3893 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3895 /* match with range expression? */
3896 for (i
= 0; i
< cset
->nranges
; ++i
)
3897 if (cset
->range_starts
[i
] <= in_collseq
3898 && in_collseq
<= cset
->range_ends
[i
])
3900 match_len
= elem_len
;
3901 goto check_node_accept_bytes_match
;
3904 /* match with equivalence_class? */
3905 if (cset
->nequiv_classes
)
3907 const unsigned char *cp
= pin
;
3908 table
= (const int32_t *)
3909 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3910 weights
= (const unsigned char *)
3911 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3912 extra
= (const unsigned char *)
3913 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3914 indirect
= (const int32_t *)
3915 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3916 int32_t idx
= findidx (&cp
);
3918 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3920 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3921 size_t weight_len
= weights
[idx
& 0xffffff];
3922 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3923 && (idx
>> 24) == (equiv_class_idx
>> 24))
3928 equiv_class_idx
&= 0xffffff;
3930 while (cnt
<= weight_len
3931 && (weights
[equiv_class_idx
+ 1 + cnt
]
3932 == weights
[idx
+ 1 + cnt
]))
3934 if (cnt
> weight_len
)
3936 match_len
= elem_len
;
3937 goto check_node_accept_bytes_match
;
3946 /* match with range expression? */
3948 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3950 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3953 for (i
= 0; i
< cset
->nranges
; ++i
)
3955 cmp_buf
[0] = cset
->range_starts
[i
];
3956 cmp_buf
[4] = cset
->range_ends
[i
];
3957 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3958 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3960 match_len
= char_len
;
3961 goto check_node_accept_bytes_match
;
3965 check_node_accept_bytes_match
:
3966 if (!cset
->non_match
)
3973 return (elem_len
> char_len
) ? elem_len
: char_len
;
3982 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3984 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3989 /* No valid character. Match it as a single byte character. */
3990 const unsigned char *collseq
= (const unsigned char *)
3991 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3992 return collseq
[mbs
[0]];
3999 const unsigned char *extra
= (const unsigned char *)
4000 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4001 int32_t extrasize
= (const unsigned char *)
4002 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4004 for (idx
= 0; idx
< extrasize
;)
4006 int mbs_cnt
, found
= 0;
4007 int32_t elem_mbs_len
;
4008 /* Skip the name of collating element name. */
4009 idx
= idx
+ extra
[idx
] + 1;
4010 elem_mbs_len
= extra
[idx
++];
4011 if (mbs_len
== elem_mbs_len
)
4013 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4014 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4016 if (mbs_cnt
== elem_mbs_len
)
4017 /* Found the entry. */
4020 /* Skip the byte sequence of the collating element. */
4021 idx
+= elem_mbs_len
;
4022 /* Adjust for the alignment. */
4023 idx
= (idx
+ 3) & ~3;
4024 /* Skip the collation sequence value. */
4025 idx
+= sizeof (uint32_t);
4026 /* Skip the wide char sequence of the collating element. */
4027 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
4028 /* If we found the entry, return the sequence value. */
4030 return *(uint32_t *) (extra
+ idx
);
4031 /* Skip the collation sequence value. */
4032 idx
+= sizeof (uint32_t);
4038 #endif /* RE_ENABLE_I18N */
4040 /* Check whether the node accepts the byte which is IDX-th
4041 byte of the INPUT. */
4045 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4049 ch
= re_string_byte_at (&mctx
->input
, idx
);
4053 if (node
->opr
.c
!= ch
)
4057 case SIMPLE_BRACKET
:
4058 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4062 #ifdef RE_ENABLE_I18N
4063 case OP_UTF8_PERIOD
:
4069 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4070 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4078 if (node
->constraint
)
4080 /* The node has constraints. Check whether the current context
4081 satisfies the constraints. */
4082 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4084 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4091 /* Extend the buffers, if the buffers have run out. */
4093 static reg_errcode_t
4095 extend_buffers (re_match_context_t
*mctx
)
4098 re_string_t
*pstr
= &mctx
->input
;
4100 /* Avoid overflow. */
4101 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4104 /* Double the lengths of the buffers. */
4105 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4106 if (BE (ret
!= REG_NOERROR
, 0))
4109 if (mctx
->state_log
!= NULL
)
4111 /* And double the length of state_log. */
4112 /* XXX We have no indication of the size of this buffer. If this
4113 allocation fail we have no indication that the state_log array
4114 does not have the right size. */
4115 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4116 pstr
->bufs_len
+ 1);
4117 if (BE (new_array
== NULL
, 0))
4119 mctx
->state_log
= new_array
;
4122 /* Then reconstruct the buffers. */
4125 #ifdef RE_ENABLE_I18N
4126 if (pstr
->mb_cur_max
> 1)
4128 ret
= build_wcs_upper_buffer (pstr
);
4129 if (BE (ret
!= REG_NOERROR
, 0))
4133 #endif /* RE_ENABLE_I18N */
4134 build_upper_buffer (pstr
);
4138 #ifdef RE_ENABLE_I18N
4139 if (pstr
->mb_cur_max
> 1)
4140 build_wcs_buffer (pstr
);
4142 #endif /* RE_ENABLE_I18N */
4144 if (pstr
->trans
!= NULL
)
4145 re_string_translate_buffer (pstr
);
4152 /* Functions for matching context. */
4154 /* Initialize MCTX. */
4156 static reg_errcode_t
4158 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4160 mctx
->eflags
= eflags
;
4161 mctx
->match_last
= -1;
4164 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4165 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4166 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4169 /* Already zero-ed by the caller.
4171 mctx->bkref_ents = NULL;
4172 mctx->nbkref_ents = 0;
4173 mctx->nsub_tops = 0; */
4174 mctx
->abkref_ents
= n
;
4175 mctx
->max_mb_elem_len
= 1;
4176 mctx
->asub_tops
= n
;
4180 /* Clean the entries which depend on the current input in MCTX.
4181 This function must be invoked when the matcher changes the start index
4182 of the input, or changes the input string. */
4186 match_ctx_clean (re_match_context_t
*mctx
)
4189 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4192 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4193 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4195 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4196 re_free (last
->path
.array
);
4199 re_free (top
->lasts
);
4202 re_free (top
->path
->array
);
4203 re_free (top
->path
);
4208 mctx
->nsub_tops
= 0;
4209 mctx
->nbkref_ents
= 0;
4212 /* Free all the memory associated with MCTX. */
4216 match_ctx_free (re_match_context_t
*mctx
)
4218 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4219 match_ctx_clean (mctx
);
4220 re_free (mctx
->sub_tops
);
4221 re_free (mctx
->bkref_ents
);
4224 /* Add a new backreference entry to MCTX.
4225 Note that we assume that caller never call this function with duplicate
4226 entry, and call with STR_IDX which isn't smaller than any existing entry.
4229 static reg_errcode_t
4231 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4234 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4236 struct re_backref_cache_entry
* new_entry
;
4237 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4238 mctx
->abkref_ents
* 2);
4239 if (BE (new_entry
== NULL
, 0))
4241 re_free (mctx
->bkref_ents
);
4244 mctx
->bkref_ents
= new_entry
;
4245 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4246 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4247 mctx
->abkref_ents
*= 2;
4249 if (mctx
->nbkref_ents
> 0
4250 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4251 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4253 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4254 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4255 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4256 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4258 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4259 If bit N is clear, means that this entry won't epsilon-transition to
4260 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4261 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4264 A backreference does not epsilon-transition unless it is empty, so set
4265 to all zeros if FROM != TO. */
4266 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4267 = (from
== to
? ~0 : 0);
4269 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4270 if (mctx
->max_mb_elem_len
< to
- from
)
4271 mctx
->max_mb_elem_len
= to
- from
;
4275 /* Search for the first entry which has the same str_idx, or -1 if none is
4276 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4280 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4282 int left
, right
, mid
, last
;
4283 last
= right
= mctx
->nbkref_ents
;
4284 for (left
= 0; left
< right
;)
4286 mid
= left
+ (right
- left
) / 2;
4287 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4292 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4298 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4301 static reg_errcode_t
4303 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4306 assert (mctx
->sub_tops
!= NULL
);
4307 assert (mctx
->asub_tops
> 0);
4309 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4311 int new_asub_tops
= mctx
->asub_tops
* 2;
4312 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4313 re_sub_match_top_t
*,
4315 if (BE (new_array
== NULL
, 0))
4317 mctx
->sub_tops
= new_array
;
4318 mctx
->asub_tops
= new_asub_tops
;
4320 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4321 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4323 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4324 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4328 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4329 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4331 static re_sub_match_last_t
*
4333 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4335 re_sub_match_last_t
*new_entry
;
4336 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4338 int new_alasts
= 2 * subtop
->alasts
+ 1;
4339 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4340 re_sub_match_last_t
*,
4342 if (BE (new_array
== NULL
, 0))
4344 subtop
->lasts
= new_array
;
4345 subtop
->alasts
= new_alasts
;
4347 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4348 if (BE (new_entry
!= NULL
, 1))
4350 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4351 new_entry
->node
= node
;
4352 new_entry
->str_idx
= str_idx
;
4360 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4361 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4363 sctx
->sifted_states
= sifted_sts
;
4364 sctx
->limited_states
= limited_sts
;
4365 sctx
->last_node
= last_node
;
4366 sctx
->last_str_idx
= last_str_idx
;
4367 re_node_set_init_empty (&sctx
->limits
);