1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
22 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
24 static void match_ctx_clean (re_match_context_t
*mctx
);
25 static void match_ctx_free (re_match_context_t
*cache
);
26 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
27 int str_idx
, int from
, int to
);
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
,
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
);
34 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
35 re_dfastate_t
**limited_sts
, int last_node
,
37 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
38 const char *string
, int length
,
39 int start
, int range
, int stop
,
40 size_t nmatch
, regmatch_t pmatch
[],
42 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
43 const char *string1
, int length1
,
44 const char *string2
, int length2
,
45 int start
, int range
, struct re_registers
*regs
,
46 int stop
, int ret_len
);
47 static int re_search_stub (struct re_pattern_buffer
*bufp
,
48 const char *string
, int length
, int start
,
49 int range
, int stop
, struct re_registers
*regs
,
51 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
52 int nregs
, int regs_allocated
);
53 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
);
54 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
56 static int check_halt_state_context (const re_match_context_t
*mctx
,
57 const re_dfastate_t
*state
, int idx
);
58 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
59 regmatch_t
*prev_idx_match
, int cur_node
,
60 int cur_idx
, int nmatch
);
61 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
62 int str_idx
, int dest_node
, int nregs
,
64 re_node_set
*eps_via_nodes
);
65 static reg_errcode_t
set_regs (const regex_t
*preg
,
66 const re_match_context_t
*mctx
,
67 size_t nmatch
, regmatch_t
*pmatch
,
69 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
);
72 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
73 re_sift_context_t
*sctx
,
74 int node_idx
, int str_idx
, int max_str_idx
);
75 #endif /* RE_ENABLE_I18N */
76 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
77 re_sift_context_t
*sctx
);
78 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
79 re_sift_context_t
*sctx
, int str_idx
,
80 re_node_set
*cur_dest
);
81 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
82 re_sift_context_t
*sctx
,
84 re_node_set
*dest_nodes
);
85 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
86 re_node_set
*dest_nodes
,
87 const re_node_set
*candidates
);
88 static int check_dst_limits (const re_match_context_t
*mctx
,
90 int dst_node
, int dst_idx
, int src_node
,
92 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
93 int boundaries
, int subexp_idx
,
94 int from_node
, int bkref_idx
);
95 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
96 int limit
, int subexp_idx
,
97 int node
, int str_idx
,
99 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
100 re_node_set
*dest_nodes
,
101 const re_node_set
*candidates
,
103 struct re_backref_cache_entry
*bkref_ents
,
105 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
106 re_sift_context_t
*sctx
,
108 const re_node_set
*candidates
);
109 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
111 re_dfastate_t
**src
, int num
);
112 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
113 re_match_context_t
*mctx
);
114 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
115 re_match_context_t
*mctx
,
116 re_dfastate_t
*state
);
117 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
118 re_match_context_t
*mctx
,
119 re_dfastate_t
*next_state
);
120 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
121 re_node_set
*cur_nodes
,
124 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
125 re_match_context_t
*mctx
,
126 re_dfastate_t
*pstate
);
128 #ifdef RE_ENABLE_I18N
129 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
130 re_dfastate_t
*pstate
);
131 #endif /* RE_ENABLE_I18N */
132 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
133 const re_node_set
*nodes
);
134 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
135 int bkref_node
, int bkref_str_idx
);
136 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
137 const re_sub_match_top_t
*sub_top
,
138 re_sub_match_last_t
*sub_last
,
139 int bkref_node
, int bkref_str
);
140 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
141 int subexp_idx
, int type
);
142 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
143 state_array_t
*path
, int top_node
,
144 int top_str
, int last_node
, int last_str
,
146 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
148 re_node_set
*cur_nodes
,
149 re_node_set
*next_nodes
);
150 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
151 re_node_set
*cur_nodes
,
152 int ex_subexp
, int type
);
153 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
154 re_node_set
*dst_nodes
,
155 int target
, int ex_subexp
,
157 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
158 re_node_set
*cur_nodes
, int cur_str
,
159 int subexp_num
, int type
);
160 static int build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
);
161 #ifdef RE_ENABLE_I18N
162 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
163 const re_string_t
*input
, int idx
);
165 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
168 #endif /* RE_ENABLE_I18N */
169 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
170 const re_dfastate_t
*state
,
171 re_node_set
*states_node
,
172 bitset_t
*states_ch
);
173 static int check_node_accept (const re_match_context_t
*mctx
,
174 const re_token_t
*node
, int idx
);
175 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
, int min_len
);
177 /* Entry point for POSIX code. */
179 /* regexec searches for a given pattern, specified by PREG, in the
182 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
183 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
184 least NMATCH elements, and we set them to the offsets of the
185 corresponding matched substrings.
187 EFLAGS specifies `execution flags' which affect matching: if
188 REG_NOTBOL is set, then ^ does not match at the beginning of the
189 string; if REG_NOTEOL is set, then $ does not match at the end.
191 We return 0 if we find a match and REG_NOMATCH if not. */
194 regexec (const regex_t
*__restrict preg
, const char *__restrict string
,
195 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
199 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
201 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
204 if (eflags
& REG_STARTEND
)
206 start
= pmatch
[0].rm_so
;
207 length
= pmatch
[0].rm_eo
;
212 length
= strlen (string
);
215 __libc_lock_lock (dfa
->lock
);
217 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
218 length
, 0, NULL
, eflags
);
220 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
221 length
, nmatch
, pmatch
, eflags
);
222 __libc_lock_unlock (dfa
->lock
);
223 return err
!= REG_NOERROR
;
227 libc_hidden_def (__regexec
)
229 # include <shlib-compat.h>
230 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
232 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
233 __typeof__ (__regexec
) __compat_regexec
;
236 attribute_compat_text_section
237 __compat_regexec (const regex_t
*__restrict preg
,
238 const char *__restrict string
, size_t nmatch
,
239 regmatch_t pmatch
[], int eflags
)
241 return regexec (preg
, string
, nmatch
, pmatch
,
242 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
244 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
248 /* Entry points for GNU code. */
250 /* re_match, re_search, re_match_2, re_search_2
252 The former two functions operate on STRING with length LENGTH,
253 while the later two operate on concatenation of STRING1 and STRING2
254 with lengths LENGTH1 and LENGTH2, respectively.
256 re_match() matches the compiled pattern in BUFP against the string,
257 starting at index START.
259 re_search() first tries matching at index START, then it tries to match
260 starting from index START + 1, and so on. The last start position tried
261 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
264 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
265 the first STOP characters of the concatenation of the strings should be
268 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
269 and all groups is stored in REGS. (For the "_2" variants, the offsets are
270 computed relative to the concatenation, not relative to the individual
273 On success, re_match* functions return the length of the match, re_search*
274 return the position of the start of the match. Return value -1 means no
275 match was found and -2 indicates an internal error. */
278 re_match (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
279 int start
, struct re_registers
*regs
)
281 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
284 weak_alias (__re_match
, re_match
)
288 re_search (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
289 int start
, int range
, struct re_registers
*regs
)
291 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
294 weak_alias (__re_search
, re_search
)
298 re_match_2 (struct re_pattern_buffer
*bufp
, const char *string1
, int length1
,
299 const char *string2
, int length2
, int start
,
300 struct re_registers
*regs
, int stop
)
302 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
303 start
, 0, regs
, stop
, 1);
306 weak_alias (__re_match_2
, re_match_2
)
310 re_search_2 (struct re_pattern_buffer
*bufp
, const char *string1
, int length1
,
311 const char *string2
, int length2
, int start
, int range
,
312 struct re_registers
*regs
, int stop
)
314 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
315 start
, range
, regs
, stop
, 0);
318 weak_alias (__re_search_2
, re_search_2
)
322 re_search_2_stub (struct re_pattern_buffer
*bufp
, const char *string1
,
323 int length1
, const char *string2
, int length2
, int start
,
324 int range
, struct re_registers
*regs
,
325 int stop
, int ret_len
)
329 int len
= length1
+ length2
;
332 if (BE (length1
< 0 || length2
< 0 || stop
< 0 || len
< length1
, 0))
335 /* Concatenate the strings. */
339 s
= re_malloc (char, len
);
341 if (BE (s
== NULL
, 0))
344 memcpy (__mempcpy (s
, string1
, length1
), string2
, length2
);
346 memcpy (s
, string1
, length1
);
347 memcpy (s
+ length1
, string2
, length2
);
356 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
361 /* The parameters have the same meaning as those of re_search.
362 Additional parameters:
363 If RET_LEN is nonzero the length of the match is returned (re_match style);
364 otherwise the position of the match is returned. */
367 re_search_stub (struct re_pattern_buffer
*bufp
, const char *string
, int length
,
368 int start
, int range
, int stop
, struct re_registers
*regs
,
371 reg_errcode_t result
;
375 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
377 /* Check for out-of-range. */
378 if (BE (start
< 0 || start
> length
, 0))
380 if (BE (start
+ range
> length
, 0))
381 range
= length
- start
;
382 else if (BE (start
+ range
< 0, 0))
385 __libc_lock_lock (dfa
->lock
);
387 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
388 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
390 /* Compile fastmap if we haven't yet. */
391 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
392 re_compile_fastmap (bufp
);
394 if (BE (bufp
->no_sub
, 0))
397 /* We need at least 1 register. */
400 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
401 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
403 nregs
= regs
->num_regs
;
404 if (BE (nregs
< 1, 0))
406 /* Nothing can be copied to regs. */
412 nregs
= bufp
->re_nsub
+ 1;
413 pmatch
= re_malloc (regmatch_t
, nregs
);
414 if (BE (pmatch
== NULL
, 0))
420 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
421 nregs
, pmatch
, eflags
);
425 /* I hope we needn't fill ther regs with -1's when no match was found. */
426 if (result
!= REG_NOERROR
)
428 else if (regs
!= NULL
)
430 /* If caller wants register contents data back, copy them. */
431 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
432 bufp
->regs_allocated
);
433 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
437 if (BE (rval
== 0, 1))
441 assert (pmatch
[0].rm_so
== start
);
442 rval
= pmatch
[0].rm_eo
- start
;
445 rval
= pmatch
[0].rm_so
;
449 __libc_lock_unlock (dfa
->lock
);
454 re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
, int nregs
,
457 int rval
= REGS_REALLOCATE
;
459 int need_regs
= nregs
+ 1;
460 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated
== REGS_UNALLOCATED
)
465 { /* No. So allocate them with malloc. */
466 regs
->start
= re_malloc (regoff_t
, need_regs
);
467 if (BE (regs
->start
== NULL
, 0))
468 return REGS_UNALLOCATED
;
469 regs
->end
= re_malloc (regoff_t
, need_regs
);
470 if (BE (regs
->end
== NULL
, 0))
472 re_free (regs
->start
);
473 return REGS_UNALLOCATED
;
475 regs
->num_regs
= need_regs
;
477 else if (regs_allocated
== REGS_REALLOCATE
)
478 { /* Yes. If we need more elements than were already
479 allocated, reallocate them. If we need fewer, just
481 if (BE (need_regs
> regs
->num_regs
, 0))
483 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
485 if (BE (new_start
== NULL
, 0))
486 return REGS_UNALLOCATED
;
487 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
488 if (BE (new_end
== NULL
, 0))
491 return REGS_UNALLOCATED
;
493 regs
->start
= new_start
;
495 regs
->num_regs
= need_regs
;
500 assert (regs_allocated
== REGS_FIXED
);
501 /* This function may not be called with REGS_FIXED and nregs too big. */
502 assert (regs
->num_regs
>= nregs
);
507 for (i
= 0; i
< nregs
; ++i
)
509 regs
->start
[i
] = pmatch
[i
].rm_so
;
510 regs
->end
[i
] = pmatch
[i
].rm_eo
;
512 for ( ; i
< regs
->num_regs
; ++i
)
513 regs
->start
[i
] = regs
->end
[i
] = -1;
518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
520 this memory for recording register information. STARTS and ENDS
521 must be allocated using the malloc library routine, and must each
522 be at least NUM_REGS * sizeof (regoff_t) bytes long.
524 If NUM_REGS == 0, then subsequent matches should allocate their own
527 Unless this function is called, the first search or match using
528 PATTERN_BUFFER will allocate its own register data, without
529 freeing the old data. */
532 re_set_registers (struct re_pattern_buffer
*bufp
, struct re_registers
*regs
,
533 unsigned num_regs
, regoff_t
*starts
, regoff_t
*ends
)
537 bufp
->regs_allocated
= REGS_REALLOCATE
;
538 regs
->num_regs
= num_regs
;
539 regs
->start
= starts
;
544 bufp
->regs_allocated
= REGS_UNALLOCATED
;
546 regs
->start
= regs
->end
= (regoff_t
*) 0;
550 weak_alias (__re_set_registers
, re_set_registers
)
553 /* Entry points compatible with 4.2 BSD regex library. We don't define
554 them unless specifically requested. */
556 #if defined _REGEX_RE_COMP || defined _LIBC
561 re_exec (const char *s
)
563 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
565 #endif /* _REGEX_RE_COMP */
567 /* Internal entry point. */
569 /* Searches for a compiled pattern PREG in the string STRING, whose
570 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
571 meaning as with regexec. START, and RANGE have the same meanings
573 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574 otherwise return the error code.
575 Note: We assume front end functions already check ranges.
576 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
579 __attribute_warn_unused_result__
580 re_search_internal (const regex_t
*preg
, const char *string
, int length
,
581 int start
, int range
, int stop
, size_t nmatch
,
582 regmatch_t pmatch
[], int eflags
)
585 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
586 int left_lim
, right_lim
, incr
;
587 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
590 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
591 re_match_context_t mctx
= { .dfa
= dfa
};
593 re_match_context_t mctx
;
595 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
596 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
597 RE_TRANSLATE_TYPE t
= preg
->translate
;
599 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
600 memset (&mctx
, '\0', sizeof (re_match_context_t
));
604 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
605 nmatch
-= extra_nmatch
;
607 /* Check if the DFA haven't been compiled. */
608 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
609 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
610 || dfa
->init_state_begbuf
== NULL
, 0))
614 /* We assume front-end functions already check them. */
615 assert (start
+ range
>= 0 && start
+ range
<= length
);
618 /* If initial states with non-begbuf contexts have no elements,
619 the regex must be anchored. If preg->newline_anchor is set,
620 we'll never use init_state_nl, so do not check it. */
621 if (dfa
->init_state
->nodes
.nelem
== 0
622 && dfa
->init_state_word
->nodes
.nelem
== 0
623 && (dfa
->init_state_nl
->nodes
.nelem
== 0
624 || !preg
->newline_anchor
))
626 if (start
!= 0 && start
+ range
!= 0)
631 /* We must check the longest matching, if nmatch > 0. */
632 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
634 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
635 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
636 if (BE (err
!= REG_NOERROR
, 0))
638 mctx
.input
.stop
= stop
;
639 mctx
.input
.raw_stop
= stop
;
640 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
642 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
643 if (BE (err
!= REG_NOERROR
, 0))
646 /* We will log all the DFA states through which the dfa pass,
647 if nmatch > 1, or this dfa has "multibyte node", which is a
648 back-reference or a node which can accept multibyte character or
649 multi character collating element. */
650 if (nmatch
> 1 || dfa
->has_mb_node
)
652 /* Avoid overflow. */
653 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
659 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
660 if (BE (mctx
.state_log
== NULL
, 0))
667 mctx
.state_log
= NULL
;
670 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
671 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
673 /* Check incrementally whether of not the input string match. */
674 incr
= (range
< 0) ? -1 : 1;
675 left_lim
= (range
< 0) ? start
+ range
: start
;
676 right_lim
= (range
< 0) ? start
: start
+ range
;
677 sb
= dfa
->mb_cur_max
== 1;
680 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
681 | (range
>= 0 ? 2 : 0)
682 | (t
!= NULL
? 1 : 0))
685 for (;; match_first
+= incr
)
688 if (match_first
< left_lim
|| right_lim
< match_first
)
691 /* Advance as rapidly as possible through the string, until we
692 find a plausible place to start matching. This may be done
693 with varying efficiency, so there are various possibilities:
694 only the most common of them are specialized, in order to
695 save on code size. We use a switch statement for speed. */
703 /* Fastmap with single-byte translation, match forward. */
704 while (BE (match_first
< right_lim
, 1)
705 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
707 goto forward_match_found_start_or_reached_end
;
710 /* Fastmap without translation, match forward. */
711 while (BE (match_first
< right_lim
, 1)
712 && !fastmap
[(unsigned char) string
[match_first
]])
715 forward_match_found_start_or_reached_end
:
716 if (BE (match_first
== right_lim
, 0))
718 ch
= match_first
>= length
719 ? 0 : (unsigned char) string
[match_first
];
720 if (!fastmap
[t
? t
[ch
] : ch
])
727 /* Fastmap without multi-byte translation, match backwards. */
728 while (match_first
>= left_lim
)
730 ch
= match_first
>= length
731 ? 0 : (unsigned char) string
[match_first
];
732 if (fastmap
[t
? t
[ch
] : ch
])
736 if (match_first
< left_lim
)
741 /* In this case, we can't determine easily the current byte,
742 since it might be a component byte of a multibyte
743 character. Then we use the constructed buffer instead. */
746 /* If MATCH_FIRST is out of the valid range, reconstruct the
748 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
749 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
751 err
= re_string_reconstruct (&mctx
.input
, match_first
,
753 if (BE (err
!= REG_NOERROR
, 0))
756 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
758 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
759 Note that MATCH_FIRST must not be smaller than 0. */
760 ch
= (match_first
>= length
761 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
765 if (match_first
< left_lim
|| match_first
> right_lim
)
774 /* Reconstruct the buffers so that the matcher can assume that
775 the matching starts from the beginning of the buffer. */
776 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
777 if (BE (err
!= REG_NOERROR
, 0))
780 #ifdef RE_ENABLE_I18N
781 /* Don't consider this char as a possible match start if it part,
782 yet isn't the head, of a multibyte character. */
783 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
787 /* It seems to be appropriate one, then use the matcher. */
788 /* We assume that the matching starts from 0. */
789 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
790 match_last
= check_matching (&mctx
, fl_longest_match
,
791 range
>= 0 ? &match_first
: NULL
);
792 if (match_last
!= -1)
794 if (BE (match_last
== -2, 0))
801 mctx
.match_last
= match_last
;
802 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
804 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
805 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
808 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
811 err
= prune_impossible_nodes (&mctx
);
812 if (err
== REG_NOERROR
)
814 if (BE (err
!= REG_NOMATCH
, 0))
819 break; /* We found a match. */
823 match_ctx_clean (&mctx
);
827 assert (match_last
!= -1);
828 assert (err
== REG_NOERROR
);
831 /* Set pmatch[] if we need. */
836 /* Initialize registers. */
837 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
838 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
840 /* Set the points where matching start/end. */
842 pmatch
[0].rm_eo
= mctx
.match_last
;
844 if (!preg
->no_sub
&& nmatch
> 1)
846 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
847 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
848 if (BE (err
!= REG_NOERROR
, 0))
852 /* At last, add the offset to each register, since we slid
853 the buffers so that we could assume that the matching starts
855 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
856 if (pmatch
[reg_idx
].rm_so
!= -1)
858 #ifdef RE_ENABLE_I18N
859 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
861 pmatch
[reg_idx
].rm_so
=
862 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
863 ? mctx
.input
.valid_raw_len
864 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
865 pmatch
[reg_idx
].rm_eo
=
866 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
867 ? mctx
.input
.valid_raw_len
868 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
871 assert (mctx
.input
.offsets_needed
== 0);
873 pmatch
[reg_idx
].rm_so
+= match_first
;
874 pmatch
[reg_idx
].rm_eo
+= match_first
;
876 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
878 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
879 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
883 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
884 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
886 pmatch
[reg_idx
+ 1].rm_so
887 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
888 pmatch
[reg_idx
+ 1].rm_eo
889 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
894 re_free (mctx
.state_log
);
896 match_ctx_free (&mctx
);
897 re_string_destruct (&mctx
.input
);
902 __attribute_warn_unused_result__
903 prune_impossible_nodes (re_match_context_t
*mctx
)
905 const re_dfa_t
*const dfa
= mctx
->dfa
;
906 int halt_node
, match_last
;
908 re_dfastate_t
**sifted_states
;
909 re_dfastate_t
**lim_states
= NULL
;
910 re_sift_context_t sctx
;
912 assert (mctx
->state_log
!= NULL
);
914 match_last
= mctx
->match_last
;
915 halt_node
= mctx
->last_node
;
917 /* Avoid overflow. */
918 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
921 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
922 if (BE (sifted_states
== NULL
, 0))
929 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
930 if (BE (lim_states
== NULL
, 0))
937 memset (lim_states
, '\0',
938 sizeof (re_dfastate_t
*) * (match_last
+ 1));
939 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
941 ret
= sift_states_backward (mctx
, &sctx
);
942 re_node_set_free (&sctx
.limits
);
943 if (BE (ret
!= REG_NOERROR
, 0))
945 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
955 } while (mctx
->state_log
[match_last
] == NULL
956 || !mctx
->state_log
[match_last
]->halt
);
957 halt_node
= check_halt_state_context (mctx
,
958 mctx
->state_log
[match_last
],
961 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
963 re_free (lim_states
);
965 if (BE (ret
!= REG_NOERROR
, 0))
970 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
971 ret
= sift_states_backward (mctx
, &sctx
);
972 re_node_set_free (&sctx
.limits
);
973 if (BE (ret
!= REG_NOERROR
, 0))
975 if (sifted_states
[0] == NULL
)
981 re_free (mctx
->state_log
);
982 mctx
->state_log
= sifted_states
;
983 sifted_states
= NULL
;
984 mctx
->last_node
= halt_node
;
985 mctx
->match_last
= match_last
;
988 re_free (sifted_states
);
989 re_free (lim_states
);
993 /* Acquire an initial state and return it.
994 We must select appropriate initial state depending on the context,
995 since initial states may have constraints like "\<", "^", etc.. */
997 static inline re_dfastate_t
*
998 __attribute ((always_inline
))
999 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1002 const re_dfa_t
*const dfa
= mctx
->dfa
;
1003 if (dfa
->init_state
->has_constraint
)
1005 unsigned int context
;
1006 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1007 if (IS_WORD_CONTEXT (context
))
1008 return dfa
->init_state_word
;
1009 else if (IS_ORDINARY_CONTEXT (context
))
1010 return dfa
->init_state
;
1011 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1012 return dfa
->init_state_begbuf
;
1013 else if (IS_NEWLINE_CONTEXT (context
))
1014 return dfa
->init_state_nl
;
1015 else if (IS_BEGBUF_CONTEXT (context
))
1017 /* It is relatively rare case, then calculate on demand. */
1018 return re_acquire_state_context (err
, dfa
,
1019 dfa
->init_state
->entrance_nodes
,
1023 /* Must not happen? */
1024 return dfa
->init_state
;
1027 return dfa
->init_state
;
1030 /* Check whether the regular expression match input string INPUT or not,
1031 and return the index where the matching end, return -1 if not match,
1032 or return -2 in case of an error.
1033 FL_LONGEST_MATCH means we want the POSIX longest matching.
1034 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1035 next place where we may want to try matching.
1036 Note that the matcher assume that the maching starts from the current
1037 index of the buffer. */
1040 __attribute_warn_unused_result__
1041 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1044 const re_dfa_t
*const dfa
= mctx
->dfa
;
1047 int match_last
= -1;
1048 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1049 re_dfastate_t
*cur_state
;
1050 int at_init_state
= p_match_first
!= NULL
;
1051 int next_start_idx
= cur_str_idx
;
1054 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1055 /* An initial state must not be NULL (invalid). */
1056 if (BE (cur_state
== NULL
, 0))
1058 assert (err
== REG_ESPACE
);
1062 if (mctx
->state_log
!= NULL
)
1064 mctx
->state_log
[cur_str_idx
] = cur_state
;
1066 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1067 later. E.g. Processing back references. */
1068 if (BE (dfa
->nbackref
, 0))
1071 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1072 if (BE (err
!= REG_NOERROR
, 0))
1075 if (cur_state
->has_backref
)
1077 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1078 if (BE (err
!= REG_NOERROR
, 0))
1084 /* If the RE accepts NULL string. */
1085 if (BE (cur_state
->halt
, 0))
1087 if (!cur_state
->has_constraint
1088 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1090 if (!fl_longest_match
)
1094 match_last
= cur_str_idx
;
1100 while (!re_string_eoi (&mctx
->input
))
1102 re_dfastate_t
*old_state
= cur_state
;
1103 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1105 if ((BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1106 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1107 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1108 && mctx
->input
.valid_len
< mctx
->input
.len
))
1110 err
= extend_buffers (mctx
, next_char_idx
+ 1);
1111 if (BE (err
!= REG_NOERROR
, 0))
1113 assert (err
== REG_ESPACE
);
1118 cur_state
= transit_state (&err
, mctx
, cur_state
);
1119 if (mctx
->state_log
!= NULL
)
1120 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1122 if (cur_state
== NULL
)
1124 /* Reached the invalid state or an error. Try to recover a valid
1125 state using the state log, if available and if we have not
1126 already found a valid (even if not the longest) match. */
1127 if (BE (err
!= REG_NOERROR
, 0))
1130 if (mctx
->state_log
== NULL
1131 || (match
&& !fl_longest_match
)
1132 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1136 if (BE (at_init_state
, 0))
1138 if (old_state
== cur_state
)
1139 next_start_idx
= next_char_idx
;
1144 if (cur_state
->halt
)
1146 /* Reached a halt state.
1147 Check the halt state can satisfy the current context. */
1148 if (!cur_state
->has_constraint
1149 || check_halt_state_context (mctx
, cur_state
,
1150 re_string_cur_idx (&mctx
->input
)))
1152 /* We found an appropriate halt state. */
1153 match_last
= re_string_cur_idx (&mctx
->input
);
1156 /* We found a match, do not modify match_first below. */
1157 p_match_first
= NULL
;
1158 if (!fl_longest_match
)
1165 *p_match_first
+= next_start_idx
;
1170 /* Check NODE match the current context. */
1173 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1175 re_token_type_t type
= dfa
->nodes
[node
].type
;
1176 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1177 if (type
!= END_OF_RE
)
1181 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1186 /* Check the halt state STATE match the current context.
1187 Return 0 if not match, if the node, STATE has, is a halt node and
1188 match the context, return the node. */
1191 check_halt_state_context (const re_match_context_t
*mctx
,
1192 const re_dfastate_t
*state
, int idx
)
1195 unsigned int context
;
1197 assert (state
->halt
);
1199 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1200 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1201 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1202 return state
->nodes
.elems
[i
];
1206 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1207 corresponding to the DFA).
1208 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1212 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1213 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1214 struct re_fail_stack_t
*fs
)
1216 const re_dfa_t
*const dfa
= mctx
->dfa
;
1218 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1220 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1221 re_node_set
*edests
= &dfa
->edests
[node
];
1223 err
= re_node_set_insert (eps_via_nodes
, node
);
1224 if (BE (err
< 0, 0))
1226 /* Pick up a valid destination, or return -1 if none is found. */
1227 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1229 int candidate
= edests
->elems
[i
];
1230 if (!re_node_set_contains (cur_nodes
, candidate
))
1232 if (dest_node
== -1)
1233 dest_node
= candidate
;
1237 /* In order to avoid infinite loop like "(a*)*", return the second
1238 epsilon-transition if the first was already considered. */
1239 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1242 /* Otherwise, push the second epsilon-transition on the fail stack. */
1244 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1248 /* We know we are going to exit. */
1257 re_token_type_t type
= dfa
->nodes
[node
].type
;
1259 #ifdef RE_ENABLE_I18N
1260 if (dfa
->nodes
[node
].accept_mb
)
1261 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1263 #endif /* RE_ENABLE_I18N */
1264 if (type
== OP_BACK_REF
)
1266 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1267 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1270 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1274 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1275 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1284 err
= re_node_set_insert (eps_via_nodes
, node
);
1285 if (BE (err
< 0, 0))
1287 dest_node
= dfa
->edests
[node
].elems
[0];
1288 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1295 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1297 int dest_node
= dfa
->nexts
[node
];
1298 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1299 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1300 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1303 re_node_set_empty (eps_via_nodes
);
1310 static reg_errcode_t
1311 __attribute_warn_unused_result__
1312 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1313 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1316 int num
= fs
->num
++;
1317 if (fs
->num
== fs
->alloc
)
1319 struct re_fail_stack_ent_t
*new_array
;
1320 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1322 if (new_array
== NULL
)
1325 fs
->stack
= new_array
;
1327 fs
->stack
[num
].idx
= str_idx
;
1328 fs
->stack
[num
].node
= dest_node
;
1329 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1330 if (fs
->stack
[num
].regs
== NULL
)
1332 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1333 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1338 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1339 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1341 int num
= --fs
->num
;
1343 *pidx
= fs
->stack
[num
].idx
;
1344 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1345 re_node_set_free (eps_via_nodes
);
1346 re_free (fs
->stack
[num
].regs
);
1347 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1348 return fs
->stack
[num
].node
;
1351 /* Set the positions where the subexpressions are starts/ends to registers
1353 Note: We assume that pmatch[0] is already set, and
1354 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1356 static reg_errcode_t
1357 __attribute_warn_unused_result__
1358 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1359 regmatch_t
*pmatch
, int fl_backtrack
)
1361 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1363 re_node_set eps_via_nodes
;
1364 struct re_fail_stack_t
*fs
;
1365 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1366 regmatch_t
*prev_idx_match
;
1367 int prev_idx_match_malloced
= 0;
1370 assert (nmatch
> 1);
1371 assert (mctx
->state_log
!= NULL
);
1376 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1377 if (fs
->stack
== NULL
)
1383 cur_node
= dfa
->init_node
;
1384 re_node_set_init_empty (&eps_via_nodes
);
1386 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1387 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1390 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1391 if (prev_idx_match
== NULL
)
1393 free_fail_stack_return (fs
);
1396 prev_idx_match_malloced
= 1;
1398 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1400 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1402 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1404 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1409 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1410 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1412 if (reg_idx
== nmatch
)
1414 re_node_set_free (&eps_via_nodes
);
1415 if (prev_idx_match_malloced
)
1416 re_free (prev_idx_match
);
1417 return free_fail_stack_return (fs
);
1419 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1424 re_node_set_free (&eps_via_nodes
);
1425 if (prev_idx_match_malloced
)
1426 re_free (prev_idx_match
);
1431 /* Proceed to next node. */
1432 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1433 &eps_via_nodes
, fs
);
1435 if (BE (cur_node
< 0, 0))
1437 if (BE (cur_node
== -2, 0))
1439 re_node_set_free (&eps_via_nodes
);
1440 if (prev_idx_match_malloced
)
1441 re_free (prev_idx_match
);
1442 free_fail_stack_return (fs
);
1446 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1450 re_node_set_free (&eps_via_nodes
);
1451 if (prev_idx_match_malloced
)
1452 re_free (prev_idx_match
);
1457 re_node_set_free (&eps_via_nodes
);
1458 if (prev_idx_match_malloced
)
1459 re_free (prev_idx_match
);
1460 return free_fail_stack_return (fs
);
1463 static reg_errcode_t
1464 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1469 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1471 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1472 re_free (fs
->stack
[fs_idx
].regs
);
1474 re_free (fs
->stack
);
1480 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1481 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1483 int type
= dfa
->nodes
[cur_node
].type
;
1484 if (type
== OP_OPEN_SUBEXP
)
1486 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1488 /* We are at the first node of this sub expression. */
1489 if (reg_num
< nmatch
)
1491 pmatch
[reg_num
].rm_so
= cur_idx
;
1492 pmatch
[reg_num
].rm_eo
= -1;
1495 else if (type
== OP_CLOSE_SUBEXP
)
1497 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1498 if (reg_num
< nmatch
)
1500 /* We are at the last node of this sub expression. */
1501 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1503 pmatch
[reg_num
].rm_eo
= cur_idx
;
1504 /* This is a non-empty match or we are not inside an optional
1505 subexpression. Accept this right away. */
1506 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1510 if (dfa
->nodes
[cur_node
].opt_subexp
1511 && prev_idx_match
[reg_num
].rm_so
!= -1)
1512 /* We transited through an empty match for an optional
1513 subexpression, like (a?)*, and this is not the subexp's
1514 first match. Copy back the old content of the registers
1515 so that matches of an inner subexpression are undone as
1516 well, like in ((a?))*. */
1517 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1519 /* We completed a subexpression, but it may be part of
1520 an optional one, so do not update PREV_IDX_MATCH. */
1521 pmatch
[reg_num
].rm_eo
= cur_idx
;
1527 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1528 and sift the nodes in each states according to the following rules.
1529 Updated state_log will be wrote to STATE_LOG.
1531 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1533 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534 the LAST_NODE, we throw away the node `a'.
1535 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1536 string `s' and transit to `b':
1537 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1539 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1540 thrown away, we throw away the node `a'.
1541 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1542 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1544 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1545 we throw away the node `a'. */
1547 #define STATE_NODE_CONTAINS(state,node) \
1548 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1550 static reg_errcode_t
1551 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1555 int str_idx
= sctx
->last_str_idx
;
1556 re_node_set cur_dest
;
1559 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1562 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1563 transit to the last_node and the last_node itself. */
1564 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1565 if (BE (err
!= REG_NOERROR
, 0))
1567 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1568 if (BE (err
!= REG_NOERROR
, 0))
1571 /* Then check each states in the state_log. */
1574 /* Update counters. */
1575 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1576 if (null_cnt
> mctx
->max_mb_elem_len
)
1578 memset (sctx
->sifted_states
, '\0',
1579 sizeof (re_dfastate_t
*) * str_idx
);
1580 re_node_set_free (&cur_dest
);
1583 re_node_set_empty (&cur_dest
);
1586 if (mctx
->state_log
[str_idx
])
1588 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1589 if (BE (err
!= REG_NOERROR
, 0))
1593 /* Add all the nodes which satisfy the following conditions:
1594 - It can epsilon transit to a node in CUR_DEST.
1596 And update state_log. */
1597 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1598 if (BE (err
!= REG_NOERROR
, 0))
1603 re_node_set_free (&cur_dest
);
1607 static reg_errcode_t
1608 __attribute_warn_unused_result__
1609 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1610 int str_idx
, re_node_set
*cur_dest
)
1612 const re_dfa_t
*const dfa
= mctx
->dfa
;
1613 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1616 /* Then build the next sifted state.
1617 We build the next sifted state on `cur_dest', and update
1618 `sifted_states[str_idx]' with `cur_dest'.
1620 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1621 `cur_src' points the node_set of the old `state_log[str_idx]'
1622 (with the epsilon nodes pre-filtered out). */
1623 for (i
= 0; i
< cur_src
->nelem
; i
++)
1625 int prev_node
= cur_src
->elems
[i
];
1630 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1631 assert (!IS_EPSILON_NODE (type
));
1633 #ifdef RE_ENABLE_I18N
1634 /* If the node may accept `multi byte'. */
1635 if (dfa
->nodes
[prev_node
].accept_mb
)
1636 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1637 str_idx
, sctx
->last_str_idx
);
1638 #endif /* RE_ENABLE_I18N */
1640 /* We don't check backreferences here.
1641 See update_cur_sifted_state(). */
1643 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1644 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1645 dfa
->nexts
[prev_node
]))
1651 if (sctx
->limits
.nelem
)
1653 int to_idx
= str_idx
+ naccepted
;
1654 if (check_dst_limits (mctx
, &sctx
->limits
,
1655 dfa
->nexts
[prev_node
], to_idx
,
1656 prev_node
, str_idx
))
1659 ret
= re_node_set_insert (cur_dest
, prev_node
);
1660 if (BE (ret
== -1, 0))
1667 /* Helper functions. */
1669 static reg_errcode_t
1670 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1672 int top
= mctx
->state_log_top
;
1674 if ((next_state_log_idx
>= mctx
->input
.bufs_len
1675 && mctx
->input
.bufs_len
< mctx
->input
.len
)
1676 || (next_state_log_idx
>= mctx
->input
.valid_len
1677 && mctx
->input
.valid_len
< mctx
->input
.len
))
1680 err
= extend_buffers (mctx
, next_state_log_idx
+ 1);
1681 if (BE (err
!= REG_NOERROR
, 0))
1685 if (top
< next_state_log_idx
)
1687 memset (mctx
->state_log
+ top
+ 1, '\0',
1688 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1689 mctx
->state_log_top
= next_state_log_idx
;
1694 static reg_errcode_t
1695 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1696 re_dfastate_t
**src
, int num
)
1700 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1702 if (dst
[st_idx
] == NULL
)
1703 dst
[st_idx
] = src
[st_idx
];
1704 else if (src
[st_idx
] != NULL
)
1706 re_node_set merged_set
;
1707 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1708 &src
[st_idx
]->nodes
);
1709 if (BE (err
!= REG_NOERROR
, 0))
1711 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1712 re_node_set_free (&merged_set
);
1713 if (BE (err
!= REG_NOERROR
, 0))
1720 static reg_errcode_t
1721 update_cur_sifted_state (const re_match_context_t
*mctx
,
1722 re_sift_context_t
*sctx
, int str_idx
,
1723 re_node_set
*dest_nodes
)
1725 const re_dfa_t
*const dfa
= mctx
->dfa
;
1726 reg_errcode_t err
= REG_NOERROR
;
1727 const re_node_set
*candidates
;
1728 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1729 : &mctx
->state_log
[str_idx
]->nodes
);
1731 if (dest_nodes
->nelem
== 0)
1732 sctx
->sifted_states
[str_idx
] = NULL
;
1737 /* At first, add the nodes which can epsilon transit to a node in
1739 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1740 if (BE (err
!= REG_NOERROR
, 0))
1743 /* Then, check the limitations in the current sift_context. */
1744 if (sctx
->limits
.nelem
)
1746 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1747 mctx
->bkref_ents
, str_idx
);
1748 if (BE (err
!= REG_NOERROR
, 0))
1753 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1754 if (BE (err
!= REG_NOERROR
, 0))
1758 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1760 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1761 if (BE (err
!= REG_NOERROR
, 0))
1767 static reg_errcode_t
1768 __attribute_warn_unused_result__
1769 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1770 const re_node_set
*candidates
)
1772 reg_errcode_t err
= REG_NOERROR
;
1775 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1776 if (BE (err
!= REG_NOERROR
, 0))
1779 if (!state
->inveclosure
.alloc
)
1781 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1782 if (BE (err
!= REG_NOERROR
, 0))
1784 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1786 err
= re_node_set_merge (&state
->inveclosure
,
1787 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1788 if (BE (err
!= REG_NOERROR
, 0))
1792 return re_node_set_add_intersect (dest_nodes
, candidates
,
1793 &state
->inveclosure
);
1796 static reg_errcode_t
1797 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1798 const re_node_set
*candidates
)
1802 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1803 re_node_set except_nodes
;
1804 re_node_set_init_empty (&except_nodes
);
1805 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1807 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1808 if (cur_node
== node
)
1810 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1812 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1813 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1814 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1815 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1816 && re_node_set_contains (dest_nodes
, edst1
))
1818 && !re_node_set_contains (inv_eclosure
, edst2
)
1819 && re_node_set_contains (dest_nodes
, edst2
)))
1821 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1822 dfa
->inveclosures
+ cur_node
);
1823 if (BE (err
!= REG_NOERROR
, 0))
1825 re_node_set_free (&except_nodes
);
1831 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1833 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1834 if (!re_node_set_contains (&except_nodes
, cur_node
))
1836 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1837 re_node_set_remove_at (dest_nodes
, idx
);
1840 re_node_set_free (&except_nodes
);
1845 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1846 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1848 const re_dfa_t
*const dfa
= mctx
->dfa
;
1849 int lim_idx
, src_pos
, dst_pos
;
1851 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1852 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1853 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1856 struct re_backref_cache_entry
*ent
;
1857 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1858 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1860 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1861 subexp_idx
, dst_node
, dst_idx
,
1863 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1864 subexp_idx
, src_node
, src_idx
,
1868 <src> <dst> ( <subexp> )
1869 ( <subexp> ) <src> <dst>
1870 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1871 if (src_pos
== dst_pos
)
1872 continue; /* This is unrelated limitation. */
1880 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1881 int subexp_idx
, int from_node
, int bkref_idx
)
1883 const re_dfa_t
*const dfa
= mctx
->dfa
;
1884 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1887 /* Else, we are on the boundary: examine the nodes on the epsilon
1889 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1891 int node
= eclosures
->elems
[node_idx
];
1892 switch (dfa
->nodes
[node
].type
)
1895 if (bkref_idx
!= -1)
1897 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1902 if (ent
->node
!= node
)
1905 if (subexp_idx
< BITSET_WORD_BITS
1906 && !(ent
->eps_reachable_subexps_map
1907 & ((bitset_word_t
) 1 << subexp_idx
)))
1910 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1911 OP_CLOSE_SUBEXP cases below. But, if the
1912 destination node is the same node as the source
1913 node, don't recurse because it would cause an
1914 infinite loop: a regex that exhibits this behavior
1916 dst
= dfa
->edests
[node
].elems
[0];
1917 if (dst
== from_node
)
1921 else /* if (boundaries & 2) */
1926 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1928 if (cpos
== -1 /* && (boundaries & 1) */)
1930 if (cpos
== 0 && (boundaries
& 2))
1933 if (subexp_idx
< BITSET_WORD_BITS
)
1934 ent
->eps_reachable_subexps_map
1935 &= ~((bitset_word_t
) 1 << subexp_idx
);
1937 while (ent
++->more
);
1941 case OP_OPEN_SUBEXP
:
1942 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1946 case OP_CLOSE_SUBEXP
:
1947 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1956 return (boundaries
& 2) ? 1 : 0;
1960 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
1961 int subexp_idx
, int from_node
, int str_idx
,
1964 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1967 /* If we are outside the range of the subexpression, return -1 or 1. */
1968 if (str_idx
< lim
->subexp_from
)
1971 if (lim
->subexp_to
< str_idx
)
1974 /* If we are within the subexpression, return 0. */
1975 boundaries
= (str_idx
== lim
->subexp_from
);
1976 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
1977 if (boundaries
== 0)
1980 /* Else, examine epsilon closure. */
1981 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1982 from_node
, bkref_idx
);
1985 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1986 which are against limitations from DEST_NODES. */
1988 static reg_errcode_t
1989 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1990 const re_node_set
*candidates
, re_node_set
*limits
,
1991 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
1994 int node_idx
, lim_idx
;
1996 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1999 struct re_backref_cache_entry
*ent
;
2000 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2002 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2003 continue; /* This is unrelated limitation. */
2005 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2006 if (ent
->subexp_to
== str_idx
)
2010 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2012 int node
= dest_nodes
->elems
[node_idx
];
2013 re_token_type_t type
= dfa
->nodes
[node
].type
;
2014 if (type
== OP_OPEN_SUBEXP
2015 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2017 else if (type
== OP_CLOSE_SUBEXP
2018 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2022 /* Check the limitation of the open subexpression. */
2023 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2026 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2028 if (BE (err
!= REG_NOERROR
, 0))
2032 /* Check the limitation of the close subexpression. */
2034 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2036 int node
= dest_nodes
->elems
[node_idx
];
2037 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2039 && !re_node_set_contains (dfa
->eclosures
+ node
,
2042 /* It is against this limitation.
2043 Remove it form the current sifted state. */
2044 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2046 if (BE (err
!= REG_NOERROR
, 0))
2052 else /* (ent->subexp_to != str_idx) */
2054 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2056 int node
= dest_nodes
->elems
[node_idx
];
2057 re_token_type_t type
= dfa
->nodes
[node
].type
;
2058 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2060 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2062 /* It is against this limitation.
2063 Remove it form the current sifted state. */
2064 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2066 if (BE (err
!= REG_NOERROR
, 0))
2075 static reg_errcode_t
2076 __attribute_warn_unused_result__
2077 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2078 int str_idx
, const re_node_set
*candidates
)
2080 const re_dfa_t
*const dfa
= mctx
->dfa
;
2083 re_sift_context_t local_sctx
;
2084 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2086 if (first_idx
== -1)
2089 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2091 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2094 re_token_type_t type
;
2095 struct re_backref_cache_entry
*entry
;
2096 node
= candidates
->elems
[node_idx
];
2097 type
= dfa
->nodes
[node
].type
;
2098 /* Avoid infinite loop for the REs like "()\1+". */
2099 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2101 if (type
!= OP_BACK_REF
)
2104 entry
= mctx
->bkref_ents
+ first_idx
;
2105 enabled_idx
= first_idx
;
2112 re_dfastate_t
*cur_state
;
2114 if (entry
->node
!= node
)
2116 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2117 to_idx
= str_idx
+ subexp_len
;
2118 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2119 : dfa
->edests
[node
].elems
[0]);
2121 if (to_idx
> sctx
->last_str_idx
2122 || sctx
->sifted_states
[to_idx
] == NULL
2123 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2124 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2125 str_idx
, dst_node
, to_idx
))
2128 if (local_sctx
.sifted_states
== NULL
)
2131 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2132 if (BE (err
!= REG_NOERROR
, 0))
2135 local_sctx
.last_node
= node
;
2136 local_sctx
.last_str_idx
= str_idx
;
2137 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2138 if (BE (ret
< 0, 0))
2143 cur_state
= local_sctx
.sifted_states
[str_idx
];
2144 err
= sift_states_backward (mctx
, &local_sctx
);
2145 if (BE (err
!= REG_NOERROR
, 0))
2147 if (sctx
->limited_states
!= NULL
)
2149 err
= merge_state_array (dfa
, sctx
->limited_states
,
2150 local_sctx
.sifted_states
,
2152 if (BE (err
!= REG_NOERROR
, 0))
2155 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2156 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2158 /* mctx->bkref_ents may have changed, reload the pointer. */
2159 entry
= mctx
->bkref_ents
+ enabled_idx
;
2161 while (enabled_idx
++, entry
++->more
);
2165 if (local_sctx
.sifted_states
!= NULL
)
2167 re_node_set_free (&local_sctx
.limits
);
2174 #ifdef RE_ENABLE_I18N
2176 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2177 int node_idx
, int str_idx
, int max_str_idx
)
2179 const re_dfa_t
*const dfa
= mctx
->dfa
;
2181 /* Check the node can accept `multi byte'. */
2182 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2183 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2184 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2185 dfa
->nexts
[node_idx
]))
2186 /* The node can't accept the `multi byte', or the
2187 destination was already thrown away, then the node
2188 could't accept the current input `multi byte'. */
2190 /* Otherwise, it is sure that the node could accept
2191 `naccepted' bytes input. */
2194 #endif /* RE_ENABLE_I18N */
2197 /* Functions for state transition. */
2199 /* Return the next state to which the current state STATE will transit by
2200 accepting the current input byte, and update STATE_LOG if necessary.
2201 If STATE can accept a multibyte char/collating element/back reference
2202 update the destination of STATE_LOG. */
2204 static re_dfastate_t
*
2205 __attribute_warn_unused_result__
2206 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2207 re_dfastate_t
*state
)
2209 re_dfastate_t
**trtable
;
2212 #ifdef RE_ENABLE_I18N
2213 /* If the current state can accept multibyte. */
2214 if (BE (state
->accept_mb
, 0))
2216 *err
= transit_state_mb (mctx
, state
);
2217 if (BE (*err
!= REG_NOERROR
, 0))
2220 #endif /* RE_ENABLE_I18N */
2222 /* Then decide the next state with the single byte. */
2225 /* don't use transition table */
2226 return transit_state_sb (err
, mctx
, state
);
2229 /* Use transition table */
2230 ch
= re_string_fetch_byte (&mctx
->input
);
2233 trtable
= state
->trtable
;
2234 if (BE (trtable
!= NULL
, 1))
2237 trtable
= state
->word_trtable
;
2238 if (BE (trtable
!= NULL
, 1))
2240 unsigned int context
;
2242 = re_string_context_at (&mctx
->input
,
2243 re_string_cur_idx (&mctx
->input
) - 1,
2245 if (IS_WORD_CONTEXT (context
))
2246 return trtable
[ch
+ SBC_MAX
];
2251 if (!build_trtable (mctx
->dfa
, state
))
2257 /* Retry, we now have a transition table. */
2261 /* Update the state_log if we need */
2263 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2264 re_dfastate_t
*next_state
)
2266 const re_dfa_t
*const dfa
= mctx
->dfa
;
2267 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2269 if (cur_idx
> mctx
->state_log_top
)
2271 mctx
->state_log
[cur_idx
] = next_state
;
2272 mctx
->state_log_top
= cur_idx
;
2274 else if (mctx
->state_log
[cur_idx
] == 0)
2276 mctx
->state_log
[cur_idx
] = next_state
;
2280 re_dfastate_t
*pstate
;
2281 unsigned int context
;
2282 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2283 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2284 the destination of a multibyte char/collating element/
2285 back reference. Then the next state is the union set of
2286 these destinations and the results of the transition table. */
2287 pstate
= mctx
->state_log
[cur_idx
];
2288 log_nodes
= pstate
->entrance_nodes
;
2289 if (next_state
!= NULL
)
2291 table_nodes
= next_state
->entrance_nodes
;
2292 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2294 if (BE (*err
!= REG_NOERROR
, 0))
2298 next_nodes
= *log_nodes
;
2299 /* Note: We already add the nodes of the initial state,
2300 then we don't need to add them here. */
2302 context
= re_string_context_at (&mctx
->input
,
2303 re_string_cur_idx (&mctx
->input
) - 1,
2305 next_state
= mctx
->state_log
[cur_idx
]
2306 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2307 /* We don't need to check errors here, since the return value of
2308 this function is next_state and ERR is already set. */
2310 if (table_nodes
!= NULL
)
2311 re_node_set_free (&next_nodes
);
2314 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2316 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2317 later. We must check them here, since the back references in the
2318 next state might use them. */
2319 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2321 if (BE (*err
!= REG_NOERROR
, 0))
2324 /* If the next state has back references. */
2325 if (next_state
->has_backref
)
2327 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2328 if (BE (*err
!= REG_NOERROR
, 0))
2330 next_state
= mctx
->state_log
[cur_idx
];
2337 /* Skip bytes in the input that correspond to part of a
2338 multi-byte match, then look in the log for a state
2339 from which to restart matching. */
2341 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2343 re_dfastate_t
*cur_state
;
2346 int max
= mctx
->state_log_top
;
2347 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2351 if (++cur_str_idx
> max
)
2353 re_string_skip_bytes (&mctx
->input
, 1);
2355 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2357 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2359 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2363 /* Helper functions for transit_state. */
2365 /* From the node set CUR_NODES, pick up the nodes whose types are
2366 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2367 expression. And register them to use them later for evaluating the
2368 corresponding back references. */
2370 static reg_errcode_t
2371 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2374 const re_dfa_t
*const dfa
= mctx
->dfa
;
2378 /* TODO: This isn't efficient.
2379 Because there might be more than one nodes whose types are
2380 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2383 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2385 int node
= cur_nodes
->elems
[node_idx
];
2386 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2387 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2388 && (dfa
->used_bkref_map
2389 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2391 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2392 if (BE (err
!= REG_NOERROR
, 0))
2400 /* Return the next state to which the current state STATE will transit by
2401 accepting the current input byte. */
2403 static re_dfastate_t
*
2404 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2405 re_dfastate_t
*state
)
2407 const re_dfa_t
*const dfa
= mctx
->dfa
;
2408 re_node_set next_nodes
;
2409 re_dfastate_t
*next_state
;
2410 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2411 unsigned int context
;
2413 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2414 if (BE (*err
!= REG_NOERROR
, 0))
2416 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2418 int cur_node
= state
->nodes
.elems
[node_cnt
];
2419 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2421 *err
= re_node_set_merge (&next_nodes
,
2422 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2423 if (BE (*err
!= REG_NOERROR
, 0))
2425 re_node_set_free (&next_nodes
);
2430 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2431 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2432 /* We don't need to check errors here, since the return value of
2433 this function is next_state and ERR is already set. */
2435 re_node_set_free (&next_nodes
);
2436 re_string_skip_bytes (&mctx
->input
, 1);
2441 #ifdef RE_ENABLE_I18N
2442 static reg_errcode_t
2443 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2445 const re_dfa_t
*const dfa
= mctx
->dfa
;
2449 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2451 re_node_set dest_nodes
, *new_nodes
;
2452 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2453 int naccepted
, dest_idx
;
2454 unsigned int context
;
2455 re_dfastate_t
*dest_state
;
2457 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2460 if (dfa
->nodes
[cur_node_idx
].constraint
)
2462 context
= re_string_context_at (&mctx
->input
,
2463 re_string_cur_idx (&mctx
->input
),
2465 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2470 /* How many bytes the node can accept? */
2471 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2472 re_string_cur_idx (&mctx
->input
));
2476 /* The node can accepts `naccepted' bytes. */
2477 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2478 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2479 : mctx
->max_mb_elem_len
);
2480 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2481 if (BE (err
!= REG_NOERROR
, 0))
2484 assert (dfa
->nexts
[cur_node_idx
] != -1);
2486 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2488 dest_state
= mctx
->state_log
[dest_idx
];
2489 if (dest_state
== NULL
)
2490 dest_nodes
= *new_nodes
;
2493 err
= re_node_set_init_union (&dest_nodes
,
2494 dest_state
->entrance_nodes
, new_nodes
);
2495 if (BE (err
!= REG_NOERROR
, 0))
2498 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2500 mctx
->state_log
[dest_idx
]
2501 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2502 if (dest_state
!= NULL
)
2503 re_node_set_free (&dest_nodes
);
2504 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2509 #endif /* RE_ENABLE_I18N */
2511 static reg_errcode_t
2512 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2514 const re_dfa_t
*const dfa
= mctx
->dfa
;
2517 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2519 for (i
= 0; i
< nodes
->nelem
; ++i
)
2521 int dest_str_idx
, prev_nelem
, bkc_idx
;
2522 int node_idx
= nodes
->elems
[i
];
2523 unsigned int context
;
2524 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2525 re_node_set
*new_dest_nodes
;
2527 /* Check whether `node' is a backreference or not. */
2528 if (node
->type
!= OP_BACK_REF
)
2531 if (node
->constraint
)
2533 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2535 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2539 /* `node' is a backreference.
2540 Check the substring which the substring matched. */
2541 bkc_idx
= mctx
->nbkref_ents
;
2542 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2543 if (BE (err
!= REG_NOERROR
, 0))
2546 /* And add the epsilon closures (which is `new_dest_nodes') of
2547 the backreference to appropriate state_log. */
2549 assert (dfa
->nexts
[node_idx
] != -1);
2551 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2554 re_dfastate_t
*dest_state
;
2555 struct re_backref_cache_entry
*bkref_ent
;
2556 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2557 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2559 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2560 new_dest_nodes
= (subexp_len
== 0
2561 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2562 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2563 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2564 - bkref_ent
->subexp_from
);
2565 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2567 dest_state
= mctx
->state_log
[dest_str_idx
];
2568 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2569 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2570 /* Add `new_dest_node' to state_log. */
2571 if (dest_state
== NULL
)
2573 mctx
->state_log
[dest_str_idx
]
2574 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2576 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2577 && err
!= REG_NOERROR
, 0))
2582 re_node_set dest_nodes
;
2583 err
= re_node_set_init_union (&dest_nodes
,
2584 dest_state
->entrance_nodes
,
2586 if (BE (err
!= REG_NOERROR
, 0))
2588 re_node_set_free (&dest_nodes
);
2591 mctx
->state_log
[dest_str_idx
]
2592 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2593 re_node_set_free (&dest_nodes
);
2594 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2595 && err
!= REG_NOERROR
, 0))
2598 /* We need to check recursively if the backreference can epsilon
2601 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2603 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2605 if (BE (err
!= REG_NOERROR
, 0))
2607 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2608 if (BE (err
!= REG_NOERROR
, 0))
2618 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2619 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2620 Note that we might collect inappropriate candidates here.
2621 However, the cost of checking them strictly here is too high, then we
2622 delay these checking for prune_impossible_nodes(). */
2624 static reg_errcode_t
2625 __attribute_warn_unused_result__
2626 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2628 const re_dfa_t
*const dfa
= mctx
->dfa
;
2629 int subexp_num
, sub_top_idx
;
2630 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2631 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2632 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2633 if (cache_idx
!= -1)
2635 const struct re_backref_cache_entry
*entry
2636 = mctx
->bkref_ents
+ cache_idx
;
2638 if (entry
->node
== bkref_node
)
2639 return REG_NOERROR
; /* We already checked it. */
2640 while (entry
++->more
);
2643 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2645 /* For each sub expression */
2646 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2649 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2650 re_sub_match_last_t
*sub_last
;
2651 int sub_last_idx
, sl_str
, bkref_str_off
;
2653 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2654 continue; /* It isn't related. */
2656 sl_str
= sub_top
->str_idx
;
2657 bkref_str_off
= bkref_str_idx
;
2658 /* At first, check the last node of sub expressions we already
2660 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2663 sub_last
= sub_top
->lasts
[sub_last_idx
];
2664 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2665 /* The matched string by the sub expression match with the substring
2666 at the back reference? */
2667 if (sl_str_diff
> 0)
2669 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2671 /* Not enough chars for a successful match. */
2672 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2675 err
= clean_state_log_if_needed (mctx
,
2678 if (BE (err
!= REG_NOERROR
, 0))
2680 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2682 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2683 /* We don't need to search this sub expression any more. */
2686 bkref_str_off
+= sl_str_diff
;
2687 sl_str
+= sl_str_diff
;
2688 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2691 /* Reload buf, since the preceding call might have reallocated
2693 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2695 if (err
== REG_NOMATCH
)
2697 if (BE (err
!= REG_NOERROR
, 0))
2701 if (sub_last_idx
< sub_top
->nlasts
)
2703 if (sub_last_idx
> 0)
2705 /* Then, search for the other last nodes of the sub expression. */
2706 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2708 int cls_node
, sl_str_off
;
2709 const re_node_set
*nodes
;
2710 sl_str_off
= sl_str
- sub_top
->str_idx
;
2711 /* The matched string by the sub expression match with the substring
2712 at the back reference? */
2715 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2717 /* If we are at the end of the input, we cannot match. */
2718 if (bkref_str_off
>= mctx
->input
.len
)
2721 err
= extend_buffers (mctx
, bkref_str_off
+ 1);
2722 if (BE (err
!= REG_NOERROR
, 0))
2725 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2727 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2728 break; /* We don't need to search this sub expression
2731 if (mctx
->state_log
[sl_str
] == NULL
)
2733 /* Does this state have a ')' of the sub expression? */
2734 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2735 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2739 if (sub_top
->path
== NULL
)
2741 sub_top
->path
= calloc (sizeof (state_array_t
),
2742 sl_str
- sub_top
->str_idx
+ 1);
2743 if (sub_top
->path
== NULL
)
2746 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2747 in the current context? */
2748 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2749 sub_top
->str_idx
, cls_node
, sl_str
,
2751 if (err
== REG_NOMATCH
)
2753 if (BE (err
!= REG_NOERROR
, 0))
2755 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2756 if (BE (sub_last
== NULL
, 0))
2758 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2760 if (err
== REG_NOMATCH
)
2767 /* Helper functions for get_subexp(). */
2769 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2770 If it can arrive, register the sub expression expressed with SUB_TOP
2773 static reg_errcode_t
2774 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2775 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2779 /* Can the subexpression arrive the back reference? */
2780 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2781 sub_last
->str_idx
, bkref_node
, bkref_str
,
2783 if (err
!= REG_NOERROR
)
2785 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2787 if (BE (err
!= REG_NOERROR
, 0))
2789 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2790 return clean_state_log_if_needed (mctx
, to_idx
);
2793 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2794 Search '(' if FL_OPEN, or search ')' otherwise.
2795 TODO: This function isn't efficient...
2796 Because there might be more than one nodes whose types are
2797 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2802 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2803 int subexp_idx
, int type
)
2806 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2808 int cls_node
= nodes
->elems
[cls_idx
];
2809 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2810 if (node
->type
== type
2811 && node
->opr
.idx
== subexp_idx
)
2817 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2818 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2820 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2822 static reg_errcode_t
2823 __attribute_warn_unused_result__
2824 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2825 int top_str
, int last_node
, int last_str
, int type
)
2827 const re_dfa_t
*const dfa
= mctx
->dfa
;
2828 reg_errcode_t err
= REG_NOERROR
;
2829 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2830 re_dfastate_t
*cur_state
= NULL
;
2831 re_node_set
*cur_nodes
, next_nodes
;
2832 re_dfastate_t
**backup_state_log
;
2833 unsigned int context
;
2835 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2836 /* Extend the buffer if we need. */
2837 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2839 re_dfastate_t
**new_array
;
2840 int old_alloc
= path
->alloc
;
2841 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2842 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2843 if (BE (new_array
== NULL
, 0))
2845 path
->alloc
= old_alloc
;
2848 path
->array
= new_array
;
2849 memset (new_array
+ old_alloc
, '\0',
2850 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2853 str_idx
= path
->next_idx
?: top_str
;
2855 /* Temporary modify MCTX. */
2856 backup_state_log
= mctx
->state_log
;
2857 backup_cur_idx
= mctx
->input
.cur_idx
;
2858 mctx
->state_log
= path
->array
;
2859 mctx
->input
.cur_idx
= str_idx
;
2861 /* Setup initial node set. */
2862 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2863 if (str_idx
== top_str
)
2865 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2866 if (BE (err
!= REG_NOERROR
, 0))
2868 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2869 if (BE (err
!= REG_NOERROR
, 0))
2871 re_node_set_free (&next_nodes
);
2877 cur_state
= mctx
->state_log
[str_idx
];
2878 if (cur_state
&& cur_state
->has_backref
)
2880 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2881 if (BE (err
!= REG_NOERROR
, 0))
2885 re_node_set_init_empty (&next_nodes
);
2887 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2889 if (next_nodes
.nelem
)
2891 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2893 if (BE (err
!= REG_NOERROR
, 0))
2895 re_node_set_free (&next_nodes
);
2899 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2900 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2902 re_node_set_free (&next_nodes
);
2905 mctx
->state_log
[str_idx
] = cur_state
;
2908 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2910 re_node_set_empty (&next_nodes
);
2911 if (mctx
->state_log
[str_idx
+ 1])
2913 err
= re_node_set_merge (&next_nodes
,
2914 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2915 if (BE (err
!= REG_NOERROR
, 0))
2917 re_node_set_free (&next_nodes
);
2923 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2924 &cur_state
->non_eps_nodes
,
2926 if (BE (err
!= REG_NOERROR
, 0))
2928 re_node_set_free (&next_nodes
);
2933 if (next_nodes
.nelem
)
2935 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2936 if (BE (err
!= REG_NOERROR
, 0))
2938 re_node_set_free (&next_nodes
);
2941 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2943 if (BE (err
!= REG_NOERROR
, 0))
2945 re_node_set_free (&next_nodes
);
2949 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2950 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2951 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2953 re_node_set_free (&next_nodes
);
2956 mctx
->state_log
[str_idx
] = cur_state
;
2957 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2959 re_node_set_free (&next_nodes
);
2960 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2961 : &mctx
->state_log
[last_str
]->nodes
);
2962 path
->next_idx
= str_idx
;
2965 mctx
->state_log
= backup_state_log
;
2966 mctx
->input
.cur_idx
= backup_cur_idx
;
2968 /* Then check the current node set has the node LAST_NODE. */
2969 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2975 /* Helper functions for check_arrival. */
2977 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2979 TODO: This function is similar to the functions transit_state*(),
2980 however this function has many additional works.
2981 Can't we unify them? */
2983 static reg_errcode_t
2984 __attribute_warn_unused_result__
2985 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
2986 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
2988 const re_dfa_t
*const dfa
= mctx
->dfa
;
2991 #ifdef RE_ENABLE_I18N
2992 reg_errcode_t err
= REG_NOERROR
;
2994 re_node_set union_set
;
2995 re_node_set_init_empty (&union_set
);
2996 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2999 int cur_node
= cur_nodes
->elems
[cur_idx
];
3001 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3002 assert (!IS_EPSILON_NODE (type
));
3004 #ifdef RE_ENABLE_I18N
3005 /* If the node may accept `multi byte'. */
3006 if (dfa
->nodes
[cur_node
].accept_mb
)
3008 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3012 re_dfastate_t
*dest_state
;
3013 int next_node
= dfa
->nexts
[cur_node
];
3014 int next_idx
= str_idx
+ naccepted
;
3015 dest_state
= mctx
->state_log
[next_idx
];
3016 re_node_set_empty (&union_set
);
3019 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3020 if (BE (err
!= REG_NOERROR
, 0))
3022 re_node_set_free (&union_set
);
3026 result
= re_node_set_insert (&union_set
, next_node
);
3027 if (BE (result
< 0, 0))
3029 re_node_set_free (&union_set
);
3032 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3034 if (BE (mctx
->state_log
[next_idx
] == NULL
3035 && err
!= REG_NOERROR
, 0))
3037 re_node_set_free (&union_set
);
3042 #endif /* RE_ENABLE_I18N */
3044 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3046 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3047 if (BE (result
< 0, 0))
3049 re_node_set_free (&union_set
);
3054 re_node_set_free (&union_set
);
3058 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3059 CUR_NODES, however exclude the nodes which are:
3060 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3061 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3064 static reg_errcode_t
3065 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3066 int ex_subexp
, int type
)
3069 int idx
, outside_node
;
3070 re_node_set new_nodes
;
3072 assert (cur_nodes
->nelem
);
3074 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3075 if (BE (err
!= REG_NOERROR
, 0))
3077 /* Create a new node set NEW_NODES with the nodes which are epsilon
3078 closures of the node in CUR_NODES. */
3080 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3082 int cur_node
= cur_nodes
->elems
[idx
];
3083 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3084 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3085 if (outside_node
== -1)
3087 /* There are no problematic nodes, just merge them. */
3088 err
= re_node_set_merge (&new_nodes
, eclosure
);
3089 if (BE (err
!= REG_NOERROR
, 0))
3091 re_node_set_free (&new_nodes
);
3097 /* There are problematic nodes, re-calculate incrementally. */
3098 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3100 if (BE (err
!= REG_NOERROR
, 0))
3102 re_node_set_free (&new_nodes
);
3107 re_node_set_free (cur_nodes
);
3108 *cur_nodes
= new_nodes
;
3112 /* Helper function for check_arrival_expand_ecl.
3113 Check incrementally the epsilon closure of TARGET, and if it isn't
3114 problematic append it to DST_NODES. */
3116 static reg_errcode_t
3117 __attribute_warn_unused_result__
3118 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3119 int target
, int ex_subexp
, int type
)
3122 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3126 if (dfa
->nodes
[cur_node
].type
== type
3127 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3129 if (type
== OP_CLOSE_SUBEXP
)
3131 err
= re_node_set_insert (dst_nodes
, cur_node
);
3132 if (BE (err
== -1, 0))
3137 err
= re_node_set_insert (dst_nodes
, cur_node
);
3138 if (BE (err
== -1, 0))
3140 if (dfa
->edests
[cur_node
].nelem
== 0)
3142 if (dfa
->edests
[cur_node
].nelem
== 2)
3144 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3145 dfa
->edests
[cur_node
].elems
[1],
3147 if (BE (err
!= REG_NOERROR
, 0))
3150 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3156 /* For all the back references in the current state, calculate the
3157 destination of the back references by the appropriate entry
3158 in MCTX->BKREF_ENTS. */
3160 static reg_errcode_t
3161 __attribute_warn_unused_result__
3162 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3163 int cur_str
, int subexp_num
, int type
)
3165 const re_dfa_t
*const dfa
= mctx
->dfa
;
3167 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3168 struct re_backref_cache_entry
*ent
;
3170 if (cache_idx_start
== -1)
3174 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3177 int to_idx
, next_node
;
3179 /* Is this entry ENT is appropriate? */
3180 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3183 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3184 /* Calculate the destination of the back reference, and append it
3185 to MCTX->STATE_LOG. */
3186 if (to_idx
== cur_str
)
3188 /* The backreference did epsilon transit, we must re-check all the
3189 node in the current state. */
3190 re_node_set new_dests
;
3191 reg_errcode_t err2
, err3
;
3192 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3193 if (re_node_set_contains (cur_nodes
, next_node
))
3195 err
= re_node_set_init_1 (&new_dests
, next_node
);
3196 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3197 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3198 re_node_set_free (&new_dests
);
3199 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3200 || err3
!= REG_NOERROR
, 0))
3202 err
= (err
!= REG_NOERROR
? err
3203 : (err2
!= REG_NOERROR
? err2
: err3
));
3206 /* TODO: It is still inefficient... */
3211 re_node_set union_set
;
3212 next_node
= dfa
->nexts
[ent
->node
];
3213 if (mctx
->state_log
[to_idx
])
3216 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3219 err
= re_node_set_init_copy (&union_set
,
3220 &mctx
->state_log
[to_idx
]->nodes
);
3221 ret
= re_node_set_insert (&union_set
, next_node
);
3222 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3224 re_node_set_free (&union_set
);
3225 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3231 err
= re_node_set_init_1 (&union_set
, next_node
);
3232 if (BE (err
!= REG_NOERROR
, 0))
3235 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3236 re_node_set_free (&union_set
);
3237 if (BE (mctx
->state_log
[to_idx
] == NULL
3238 && err
!= REG_NOERROR
, 0))
3242 while (ent
++->more
);
3246 /* Build transition table for the state.
3247 Return 1 if succeeded, otherwise return NULL. */
3250 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3253 int i
, j
, ch
, need_word_trtable
= 0;
3254 bitset_word_t elem
, mask
;
3255 bool dests_node_malloced
= false;
3256 bool dest_states_malloced
= false;
3257 int ndests
; /* Number of the destination states from `state'. */
3258 re_dfastate_t
**trtable
;
3259 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3260 re_node_set follows
, *dests_node
;
3262 bitset_t acceptable
;
3266 re_node_set dests_node
[SBC_MAX
];
3267 bitset_t dests_ch
[SBC_MAX
];
3270 /* We build DFA states which corresponds to the destination nodes
3271 from `state'. `dests_node[i]' represents the nodes which i-th
3272 destination state contains, and `dests_ch[i]' represents the
3273 characters which i-th destination state accepts. */
3274 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3275 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3278 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3279 if (BE (dests_alloc
== NULL
, 0))
3281 dests_node_malloced
= true;
3283 dests_node
= dests_alloc
->dests_node
;
3284 dests_ch
= dests_alloc
->dests_ch
;
3286 /* Initialize transiton table. */
3287 state
->word_trtable
= state
->trtable
= NULL
;
3289 /* At first, group all nodes belonging to `state' into several
3291 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3292 if (BE (ndests
<= 0, 0))
3294 if (dests_node_malloced
)
3296 /* Return 0 in case of an error, 1 otherwise. */
3299 state
->trtable
= (re_dfastate_t
**)
3300 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3301 if (BE (state
->trtable
== NULL
, 0))
3308 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3309 if (BE (err
!= REG_NOERROR
, 0))
3312 /* Avoid arithmetic overflow in size calculation. */
3313 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3314 / (3 * sizeof (re_dfastate_t
*)))
3319 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3320 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3321 dest_states
= (re_dfastate_t
**)
3322 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3325 dest_states
= (re_dfastate_t
**)
3326 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3327 if (BE (dest_states
== NULL
, 0))
3330 if (dest_states_malloced
)
3332 re_node_set_free (&follows
);
3333 for (i
= 0; i
< ndests
; ++i
)
3334 re_node_set_free (dests_node
+ i
);
3335 if (dests_node_malloced
)
3339 dest_states_malloced
= true;
3341 dest_states_word
= dest_states
+ ndests
;
3342 dest_states_nl
= dest_states_word
+ ndests
;
3343 bitset_empty (acceptable
);
3345 /* Then build the states for all destinations. */
3346 for (i
= 0; i
< ndests
; ++i
)
3349 re_node_set_empty (&follows
);
3350 /* Merge the follows of this destination states. */
3351 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3353 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3354 if (next_node
!= -1)
3356 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3357 if (BE (err
!= REG_NOERROR
, 0))
3361 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3362 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3364 /* If the new state has context constraint,
3365 build appropriate states for these contexts. */
3366 if (dest_states
[i
]->has_constraint
)
3368 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3370 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3373 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3374 need_word_trtable
= 1;
3376 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3378 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3383 dest_states_word
[i
] = dest_states
[i
];
3384 dest_states_nl
[i
] = dest_states
[i
];
3386 bitset_merge (acceptable
, dests_ch
[i
]);
3389 if (!BE (need_word_trtable
, 0))
3391 /* We don't care about whether the following character is a word
3392 character, or we are in a single-byte character set so we can
3393 discern by looking at the character code: allocate a
3394 256-entry transition table. */
3395 trtable
= state
->trtable
=
3396 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3397 if (BE (trtable
== NULL
, 0))
3400 /* For all characters ch...: */
3401 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3402 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3404 mask
<<= 1, elem
>>= 1, ++ch
)
3405 if (BE (elem
& 1, 0))
3407 /* There must be exactly one destination which accepts
3408 character ch. See group_nodes_into_DFAstates. */
3409 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3412 /* j-th destination accepts the word character ch. */
3413 if (dfa
->word_char
[i
] & mask
)
3414 trtable
[ch
] = dest_states_word
[j
];
3416 trtable
[ch
] = dest_states
[j
];
3421 /* We care about whether the following character is a word
3422 character, and we are in a multi-byte character set: discern
3423 by looking at the character code: build two 256-entry
3424 transition tables, one starting at trtable[0] and one
3425 starting at trtable[SBC_MAX]. */
3426 trtable
= state
->word_trtable
=
3427 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3428 if (BE (trtable
== NULL
, 0))
3431 /* For all characters ch...: */
3432 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3433 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3435 mask
<<= 1, elem
>>= 1, ++ch
)
3436 if (BE (elem
& 1, 0))
3438 /* There must be exactly one destination which accepts
3439 character ch. See group_nodes_into_DFAstates. */
3440 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3443 /* j-th destination accepts the word character ch. */
3444 trtable
[ch
] = dest_states
[j
];
3445 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3450 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3452 /* The current state accepts newline character. */
3453 for (j
= 0; j
< ndests
; ++j
)
3454 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3456 /* k-th destination accepts newline character. */
3457 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3458 if (need_word_trtable
)
3459 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3460 /* There must be only one destination which accepts
3461 newline. See group_nodes_into_DFAstates. */
3466 if (dest_states_malloced
)
3469 re_node_set_free (&follows
);
3470 for (i
= 0; i
< ndests
; ++i
)
3471 re_node_set_free (dests_node
+ i
);
3473 if (dests_node_malloced
)
3479 /* Group all nodes belonging to STATE into several destinations.
3480 Then for all destinations, set the nodes belonging to the destination
3481 to DESTS_NODE[i] and set the characters accepted by the destination
3482 to DEST_CH[i]. This function return the number of destinations. */
3485 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3486 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3491 int ndests
; /* Number of the destinations from `state'. */
3492 bitset_t accepts
; /* Characters a node can accept. */
3493 const re_node_set
*cur_nodes
= &state
->nodes
;
3494 bitset_empty (accepts
);
3497 /* For all the nodes belonging to `state', */
3498 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3500 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3501 re_token_type_t type
= node
->type
;
3502 unsigned int constraint
= node
->constraint
;
3504 /* Enumerate all single byte character this node can accept. */
3505 if (type
== CHARACTER
)
3506 bitset_set (accepts
, node
->opr
.c
);
3507 else if (type
== SIMPLE_BRACKET
)
3509 bitset_merge (accepts
, node
->opr
.sbcset
);
3511 else if (type
== OP_PERIOD
)
3513 #ifdef RE_ENABLE_I18N
3514 if (dfa
->mb_cur_max
> 1)
3515 bitset_merge (accepts
, dfa
->sb_char
);
3518 bitset_set_all (accepts
);
3519 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3520 bitset_clear (accepts
, '\n');
3521 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3522 bitset_clear (accepts
, '\0');
3524 #ifdef RE_ENABLE_I18N
3525 else if (type
== OP_UTF8_PERIOD
)
3527 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3528 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3529 bitset_clear (accepts
, '\n');
3530 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3531 bitset_clear (accepts
, '\0');
3537 /* Check the `accepts' and sift the characters which are not
3538 match it the context. */
3541 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3543 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3544 bitset_empty (accepts
);
3545 if (accepts_newline
)
3546 bitset_set (accepts
, NEWLINE_CHAR
);
3550 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3552 bitset_empty (accepts
);
3556 if (constraint
& NEXT_WORD_CONSTRAINT
)
3558 bitset_word_t any_set
= 0;
3559 if (type
== CHARACTER
&& !node
->word_char
)
3561 bitset_empty (accepts
);
3564 #ifdef RE_ENABLE_I18N
3565 if (dfa
->mb_cur_max
> 1)
3566 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3567 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3570 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3571 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3575 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3577 bitset_word_t any_set
= 0;
3578 if (type
== CHARACTER
&& node
->word_char
)
3580 bitset_empty (accepts
);
3583 #ifdef RE_ENABLE_I18N
3584 if (dfa
->mb_cur_max
> 1)
3585 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3586 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3589 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3590 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3596 /* Then divide `accepts' into DFA states, or create a new
3597 state. Above, we make sure that accepts is not empty. */
3598 for (j
= 0; j
< ndests
; ++j
)
3600 bitset_t intersec
; /* Intersection sets, see below. */
3602 /* Flags, see below. */
3603 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3605 /* Optimization, skip if this state doesn't accept the character. */
3606 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3609 /* Enumerate the intersection set of this state and `accepts'. */
3611 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3612 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3613 /* And skip if the intersection set is empty. */
3617 /* Then check if this state is a subset of `accepts'. */
3618 not_subset
= not_consumed
= 0;
3619 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3621 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3622 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3625 /* If this state isn't a subset of `accepts', create a
3626 new group state, which has the `remains'. */
3629 bitset_copy (dests_ch
[ndests
], remains
);
3630 bitset_copy (dests_ch
[j
], intersec
);
3631 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3632 if (BE (err
!= REG_NOERROR
, 0))
3637 /* Put the position in the current group. */
3638 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3639 if (BE (result
< 0, 0))
3642 /* If all characters are consumed, go to next node. */
3646 /* Some characters remain, create a new group. */
3649 bitset_copy (dests_ch
[ndests
], accepts
);
3650 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3651 if (BE (err
!= REG_NOERROR
, 0))
3654 bitset_empty (accepts
);
3659 for (j
= 0; j
< ndests
; ++j
)
3660 re_node_set_free (dests_node
+ j
);
3664 #ifdef RE_ENABLE_I18N
3665 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3666 Return the number of the bytes the node accepts.
3667 STR_IDX is the current index of the input string.
3669 This function handles the nodes which can accept one character, or
3670 one collating element like '.', '[a-z]', opposite to the other nodes
3671 can only accept one byte. */
3674 # include <locale/weight.h>
3678 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3679 const re_string_t
*input
, int str_idx
)
3681 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3682 int char_len
, elem_len
;
3685 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3687 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3688 if (BE (c
< 0xc2, 1))
3691 if (str_idx
+ 2 > input
->len
)
3694 d
= re_string_byte_at (input
, str_idx
+ 1);
3696 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3700 if (c
== 0xe0 && d
< 0xa0)
3706 if (c
== 0xf0 && d
< 0x90)
3712 if (c
== 0xf8 && d
< 0x88)
3718 if (c
== 0xfc && d
< 0x84)
3724 if (str_idx
+ char_len
> input
->len
)
3727 for (i
= 1; i
< char_len
; ++i
)
3729 d
= re_string_byte_at (input
, str_idx
+ i
);
3730 if (d
< 0x80 || d
> 0xbf)
3736 char_len
= re_string_char_size_at (input
, str_idx
);
3737 if (node
->type
== OP_PERIOD
)
3741 /* FIXME: I don't think this if is needed, as both '\n'
3742 and '\0' are char_len == 1. */
3743 /* '.' accepts any one character except the following two cases. */
3744 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3745 re_string_byte_at (input
, str_idx
) == '\n') ||
3746 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3747 re_string_byte_at (input
, str_idx
) == '\0'))
3752 elem_len
= re_string_elem_size_at (input
, str_idx
);
3753 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3756 if (node
->type
== COMPLEX_BRACKET
)
3758 const re_charset_t
*cset
= node
->opr
.mbcset
;
3760 const unsigned char *pin
3761 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3766 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3767 ? re_string_wchar_at (input
, str_idx
) : 0);
3769 /* match with multibyte character? */
3770 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3771 if (wc
== cset
->mbchars
[i
])
3773 match_len
= char_len
;
3774 goto check_node_accept_bytes_match
;
3776 /* match with character_class? */
3777 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3779 wctype_t wt
= cset
->char_classes
[i
];
3780 if (__iswctype (wc
, wt
))
3782 match_len
= char_len
;
3783 goto check_node_accept_bytes_match
;
3788 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3791 unsigned int in_collseq
= 0;
3792 const int32_t *table
, *indirect
;
3793 const unsigned char *weights
, *extra
;
3794 const char *collseqwc
;
3796 /* match with collating_symbol? */
3797 if (cset
->ncoll_syms
)
3798 extra
= (const unsigned char *)
3799 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3800 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3802 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3803 /* Compare the length of input collating element and
3804 the length of current collating element. */
3805 if (*coll_sym
!= elem_len
)
3807 /* Compare each bytes. */
3808 for (j
= 0; j
< *coll_sym
; j
++)
3809 if (pin
[j
] != coll_sym
[1 + j
])
3813 /* Match if every bytes is equal. */
3815 goto check_node_accept_bytes_match
;
3821 if (elem_len
<= char_len
)
3823 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3824 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3827 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3829 /* match with range expression? */
3830 for (i
= 0; i
< cset
->nranges
; ++i
)
3831 if (cset
->range_starts
[i
] <= in_collseq
3832 && in_collseq
<= cset
->range_ends
[i
])
3834 match_len
= elem_len
;
3835 goto check_node_accept_bytes_match
;
3838 /* match with equivalence_class? */
3839 if (cset
->nequiv_classes
)
3841 const unsigned char *cp
= pin
;
3842 table
= (const int32_t *)
3843 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3844 weights
= (const unsigned char *)
3845 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3846 extra
= (const unsigned char *)
3847 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3848 indirect
= (const int32_t *)
3849 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3850 int32_t idx
= findidx (table
, indirect
, extra
, &cp
, elem_len
);
3852 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3854 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3855 size_t weight_len
= weights
[idx
& 0xffffff];
3856 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3857 && (idx
>> 24) == (equiv_class_idx
>> 24))
3862 equiv_class_idx
&= 0xffffff;
3864 while (cnt
<= weight_len
3865 && (weights
[equiv_class_idx
+ 1 + cnt
]
3866 == weights
[idx
+ 1 + cnt
]))
3868 if (cnt
> weight_len
)
3870 match_len
= elem_len
;
3871 goto check_node_accept_bytes_match
;
3880 /* match with range expression? */
3882 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3884 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3887 for (i
= 0; i
< cset
->nranges
; ++i
)
3889 cmp_buf
[0] = cset
->range_starts
[i
];
3890 cmp_buf
[4] = cset
->range_ends
[i
];
3891 if (__wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3892 && __wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3894 match_len
= char_len
;
3895 goto check_node_accept_bytes_match
;
3899 check_node_accept_bytes_match
:
3900 if (!cset
->non_match
)
3907 return (elem_len
> char_len
) ? elem_len
: char_len
;
3915 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3917 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3922 /* No valid character. Match it as a single byte character. */
3923 const unsigned char *collseq
= (const unsigned char *)
3924 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3925 return collseq
[mbs
[0]];
3932 const unsigned char *extra
= (const unsigned char *)
3933 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3934 int32_t extrasize
= (const unsigned char *)
3935 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3937 for (idx
= 0; idx
< extrasize
;)
3939 int mbs_cnt
, found
= 0;
3940 int32_t elem_mbs_len
;
3941 /* Skip the name of collating element name. */
3942 idx
= idx
+ extra
[idx
] + 1;
3943 elem_mbs_len
= extra
[idx
++];
3944 if (mbs_len
== elem_mbs_len
)
3946 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3947 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3949 if (mbs_cnt
== elem_mbs_len
)
3950 /* Found the entry. */
3953 /* Skip the byte sequence of the collating element. */
3954 idx
+= elem_mbs_len
;
3955 /* Adjust for the alignment. */
3956 idx
= (idx
+ 3) & ~3;
3957 /* Skip the collation sequence value. */
3958 idx
+= sizeof (uint32_t);
3959 /* Skip the wide char sequence of the collating element. */
3960 idx
= idx
+ sizeof (uint32_t) * (*(int32_t *) (extra
+ idx
) + 1);
3961 /* If we found the entry, return the sequence value. */
3963 return *(uint32_t *) (extra
+ idx
);
3964 /* Skip the collation sequence value. */
3965 idx
+= sizeof (uint32_t);
3971 #endif /* RE_ENABLE_I18N */
3973 /* Check whether the node accepts the byte which is IDX-th
3974 byte of the INPUT. */
3977 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
3981 ch
= re_string_byte_at (&mctx
->input
, idx
);
3985 if (node
->opr
.c
!= ch
)
3989 case SIMPLE_BRACKET
:
3990 if (!bitset_contain (node
->opr
.sbcset
, ch
))
3994 #ifdef RE_ENABLE_I18N
3995 case OP_UTF8_PERIOD
:
4001 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4002 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4010 if (node
->constraint
)
4012 /* The node has constraints. Check whether the current context
4013 satisfies the constraints. */
4014 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4016 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4023 /* Extend the buffers, if the buffers have run out. */
4025 static reg_errcode_t
4026 __attribute_warn_unused_result__
4027 extend_buffers (re_match_context_t
*mctx
, int min_len
)
4030 re_string_t
*pstr
= &mctx
->input
;
4032 /* Avoid overflow. */
4033 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4036 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4037 ret
= re_string_realloc_buffers (pstr
,
4039 MIN (pstr
->len
, pstr
->bufs_len
* 2)));
4040 if (BE (ret
!= REG_NOERROR
, 0))
4043 if (mctx
->state_log
!= NULL
)
4045 /* And double the length of state_log. */
4046 /* XXX We have no indication of the size of this buffer. If this
4047 allocation fail we have no indication that the state_log array
4048 does not have the right size. */
4049 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4050 pstr
->bufs_len
+ 1);
4051 if (BE (new_array
== NULL
, 0))
4053 mctx
->state_log
= new_array
;
4056 /* Then reconstruct the buffers. */
4059 #ifdef RE_ENABLE_I18N
4060 if (pstr
->mb_cur_max
> 1)
4062 ret
= build_wcs_upper_buffer (pstr
);
4063 if (BE (ret
!= REG_NOERROR
, 0))
4067 #endif /* RE_ENABLE_I18N */
4068 build_upper_buffer (pstr
);
4072 #ifdef RE_ENABLE_I18N
4073 if (pstr
->mb_cur_max
> 1)
4074 build_wcs_buffer (pstr
);
4076 #endif /* RE_ENABLE_I18N */
4078 if (pstr
->trans
!= NULL
)
4079 re_string_translate_buffer (pstr
);
4086 /* Functions for matching context. */
4088 /* Initialize MCTX. */
4090 static reg_errcode_t
4091 __attribute_warn_unused_result__
4092 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4094 mctx
->eflags
= eflags
;
4095 mctx
->match_last
= -1;
4098 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4099 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4100 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4103 /* Already zero-ed by the caller.
4105 mctx->bkref_ents = NULL;
4106 mctx->nbkref_ents = 0;
4107 mctx->nsub_tops = 0; */
4108 mctx
->abkref_ents
= n
;
4109 mctx
->max_mb_elem_len
= 1;
4110 mctx
->asub_tops
= n
;
4114 /* Clean the entries which depend on the current input in MCTX.
4115 This function must be invoked when the matcher changes the start index
4116 of the input, or changes the input string. */
4119 match_ctx_clean (re_match_context_t
*mctx
)
4122 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4125 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4126 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4128 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4129 re_free (last
->path
.array
);
4132 re_free (top
->lasts
);
4135 re_free (top
->path
->array
);
4136 re_free (top
->path
);
4141 mctx
->nsub_tops
= 0;
4142 mctx
->nbkref_ents
= 0;
4145 /* Free all the memory associated with MCTX. */
4148 match_ctx_free (re_match_context_t
*mctx
)
4150 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4151 match_ctx_clean (mctx
);
4152 re_free (mctx
->sub_tops
);
4153 re_free (mctx
->bkref_ents
);
4156 /* Add a new backreference entry to MCTX.
4157 Note that we assume that caller never call this function with duplicate
4158 entry, and call with STR_IDX which isn't smaller than any existing entry.
4161 static reg_errcode_t
4162 __attribute_warn_unused_result__
4163 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4166 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4168 struct re_backref_cache_entry
* new_entry
;
4169 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4170 mctx
->abkref_ents
* 2);
4171 if (BE (new_entry
== NULL
, 0))
4173 re_free (mctx
->bkref_ents
);
4176 mctx
->bkref_ents
= new_entry
;
4177 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4178 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4179 mctx
->abkref_ents
*= 2;
4181 if (mctx
->nbkref_ents
> 0
4182 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4183 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4185 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4186 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4187 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4188 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4190 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4191 If bit N is clear, means that this entry won't epsilon-transition to
4192 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4193 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4196 A backreference does not epsilon-transition unless it is empty, so set
4197 to all zeros if FROM != TO. */
4198 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4199 = (from
== to
? ~0 : 0);
4201 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4202 if (mctx
->max_mb_elem_len
< to
- from
)
4203 mctx
->max_mb_elem_len
= to
- from
;
4207 /* Search for the first entry which has the same str_idx, or -1 if none is
4208 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4211 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4213 int left
, right
, mid
, last
;
4214 last
= right
= mctx
->nbkref_ents
;
4215 for (left
= 0; left
< right
;)
4217 mid
= (left
+ right
) / 2;
4218 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4223 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4229 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4232 static reg_errcode_t
4233 __attribute_warn_unused_result__
4234 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4237 assert (mctx
->sub_tops
!= NULL
);
4238 assert (mctx
->asub_tops
> 0);
4240 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4242 int new_asub_tops
= mctx
->asub_tops
* 2;
4243 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4244 re_sub_match_top_t
*,
4246 if (BE (new_array
== NULL
, 0))
4248 mctx
->sub_tops
= new_array
;
4249 mctx
->asub_tops
= new_asub_tops
;
4251 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4252 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4254 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4255 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4259 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4260 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4262 static re_sub_match_last_t
*
4263 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4265 re_sub_match_last_t
*new_entry
;
4266 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4268 int new_alasts
= 2 * subtop
->alasts
+ 1;
4269 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4270 re_sub_match_last_t
*,
4272 if (BE (new_array
== NULL
, 0))
4274 subtop
->lasts
= new_array
;
4275 subtop
->alasts
= new_alasts
;
4277 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4278 if (BE (new_entry
!= NULL
, 1))
4280 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4281 new_entry
->node
= node
;
4282 new_entry
->str_idx
= str_idx
;
4289 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4290 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4292 sctx
->sifted_states
= sifted_sts
;
4293 sctx
->limited_states
= limited_sts
;
4294 sctx
->last_node
= last_node
;
4295 sctx
->last_str_idx
= last_str_idx
;
4296 re_node_set_init_empty (&sctx
->limits
);