1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
);
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
);
55 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
);
56 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
57 int *p_match_first
) internal_function
;
58 static int check_halt_state_context (const re_match_context_t
*mctx
,
59 const re_dfastate_t
*state
, int idx
)
61 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
62 regmatch_t
*prev_idx_match
, int cur_node
,
63 int cur_idx
, int nmatch
) internal_function
;
64 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
65 int str_idx
, int dest_node
, int nregs
,
67 re_node_set
*eps_via_nodes
)
69 static reg_errcode_t
set_regs (const regex_t
*preg
,
70 const re_match_context_t
*mctx
,
71 size_t nmatch
, regmatch_t
*pmatch
,
72 int fl_backtrack
) internal_function
;
73 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
77 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
78 re_sift_context_t
*sctx
,
79 int node_idx
, int str_idx
, int max_str_idx
)
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
83 re_sift_context_t
*sctx
)
85 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
86 re_sift_context_t
*sctx
, int str_idx
,
87 re_node_set
*cur_dest
)
89 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
90 re_sift_context_t
*sctx
,
92 re_node_set
*dest_nodes
)
94 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
95 re_node_set
*dest_nodes
,
96 const re_node_set
*candidates
)
98 static int check_dst_limits (const re_match_context_t
*mctx
,
100 int dst_node
, int dst_idx
, int src_node
,
101 int src_idx
) internal_function
;
102 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
103 int boundaries
, int subexp_idx
,
104 int from_node
, int bkref_idx
)
106 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
107 int limit
, int subexp_idx
,
108 int node
, int str_idx
,
109 int bkref_idx
) internal_function
;
110 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
111 re_node_set
*dest_nodes
,
112 const re_node_set
*candidates
,
114 struct re_backref_cache_entry
*bkref_ents
,
115 int str_idx
) internal_function
;
116 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
117 re_sift_context_t
*sctx
,
118 int str_idx
, const re_node_set
*candidates
)
120 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
122 re_dfastate_t
**src
, int num
)
124 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
125 re_match_context_t
*mctx
) internal_function
;
126 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
127 re_match_context_t
*mctx
,
128 re_dfastate_t
*state
) internal_function
;
129 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
130 re_match_context_t
*mctx
,
131 re_dfastate_t
*next_state
)
133 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
134 re_node_set
*cur_nodes
,
135 int str_idx
) internal_function
;
137 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
138 re_match_context_t
*mctx
,
139 re_dfastate_t
*pstate
)
142 #ifdef RE_ENABLE_I18N
143 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
144 re_dfastate_t
*pstate
)
146 #endif /* RE_ENABLE_I18N */
147 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
148 const re_node_set
*nodes
)
150 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
151 int bkref_node
, int bkref_str_idx
)
153 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
154 const re_sub_match_top_t
*sub_top
,
155 re_sub_match_last_t
*sub_last
,
156 int bkref_node
, int bkref_str
)
158 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
159 int subexp_idx
, int type
) internal_function
;
160 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
161 state_array_t
*path
, int top_node
,
162 int top_str
, int last_node
, int last_str
,
163 int type
) internal_function
;
164 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
166 re_node_set
*cur_nodes
,
167 re_node_set
*next_nodes
)
169 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
170 re_node_set
*cur_nodes
,
171 int ex_subexp
, int type
)
173 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
174 re_node_set
*dst_nodes
,
175 int target
, int ex_subexp
,
176 int type
) internal_function
;
177 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
178 re_node_set
*cur_nodes
, int cur_str
,
179 int subexp_num
, int type
)
181 static int build_trtable (const re_dfa_t
*dfa
,
182 re_dfastate_t
*state
) internal_function
;
183 #ifdef RE_ENABLE_I18N
184 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
185 const re_string_t
*input
, int idx
)
188 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
192 #endif /* RE_ENABLE_I18N */
193 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
194 const re_dfastate_t
*state
,
195 re_node_set
*states_node
,
196 bitset_t
*states_ch
) internal_function
;
197 static int check_node_accept (const re_match_context_t
*mctx
,
198 const re_token_t
*node
, int idx
)
200 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
203 /* Entry point for POSIX code. */
205 /* regexec searches for a given pattern, specified by PREG, in the
208 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
209 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
210 least NMATCH elements, and we set them to the offsets of the
211 corresponding matched substrings.
213 EFLAGS specifies `execution flags' which affect matching: if
214 REG_NOTBOL is set, then ^ does not match at the beginning of the
215 string; if REG_NOTEOL is set, then $ does not match at the end.
217 We return 0 if we find a match and REG_NOMATCH if not. */
221 const regex_t
*__restrict preg
,
222 const char *__restrict string
,
230 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
233 if (eflags
& REG_STARTEND
)
235 start
= pmatch
[0].rm_so
;
236 length
= pmatch
[0].rm_eo
;
241 length
= strlen (string
);
244 __libc_lock_lock (dfa
->lock
);
246 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
247 length
, 0, NULL
, eflags
);
249 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
250 length
, nmatch
, pmatch
, eflags
);
251 __libc_lock_unlock (dfa
->lock
);
252 return err
!= REG_NOERROR
;
256 # include <shlib-compat.h>
257 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
259 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
260 __typeof__ (__regexec
) __compat_regexec
;
263 attribute_compat_text_section
264 __compat_regexec (const regex_t
*__restrict preg
,
265 const char *__restrict string
, size_t nmatch
,
266 regmatch_t pmatch
[], int eflags
)
268 return regexec (preg
, string
, nmatch
, pmatch
,
269 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
271 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
275 /* Entry points for GNU code. */
277 /* re_match, re_search, re_match_2, re_search_2
279 The former two functions operate on STRING with length LENGTH,
280 while the later two operate on concatenation of STRING1 and STRING2
281 with lengths LENGTH1 and LENGTH2, respectively.
283 re_match() matches the compiled pattern in BUFP against the string,
284 starting at index START.
286 re_search() first tries matching at index START, then it tries to match
287 starting from index START + 1, and so on. The last start position tried
288 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
291 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
292 the first STOP characters of the concatenation of the strings should be
295 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
296 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
297 computed relative to the concatenation, not relative to the individual
300 On success, re_match* functions return the length of the match, re_search*
301 return the position of the start of the match. Return value -1 means no
302 match was found and -2 indicates an internal error. */
305 re_match (struct re_pattern_buffer
*bufp
,
309 struct re_registers
*regs
)
311 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
314 weak_alias (__re_match
, re_match
)
318 re_search (struct re_pattern_buffer
*bufp
,
320 int length
, int start
, int range
,
321 struct re_registers
*regs
)
323 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
326 weak_alias (__re_search
, re_search
)
330 re_match_2 (struct re_pattern_buffer
*bufp
,
331 const char *string1
, int length1
,
332 const char *string2
, int length2
, int start
,
333 struct re_registers
*regs
, int stop
)
335 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
336 start
, 0, regs
, stop
, 1);
339 weak_alias (__re_match_2
, re_match_2
)
343 re_search_2 (struct re_pattern_buffer
*bufp
,
344 const char *string1
, int length1
,
345 const char *string2
, int length2
, int start
,
346 int range
, struct re_registers
*regs
, int stop
)
348 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
349 start
, range
, regs
, stop
, 0);
352 weak_alias (__re_search_2
, re_search_2
)
356 re_search_2_stub (struct re_pattern_buffer
*bufp
,
357 const char *string1
, int length1
,
358 const char *string2
, int length2
, int start
,
359 int range
, struct re_registers
*regs
,
360 int stop
, int ret_len
)
364 int len
= length1
+ length2
;
367 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
370 /* Concatenate the strings. */
374 char *s
= re_malloc (char, len
);
376 if (BE (s
== NULL
, 0))
378 memcpy (s
, string1
, length1
);
379 memcpy (s
+ length1
, string2
, length2
);
388 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
390 re_free ((char *) str
);
394 /* The parameters have the same meaning as those of re_search.
395 Additional parameters:
396 If RET_LEN is nonzero the length of the match is returned (re_match style);
397 otherwise the position of the match is returned. */
400 re_search_stub (struct re_pattern_buffer
*bufp
,
401 const char *string
, int length
, int start
,
403 struct re_registers
*regs
, int ret_len
)
405 reg_errcode_t result
;
410 /* Check for out-of-range. */
411 if (BE (start
< 0 || start
> length
, 0))
413 if (BE (start
+ range
> length
, 0))
414 range
= length
- start
;
415 else if (BE (start
+ range
< 0, 0))
418 __libc_lock_lock (dfa
->lock
);
420 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
421 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
423 /* Compile fastmap if we haven't yet. */
424 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
425 re_compile_fastmap (bufp
);
427 if (BE (bufp
->no_sub
, 0))
430 /* We need at least 1 register. */
433 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
434 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
436 nregs
= regs
->num_regs
;
437 if (BE (nregs
< 1, 0))
439 /* Nothing can be copied to regs. */
445 nregs
= bufp
->re_nsub
+ 1;
446 pmatch
= re_malloc (regmatch_t
, nregs
);
447 if (BE (pmatch
== NULL
, 0))
453 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
454 nregs
, pmatch
, eflags
);
458 /* I hope we needn't fill their regs with -1's when no match was found. */
459 if (result
!= REG_NOERROR
)
461 else if (regs
!= NULL
)
463 /* If caller wants register contents data back, copy them. */
464 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
465 bufp
->regs_allocated
);
466 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
470 if (BE (rval
== 0, 1))
474 assert (pmatch
[0].rm_so
== start
);
475 rval
= pmatch
[0].rm_eo
- start
;
478 rval
= pmatch
[0].rm_so
;
482 __libc_lock_unlock (dfa
->lock
);
487 re_copy_regs (struct re_registers
*regs
,
489 int nregs
, int regs_allocated
)
491 int rval
= REGS_REALLOCATE
;
493 int need_regs
= nregs
+ 1;
494 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
497 /* Have the register data arrays been allocated? */
498 if (regs_allocated
== REGS_UNALLOCATED
)
499 { /* No. So allocate them with malloc. */
500 regs
->start
= re_malloc (regoff_t
, need_regs
);
501 if (BE (regs
->start
== NULL
, 0))
502 return REGS_UNALLOCATED
;
503 regs
->end
= re_malloc (regoff_t
, need_regs
);
504 if (BE (regs
->end
== NULL
, 0))
506 re_free (regs
->start
);
507 return REGS_UNALLOCATED
;
509 regs
->num_regs
= need_regs
;
511 else if (regs_allocated
== REGS_REALLOCATE
)
512 { /* Yes. If we need more elements than were already
513 allocated, reallocate them. If we need fewer, just
515 if (BE (need_regs
> regs
->num_regs
, 0))
517 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
519 if (BE (new_start
== NULL
, 0))
520 return REGS_UNALLOCATED
;
521 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
522 if (BE (new_end
== NULL
, 0))
525 return REGS_UNALLOCATED
;
527 regs
->start
= new_start
;
529 regs
->num_regs
= need_regs
;
534 assert (regs_allocated
== REGS_FIXED
);
535 /* This function may not be called with REGS_FIXED and nregs too big. */
536 assert (regs
->num_regs
>= nregs
);
541 for (i
= 0; i
< nregs
; ++i
)
543 regs
->start
[i
] = pmatch
[i
].rm_so
;
544 regs
->end
[i
] = pmatch
[i
].rm_eo
;
546 for ( ; i
< regs
->num_regs
; ++i
)
547 regs
->start
[i
] = regs
->end
[i
] = -1;
552 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
553 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
554 this memory for recording register information. STARTS and ENDS
555 must be allocated using the malloc library routine, and must each
556 be at least NUM_REGS * sizeof (regoff_t) bytes long.
558 If NUM_REGS == 0, then subsequent matches should allocate their own
561 Unless this function is called, the first search or match using
562 PATTERN_BUFFER will allocate its own register data, without
563 freeing the old data. */
566 re_set_registers (struct re_pattern_buffer
*bufp
,
567 struct re_registers
*regs
,
574 bufp
->regs_allocated
= REGS_REALLOCATE
;
575 regs
->num_regs
= num_regs
;
576 regs
->start
= starts
;
581 bufp
->regs_allocated
= REGS_UNALLOCATED
;
583 regs
->start
= regs
->end
= (regoff_t
*) 0;
587 weak_alias (__re_set_registers
, re_set_registers
)
590 /* Entry points compatible with 4.2 BSD regex library. We don't define
591 them unless specifically requested. */
593 #if defined _REGEX_RE_COMP || defined _LIBC
601 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
603 #endif /* _REGEX_RE_COMP */
605 /* Internal entry point. */
607 /* Searches for a compiled pattern PREG in the string STRING, whose
608 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
609 mingings with regexec. START, and RANGE have the same meanings
611 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
612 otherwise return the error code.
613 Note: We assume front end functions already check ranges.
614 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
617 re_search_internal (const regex_t
*preg
,
619 int length
, int start
, int range
, int stop
,
620 size_t nmatch
, regmatch_t pmatch
[],
624 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
625 int left_lim
, right_lim
, incr
;
626 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
629 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
630 re_match_context_t mctx
= { .dfa
= dfa
};
632 re_match_context_t mctx
;
634 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
635 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
636 RE_TRANSLATE_TYPE t
= preg
->translate
;
638 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
639 memset (&mctx
, '\0', sizeof (re_match_context_t
));
643 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
644 nmatch
-= extra_nmatch
;
646 /* Check if the DFA haven't been compiled. */
647 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
648 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
649 || dfa
->init_state_begbuf
== NULL
, 0))
653 /* We assume front-end functions already check them. */
654 assert (start
+ range
>= 0 && start
+ range
<= length
);
657 /* If initial states with non-begbuf contexts have no elements,
658 the regex must be anchored. If preg->newline_anchor is set,
659 we'll never use init_state_nl, so do not check it. */
660 if (dfa
->init_state
->nodes
.nelem
== 0
661 && dfa
->init_state_word
->nodes
.nelem
== 0
662 && (dfa
->init_state_nl
->nodes
.nelem
== 0
663 || !preg
->newline_anchor
))
665 if (start
!= 0 && start
+ range
!= 0)
670 /* We must check the longest matching, if nmatch > 0. */
671 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
673 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
674 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
675 if (BE (err
!= REG_NOERROR
, 0))
677 mctx
.input
.stop
= stop
;
678 mctx
.input
.raw_stop
= stop
;
679 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
681 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
682 if (BE (err
!= REG_NOERROR
, 0))
685 /* We will log all the DFA states through which the dfa pass,
686 if nmatch > 1, or this dfa has "multibyte node", which is a
687 back-reference or a node which can accept multibyte character or
688 multi character collating element. */
689 if (nmatch
> 1 || dfa
->has_mb_node
)
691 /* Avoid overflow. */
692 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
698 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
699 if (BE (mctx
.state_log
== NULL
, 0))
706 mctx
.state_log
= NULL
;
709 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
710 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
712 /* Check incrementally whether of not the input string match. */
713 incr
= (range
< 0) ? -1 : 1;
714 left_lim
= (range
< 0) ? start
+ range
: start
;
715 right_lim
= (range
< 0) ? start
: start
+ range
;
716 sb
= dfa
->mb_cur_max
== 1;
719 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
720 | (range
>= 0 ? 2 : 0)
721 | (t
!= NULL
? 1 : 0))
724 for (;; match_first
+= incr
)
727 if (match_first
< left_lim
|| right_lim
< match_first
)
730 /* Advance as rapidly as possible through the string, until we
731 find a plausible place to start matching. This may be done
732 with varying efficiency, so there are various possibilities:
733 only the most common of them are specialized, in order to
734 save on code size. We use a switch statement for speed. */
742 /* Fastmap with single-byte translation, match forward. */
743 while (BE (match_first
< right_lim
, 1)
744 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
746 goto forward_match_found_start_or_reached_end
;
749 /* Fastmap without translation, match forward. */
750 while (BE (match_first
< right_lim
, 1)
751 && !fastmap
[(unsigned char) string
[match_first
]])
754 forward_match_found_start_or_reached_end
:
755 if (BE (match_first
== right_lim
, 0))
757 ch
= match_first
>= length
758 ? 0 : (unsigned char) string
[match_first
];
759 if (!fastmap
[t
? t
[ch
] : ch
])
766 /* Fastmap without multi-byte translation, match backwards. */
767 while (match_first
>= left_lim
)
769 ch
= match_first
>= length
770 ? 0 : (unsigned char) string
[match_first
];
771 if (fastmap
[t
? t
[ch
] : ch
])
775 if (match_first
< left_lim
)
780 /* In this case, we can't determine easily the current byte,
781 since it might be a component byte of a multibyte
782 character. Then we use the constructed buffer instead. */
785 /* If MATCH_FIRST is out of the valid range, reconstruct the
787 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
788 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
790 err
= re_string_reconstruct (&mctx
.input
, match_first
,
792 if (BE (err
!= REG_NOERROR
, 0))
795 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
797 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
798 Note that MATCH_FIRST must not be smaller than 0. */
799 ch
= (match_first
>= length
800 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
804 if (match_first
< left_lim
|| match_first
> right_lim
)
813 /* Reconstruct the buffers so that the matcher can assume that
814 the matching starts from the beginning of the buffer. */
815 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
816 if (BE (err
!= REG_NOERROR
, 0))
819 #ifdef RE_ENABLE_I18N
820 /* Don't consider this char as a possible match start if it part,
821 yet isn't the head, of a multibyte character. */
822 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
826 /* It seems to be appropriate one, then use the matcher. */
827 /* We assume that the matching starts from 0. */
828 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
829 match_last
= check_matching (&mctx
, fl_longest_match
,
830 range
>= 0 ? &match_first
: NULL
);
831 if (match_last
!= -1)
833 if (BE (match_last
== -2, 0))
840 mctx
.match_last
= match_last
;
841 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
843 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
844 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
847 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
850 err
= prune_impossible_nodes (&mctx
);
851 if (err
== REG_NOERROR
)
853 if (BE (err
!= REG_NOMATCH
, 0))
858 break; /* We found a match. */
862 match_ctx_clean (&mctx
);
866 assert (match_last
!= -1);
867 assert (err
== REG_NOERROR
);
870 /* Set pmatch[] if we need. */
875 /* Initialize registers. */
876 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
877 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
879 /* Set the points where matching start/end. */
881 pmatch
[0].rm_eo
= mctx
.match_last
;
883 if (!preg
->no_sub
&& nmatch
> 1)
885 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
886 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
887 if (BE (err
!= REG_NOERROR
, 0))
891 /* At last, add the offset to the each registers, since we slided
892 the buffers so that we could assume that the matching starts
894 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
895 if (pmatch
[reg_idx
].rm_so
!= -1)
897 #ifdef RE_ENABLE_I18N
898 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
900 pmatch
[reg_idx
].rm_so
=
901 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
902 ? mctx
.input
.valid_raw_len
903 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
904 pmatch
[reg_idx
].rm_eo
=
905 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
906 ? mctx
.input
.valid_raw_len
907 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
910 assert (mctx
.input
.offsets_needed
== 0);
912 pmatch
[reg_idx
].rm_so
+= match_first
;
913 pmatch
[reg_idx
].rm_eo
+= match_first
;
915 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
917 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
918 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
922 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
923 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
925 pmatch
[reg_idx
+ 1].rm_so
926 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
927 pmatch
[reg_idx
+ 1].rm_eo
928 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
933 re_free (mctx
.state_log
);
935 match_ctx_free (&mctx
);
936 re_string_destruct (&mctx
.input
);
941 prune_impossible_nodes (re_match_context_t
*mctx
)
943 const re_dfa_t
*const dfa
= mctx
->dfa
;
944 int halt_node
, match_last
;
946 re_dfastate_t
**sifted_states
;
947 re_dfastate_t
**lim_states
= NULL
;
948 re_sift_context_t sctx
;
950 assert (mctx
->state_log
!= NULL
);
952 match_last
= mctx
->match_last
;
953 halt_node
= mctx
->last_node
;
955 /* Avoid overflow. */
956 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
959 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
960 if (BE (sifted_states
== NULL
, 0))
967 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
968 if (BE (lim_states
== NULL
, 0))
975 memset (lim_states
, '\0',
976 sizeof (re_dfastate_t
*) * (match_last
+ 1));
977 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
979 ret
= sift_states_backward (mctx
, &sctx
);
980 re_node_set_free (&sctx
.limits
);
981 if (BE (ret
!= REG_NOERROR
, 0))
983 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
993 } while (mctx
->state_log
[match_last
] == NULL
994 || !mctx
->state_log
[match_last
]->halt
);
995 halt_node
= check_halt_state_context (mctx
,
996 mctx
->state_log
[match_last
],
999 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1001 re_free (lim_states
);
1003 if (BE (ret
!= REG_NOERROR
, 0))
1008 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1009 ret
= sift_states_backward (mctx
, &sctx
);
1010 re_node_set_free (&sctx
.limits
);
1011 if (BE (ret
!= REG_NOERROR
, 0))
1013 if (sifted_states
[0] == NULL
)
1019 re_free (mctx
->state_log
);
1020 mctx
->state_log
= sifted_states
;
1021 sifted_states
= NULL
;
1022 mctx
->last_node
= halt_node
;
1023 mctx
->match_last
= match_last
;
1026 re_free (sifted_states
);
1027 re_free (lim_states
);
1031 /* Acquire an initial state and return it.
1032 We must select appropriate initial state depending on the context,
1033 since initial states may have constraints like "\<", "^", etc.. */
1035 static inline re_dfastate_t
*
1036 __attribute ((always_inline
)) internal_function
1037 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1040 const re_dfa_t
*const dfa
= mctx
->dfa
;
1041 if (dfa
->init_state
->has_constraint
)
1043 unsigned int context
;
1044 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1045 if (IS_WORD_CONTEXT (context
))
1046 return dfa
->init_state_word
;
1047 else if (IS_ORDINARY_CONTEXT (context
))
1048 return dfa
->init_state
;
1049 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1050 return dfa
->init_state_begbuf
;
1051 else if (IS_NEWLINE_CONTEXT (context
))
1052 return dfa
->init_state_nl
;
1053 else if (IS_BEGBUF_CONTEXT (context
))
1055 /* It is relatively rare case, then calculate on demand. */
1056 return re_acquire_state_context (err
, dfa
,
1057 dfa
->init_state
->entrance_nodes
,
1061 /* Must not happen? */
1062 return dfa
->init_state
;
1065 return dfa
->init_state
;
1068 /* Check whether the regular expression match input string INPUT or not,
1069 and return the index where the matching end, return -1 if not match,
1070 or return -2 in case of an error.
1071 FL_LONGEST_MATCH means we want the POSIX longest matching.
1072 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1073 next place where we may want to try matching.
1074 Note that the matcher assume that the matching starts from the current
1075 index of the buffer. */
1079 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1082 const re_dfa_t
*const dfa
= mctx
->dfa
;
1085 int match_last
= -1;
1086 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1087 re_dfastate_t
*cur_state
;
1088 int at_init_state
= p_match_first
!= NULL
;
1089 int next_start_idx
= cur_str_idx
;
1092 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1093 /* An initial state must not be NULL (invalid). */
1094 if (BE (cur_state
== NULL
, 0))
1096 assert (err
== REG_ESPACE
);
1100 if (mctx
->state_log
!= NULL
)
1102 mctx
->state_log
[cur_str_idx
] = cur_state
;
1104 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1105 later. E.g. Processing back references. */
1106 if (BE (dfa
->nbackref
, 0))
1109 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1110 if (BE (err
!= REG_NOERROR
, 0))
1113 if (cur_state
->has_backref
)
1115 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1116 if (BE (err
!= REG_NOERROR
, 0))
1122 /* If the RE accepts NULL string. */
1123 if (BE (cur_state
->halt
, 0))
1125 if (!cur_state
->has_constraint
1126 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1128 if (!fl_longest_match
)
1132 match_last
= cur_str_idx
;
1138 while (!re_string_eoi (&mctx
->input
))
1140 re_dfastate_t
*old_state
= cur_state
;
1141 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1143 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1144 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1145 && mctx
->input
.valid_len
< mctx
->input
.len
))
1147 err
= extend_buffers (mctx
);
1148 if (BE (err
!= REG_NOERROR
, 0))
1150 assert (err
== REG_ESPACE
);
1155 cur_state
= transit_state (&err
, mctx
, cur_state
);
1156 if (mctx
->state_log
!= NULL
)
1157 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1159 if (cur_state
== NULL
)
1161 /* Reached the invalid state or an error. Try to recover a valid
1162 state using the state log, if available and if we have not
1163 already found a valid (even if not the longest) match. */
1164 if (BE (err
!= REG_NOERROR
, 0))
1167 if (mctx
->state_log
== NULL
1168 || (match
&& !fl_longest_match
)
1169 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1173 if (BE (at_init_state
, 0))
1175 if (old_state
== cur_state
)
1176 next_start_idx
= next_char_idx
;
1181 if (cur_state
->halt
)
1183 /* Reached a halt state.
1184 Check the halt state can satisfy the current context. */
1185 if (!cur_state
->has_constraint
1186 || check_halt_state_context (mctx
, cur_state
,
1187 re_string_cur_idx (&mctx
->input
)))
1189 /* We found an appropriate halt state. */
1190 match_last
= re_string_cur_idx (&mctx
->input
);
1193 /* We found a match, do not modify match_first below. */
1194 p_match_first
= NULL
;
1195 if (!fl_longest_match
)
1202 *p_match_first
+= next_start_idx
;
1207 /* Check NODE match the current context. */
1211 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1213 re_token_type_t type
= dfa
->nodes
[node
].type
;
1214 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1215 if (type
!= END_OF_RE
)
1219 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1224 /* Check the halt state STATE match the current context.
1225 Return 0 if not match, if the node, STATE has, is a halt node and
1226 match the context, return the node. */
1230 check_halt_state_context (const re_match_context_t
*mctx
,
1231 const re_dfastate_t
*state
, int idx
)
1234 unsigned int context
;
1236 assert (state
->halt
);
1238 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1239 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1240 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1241 return state
->nodes
.elems
[i
];
1245 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1246 corresponding to the DFA).
1247 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1252 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1253 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1254 struct re_fail_stack_t
*fs
)
1256 const re_dfa_t
*const dfa
= mctx
->dfa
;
1258 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1260 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1261 re_node_set
*edests
= &dfa
->edests
[node
];
1263 err
= re_node_set_insert (eps_via_nodes
, node
);
1264 if (BE (err
< 0, 0))
1266 /* Pick up a valid destination, or return -1 if none is found. */
1267 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1269 int candidate
= edests
->elems
[i
];
1270 if (!re_node_set_contains (cur_nodes
, candidate
))
1272 if (dest_node
== -1)
1273 dest_node
= candidate
;
1277 /* In order to avoid infinite loop like "(a*)*", return the second
1278 epsilon-transition if the first was already considered. */
1279 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1282 /* Otherwise, push the second epsilon-transition on the fail stack. */
1284 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1288 /* We know we are going to exit. */
1297 re_token_type_t type
= dfa
->nodes
[node
].type
;
1299 #ifdef RE_ENABLE_I18N
1300 if (dfa
->nodes
[node
].accept_mb
)
1301 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1303 #endif /* RE_ENABLE_I18N */
1304 if (type
== OP_BACK_REF
)
1306 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1307 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1310 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1314 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1315 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1324 err
= re_node_set_insert (eps_via_nodes
, node
);
1325 if (BE (err
< 0, 0))
1327 dest_node
= dfa
->edests
[node
].elems
[0];
1328 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1335 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1337 int dest_node
= dfa
->nexts
[node
];
1338 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1339 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1340 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1343 re_node_set_empty (eps_via_nodes
);
1350 static reg_errcode_t
1352 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1353 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1356 int num
= fs
->num
++;
1357 if (fs
->num
== fs
->alloc
)
1359 struct re_fail_stack_ent_t
*new_array
;
1360 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1362 if (new_array
== NULL
)
1365 fs
->stack
= new_array
;
1367 fs
->stack
[num
].idx
= str_idx
;
1368 fs
->stack
[num
].node
= dest_node
;
1369 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1370 if (fs
->stack
[num
].regs
== NULL
)
1372 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1373 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1379 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1380 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1382 int num
= --fs
->num
;
1384 *pidx
= fs
->stack
[num
].idx
;
1385 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1386 re_node_set_free (eps_via_nodes
);
1387 re_free (fs
->stack
[num
].regs
);
1388 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1389 return fs
->stack
[num
].node
;
1392 /* Set the positions where the subexpressions are starts/ends to registers
1394 Note: We assume that pmatch[0] is already set, and
1395 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1397 static reg_errcode_t
1399 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1400 regmatch_t
*pmatch
, int fl_backtrack
)
1402 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1404 re_node_set eps_via_nodes
;
1405 struct re_fail_stack_t
*fs
;
1406 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1407 regmatch_t
*prev_idx_match
;
1408 int prev_idx_match_malloced
= 0;
1411 assert (nmatch
> 1);
1412 assert (mctx
->state_log
!= NULL
);
1417 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1418 if (fs
->stack
== NULL
)
1424 cur_node
= dfa
->init_node
;
1425 re_node_set_init_empty (&eps_via_nodes
);
1428 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1429 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1433 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1434 if (prev_idx_match
== NULL
)
1436 free_fail_stack_return (fs
);
1439 prev_idx_match_malloced
= 1;
1441 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1443 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1445 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1447 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1452 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1453 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1455 if (reg_idx
== nmatch
)
1457 re_node_set_free (&eps_via_nodes
);
1458 if (prev_idx_match_malloced
)
1459 re_free (prev_idx_match
);
1460 return free_fail_stack_return (fs
);
1462 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1467 re_node_set_free (&eps_via_nodes
);
1468 if (prev_idx_match_malloced
)
1469 re_free (prev_idx_match
);
1474 /* Proceed to next node. */
1475 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1476 &eps_via_nodes
, fs
);
1478 if (BE (cur_node
< 0, 0))
1480 if (BE (cur_node
== -2, 0))
1482 re_node_set_free (&eps_via_nodes
);
1483 if (prev_idx_match_malloced
)
1484 re_free (prev_idx_match
);
1485 free_fail_stack_return (fs
);
1489 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1493 re_node_set_free (&eps_via_nodes
);
1494 if (prev_idx_match_malloced
)
1495 re_free (prev_idx_match
);
1500 re_node_set_free (&eps_via_nodes
);
1501 if (prev_idx_match_malloced
)
1502 re_free (prev_idx_match
);
1503 return free_fail_stack_return (fs
);
1506 static reg_errcode_t
1508 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1513 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1515 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1516 re_free (fs
->stack
[fs_idx
].regs
);
1518 re_free (fs
->stack
);
1525 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1526 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1528 int type
= dfa
->nodes
[cur_node
].type
;
1529 if (type
== OP_OPEN_SUBEXP
)
1531 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1533 /* We are at the first node of this sub expression. */
1534 if (reg_num
< nmatch
)
1536 pmatch
[reg_num
].rm_so
= cur_idx
;
1537 pmatch
[reg_num
].rm_eo
= -1;
1540 else if (type
== OP_CLOSE_SUBEXP
)
1542 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1543 if (reg_num
< nmatch
)
1545 /* We are at the last node of this sub expression. */
1546 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1548 pmatch
[reg_num
].rm_eo
= cur_idx
;
1549 /* This is a non-empty match or we are not inside an optional
1550 subexpression. Accept this right away. */
1551 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1555 if (dfa
->nodes
[cur_node
].opt_subexp
1556 && prev_idx_match
[reg_num
].rm_so
!= -1)
1557 /* We transited through an empty match for an optional
1558 subexpression, like (a?)*, and this is not the subexp's
1559 first match. Copy back the old content of the registers
1560 so that matches of an inner subexpression are undone as
1561 well, like in ((a?))*. */
1562 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1564 /* We completed a subexpression, but it may be part of
1565 an optional one, so do not update PREV_IDX_MATCH. */
1566 pmatch
[reg_num
].rm_eo
= cur_idx
;
1572 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1573 and sift the nodes in each states according to the following rules.
1574 Updated state_log will be wrote to STATE_LOG.
1576 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1577 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1578 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1579 the LAST_NODE, we throw away the node `a'.
1580 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1581 string `s' and transit to `b':
1582 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1584 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1585 thrown away, we throw away the node `a'.
1586 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1587 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1589 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1590 we throw away the node `a'. */
1592 #define STATE_NODE_CONTAINS(state,node) \
1593 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1595 static reg_errcode_t
1597 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1601 int str_idx
= sctx
->last_str_idx
;
1602 re_node_set cur_dest
;
1605 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1608 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1609 transit to the last_node and the last_node itself. */
1610 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1611 if (BE (err
!= REG_NOERROR
, 0))
1613 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1614 if (BE (err
!= REG_NOERROR
, 0))
1617 /* Then check each states in the state_log. */
1620 /* Update counters. */
1621 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1622 if (null_cnt
> mctx
->max_mb_elem_len
)
1624 memset (sctx
->sifted_states
, '\0',
1625 sizeof (re_dfastate_t
*) * str_idx
);
1626 re_node_set_free (&cur_dest
);
1629 re_node_set_empty (&cur_dest
);
1632 if (mctx
->state_log
[str_idx
])
1634 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1635 if (BE (err
!= REG_NOERROR
, 0))
1639 /* Add all the nodes which satisfy the following conditions:
1640 - It can epsilon transit to a node in CUR_DEST.
1642 And update state_log. */
1643 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1644 if (BE (err
!= REG_NOERROR
, 0))
1649 re_node_set_free (&cur_dest
);
1653 static reg_errcode_t
1655 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1656 int str_idx
, re_node_set
*cur_dest
)
1658 const re_dfa_t
*const dfa
= mctx
->dfa
;
1659 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1662 /* Then build the next sifted state.
1663 We build the next sifted state on `cur_dest', and update
1664 `sifted_states[str_idx]' with `cur_dest'.
1666 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1667 `cur_src' points the node_set of the old `state_log[str_idx]'
1668 (with the epsilon nodes pre-filtered out). */
1669 for (i
= 0; i
< cur_src
->nelem
; i
++)
1671 int prev_node
= cur_src
->elems
[i
];
1676 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1677 assert (!IS_EPSILON_NODE (type
));
1679 #ifdef RE_ENABLE_I18N
1680 /* If the node may accept `multi byte'. */
1681 if (dfa
->nodes
[prev_node
].accept_mb
)
1682 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1683 str_idx
, sctx
->last_str_idx
);
1684 #endif /* RE_ENABLE_I18N */
1686 /* We don't check backreferences here.
1687 See update_cur_sifted_state(). */
1689 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1690 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1691 dfa
->nexts
[prev_node
]))
1697 if (sctx
->limits
.nelem
)
1699 int to_idx
= str_idx
+ naccepted
;
1700 if (check_dst_limits (mctx
, &sctx
->limits
,
1701 dfa
->nexts
[prev_node
], to_idx
,
1702 prev_node
, str_idx
))
1705 ret
= re_node_set_insert (cur_dest
, prev_node
);
1706 if (BE (ret
== -1, 0))
1713 /* Helper functions. */
1715 static reg_errcode_t
1717 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1719 int top
= mctx
->state_log_top
;
1721 if (next_state_log_idx
>= mctx
->input
.bufs_len
1722 || (next_state_log_idx
>= mctx
->input
.valid_len
1723 && mctx
->input
.valid_len
< mctx
->input
.len
))
1726 err
= extend_buffers (mctx
);
1727 if (BE (err
!= REG_NOERROR
, 0))
1731 if (top
< next_state_log_idx
)
1733 memset (mctx
->state_log
+ top
+ 1, '\0',
1734 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1735 mctx
->state_log_top
= next_state_log_idx
;
1740 static reg_errcode_t
1742 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1743 re_dfastate_t
**src
, int num
)
1747 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1749 if (dst
[st_idx
] == NULL
)
1750 dst
[st_idx
] = src
[st_idx
];
1751 else if (src
[st_idx
] != NULL
)
1753 re_node_set merged_set
;
1754 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1755 &src
[st_idx
]->nodes
);
1756 if (BE (err
!= REG_NOERROR
, 0))
1758 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1759 re_node_set_free (&merged_set
);
1760 if (BE (err
!= REG_NOERROR
, 0))
1767 static reg_errcode_t
1769 update_cur_sifted_state (const re_match_context_t
*mctx
,
1770 re_sift_context_t
*sctx
, int str_idx
,
1771 re_node_set
*dest_nodes
)
1773 const re_dfa_t
*const dfa
= mctx
->dfa
;
1774 reg_errcode_t err
= REG_NOERROR
;
1775 const re_node_set
*candidates
;
1776 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1777 : &mctx
->state_log
[str_idx
]->nodes
);
1779 if (dest_nodes
->nelem
== 0)
1780 sctx
->sifted_states
[str_idx
] = NULL
;
1785 /* At first, add the nodes which can epsilon transit to a node in
1787 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1788 if (BE (err
!= REG_NOERROR
, 0))
1791 /* Then, check the limitations in the current sift_context. */
1792 if (sctx
->limits
.nelem
)
1794 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1795 mctx
->bkref_ents
, str_idx
);
1796 if (BE (err
!= REG_NOERROR
, 0))
1801 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1802 if (BE (err
!= REG_NOERROR
, 0))
1806 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1808 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1809 if (BE (err
!= REG_NOERROR
, 0))
1815 static reg_errcode_t
1817 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1818 const re_node_set
*candidates
)
1820 reg_errcode_t err
= REG_NOERROR
;
1823 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1824 if (BE (err
!= REG_NOERROR
, 0))
1827 if (!state
->inveclosure
.alloc
)
1829 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1830 if (BE (err
!= REG_NOERROR
, 0))
1832 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1834 err
= re_node_set_merge (&state
->inveclosure
,
1835 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1836 if (BE (err
!= REG_NOERROR
, 0))
1840 return re_node_set_add_intersect (dest_nodes
, candidates
,
1841 &state
->inveclosure
);
1844 static reg_errcode_t
1846 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1847 const re_node_set
*candidates
)
1851 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1852 re_node_set except_nodes
;
1853 re_node_set_init_empty (&except_nodes
);
1854 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1856 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1857 if (cur_node
== node
)
1859 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1861 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1862 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1863 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1864 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1865 && re_node_set_contains (dest_nodes
, edst1
))
1867 && !re_node_set_contains (inv_eclosure
, edst2
)
1868 && re_node_set_contains (dest_nodes
, edst2
)))
1870 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1871 dfa
->inveclosures
+ cur_node
);
1872 if (BE (err
!= REG_NOERROR
, 0))
1874 re_node_set_free (&except_nodes
);
1880 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1882 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1883 if (!re_node_set_contains (&except_nodes
, cur_node
))
1885 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1886 re_node_set_remove_at (dest_nodes
, idx
);
1889 re_node_set_free (&except_nodes
);
1895 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1896 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1898 const re_dfa_t
*const dfa
= mctx
->dfa
;
1899 int lim_idx
, src_pos
, dst_pos
;
1901 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1902 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1903 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1906 struct re_backref_cache_entry
*ent
;
1907 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1908 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1910 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1911 subexp_idx
, dst_node
, dst_idx
,
1913 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1914 subexp_idx
, src_node
, src_idx
,
1918 <src> <dst> ( <subexp> )
1919 ( <subexp> ) <src> <dst>
1920 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1921 if (src_pos
== dst_pos
)
1922 continue; /* This is unrelated limitation. */
1931 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1932 int subexp_idx
, int from_node
, int bkref_idx
)
1934 const re_dfa_t
*const dfa
= mctx
->dfa
;
1935 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1938 /* Else, we are on the boundary: examine the nodes on the epsilon
1940 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1942 int node
= eclosures
->elems
[node_idx
];
1943 switch (dfa
->nodes
[node
].type
)
1946 if (bkref_idx
!= -1)
1948 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1953 if (ent
->node
!= node
)
1956 if (subexp_idx
< BITSET_WORD_BITS
1957 && !(ent
->eps_reachable_subexps_map
1958 & ((bitset_word_t
) 1 << subexp_idx
)))
1961 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1962 OP_CLOSE_SUBEXP cases below. But, if the
1963 destination node is the same node as the source
1964 node, don't recurse because it would cause an
1965 infinite loop: a regex that exhibits this behavior
1967 dst
= dfa
->edests
[node
].elems
[0];
1968 if (dst
== from_node
)
1972 else /* if (boundaries & 2) */
1977 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1979 if (cpos
== -1 /* && (boundaries & 1) */)
1981 if (cpos
== 0 && (boundaries
& 2))
1984 if (subexp_idx
< BITSET_WORD_BITS
)
1985 ent
->eps_reachable_subexps_map
1986 &= ~((bitset_word_t
) 1 << subexp_idx
);
1988 while (ent
++->more
);
1992 case OP_OPEN_SUBEXP
:
1993 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1997 case OP_CLOSE_SUBEXP
:
1998 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2007 return (boundaries
& 2) ? 1 : 0;
2012 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2013 int subexp_idx
, int from_node
, int str_idx
,
2016 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2019 /* If we are outside the range of the subexpression, return -1 or 1. */
2020 if (str_idx
< lim
->subexp_from
)
2023 if (lim
->subexp_to
< str_idx
)
2026 /* If we are within the subexpression, return 0. */
2027 boundaries
= (str_idx
== lim
->subexp_from
);
2028 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2029 if (boundaries
== 0)
2032 /* Else, examine epsilon closure. */
2033 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2034 from_node
, bkref_idx
);
2037 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2038 which are against limitations from DEST_NODES. */
2040 static reg_errcode_t
2042 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2043 const re_node_set
*candidates
, re_node_set
*limits
,
2044 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2047 int node_idx
, lim_idx
;
2049 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2052 struct re_backref_cache_entry
*ent
;
2053 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2055 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2056 continue; /* This is unrelated limitation. */
2058 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2059 if (ent
->subexp_to
== str_idx
)
2063 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2065 int node
= dest_nodes
->elems
[node_idx
];
2066 re_token_type_t type
= dfa
->nodes
[node
].type
;
2067 if (type
== OP_OPEN_SUBEXP
2068 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2070 else if (type
== OP_CLOSE_SUBEXP
2071 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2075 /* Check the limitation of the open subexpression. */
2076 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2079 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2081 if (BE (err
!= REG_NOERROR
, 0))
2085 /* Check the limitation of the close subexpression. */
2087 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2089 int node
= dest_nodes
->elems
[node_idx
];
2090 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2092 && !re_node_set_contains (dfa
->eclosures
+ node
,
2095 /* It is against this limitation.
2096 Remove it form the current sifted state. */
2097 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2099 if (BE (err
!= REG_NOERROR
, 0))
2105 else /* (ent->subexp_to != str_idx) */
2107 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2109 int node
= dest_nodes
->elems
[node_idx
];
2110 re_token_type_t type
= dfa
->nodes
[node
].type
;
2111 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2113 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2115 /* It is against this limitation.
2116 Remove it form the current sifted state. */
2117 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2119 if (BE (err
!= REG_NOERROR
, 0))
2128 static reg_errcode_t
2130 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2131 int str_idx
, const re_node_set
*candidates
)
2133 const re_dfa_t
*const dfa
= mctx
->dfa
;
2136 re_sift_context_t local_sctx
;
2137 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2139 if (first_idx
== -1)
2142 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2144 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2147 re_token_type_t type
;
2148 struct re_backref_cache_entry
*entry
;
2149 node
= candidates
->elems
[node_idx
];
2150 type
= dfa
->nodes
[node
].type
;
2151 /* Avoid infinite loop for the REs like "()\1+". */
2152 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2154 if (type
!= OP_BACK_REF
)
2157 entry
= mctx
->bkref_ents
+ first_idx
;
2158 enabled_idx
= first_idx
;
2165 re_dfastate_t
*cur_state
;
2167 if (entry
->node
!= node
)
2169 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2170 to_idx
= str_idx
+ subexp_len
;
2171 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2172 : dfa
->edests
[node
].elems
[0]);
2174 if (to_idx
> sctx
->last_str_idx
2175 || sctx
->sifted_states
[to_idx
] == NULL
2176 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2177 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2178 str_idx
, dst_node
, to_idx
))
2181 if (local_sctx
.sifted_states
== NULL
)
2184 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2185 if (BE (err
!= REG_NOERROR
, 0))
2188 local_sctx
.last_node
= node
;
2189 local_sctx
.last_str_idx
= str_idx
;
2190 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2191 if (BE (ret
< 0, 0))
2196 cur_state
= local_sctx
.sifted_states
[str_idx
];
2197 err
= sift_states_backward (mctx
, &local_sctx
);
2198 if (BE (err
!= REG_NOERROR
, 0))
2200 if (sctx
->limited_states
!= NULL
)
2202 err
= merge_state_array (dfa
, sctx
->limited_states
,
2203 local_sctx
.sifted_states
,
2205 if (BE (err
!= REG_NOERROR
, 0))
2208 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2209 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2211 /* mctx->bkref_ents may have changed, reload the pointer. */
2212 entry
= mctx
->bkref_ents
+ enabled_idx
;
2214 while (enabled_idx
++, entry
++->more
);
2218 if (local_sctx
.sifted_states
!= NULL
)
2220 re_node_set_free (&local_sctx
.limits
);
2227 #ifdef RE_ENABLE_I18N
2230 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2231 int node_idx
, int str_idx
, int max_str_idx
)
2233 const re_dfa_t
*const dfa
= mctx
->dfa
;
2235 /* Check the node can accept `multi byte'. */
2236 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2237 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2238 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2239 dfa
->nexts
[node_idx
]))
2240 /* The node can't accept the `multi byte', or the
2241 destination was already thrown away, then the node
2242 couldn't accept the current input `multi byte'. */
2244 /* Otherwise, it is sure that the node could accept
2245 `naccepted' bytes input. */
2248 #endif /* RE_ENABLE_I18N */
2251 /* Functions for state transition. */
2253 /* Return the next state to which the current state STATE will transit by
2254 accepting the current input byte, and update STATE_LOG if necessary.
2255 If STATE can accept a multibyte char/collating element/back reference
2256 update the destination of STATE_LOG. */
2258 static re_dfastate_t
*
2260 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2261 re_dfastate_t
*state
)
2263 re_dfastate_t
**trtable
;
2266 #ifdef RE_ENABLE_I18N
2267 /* If the current state can accept multibyte. */
2268 if (BE (state
->accept_mb
, 0))
2270 *err
= transit_state_mb (mctx
, state
);
2271 if (BE (*err
!= REG_NOERROR
, 0))
2274 #endif /* RE_ENABLE_I18N */
2276 /* Then decide the next state with the single byte. */
2279 /* don't use transition table */
2280 return transit_state_sb (err
, mctx
, state
);
2283 /* Use transition table */
2284 ch
= re_string_fetch_byte (&mctx
->input
);
2287 trtable
= state
->trtable
;
2288 if (BE (trtable
!= NULL
, 1))
2291 trtable
= state
->word_trtable
;
2292 if (BE (trtable
!= NULL
, 1))
2294 unsigned int context
;
2296 = re_string_context_at (&mctx
->input
,
2297 re_string_cur_idx (&mctx
->input
) - 1,
2299 if (IS_WORD_CONTEXT (context
))
2300 return trtable
[ch
+ SBC_MAX
];
2305 if (!build_trtable (mctx
->dfa
, state
))
2311 /* Retry, we now have a transition table. */
2315 /* Update the state_log if we need */
2316 static re_dfastate_t
*
2318 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2319 re_dfastate_t
*next_state
)
2321 const re_dfa_t
*const dfa
= mctx
->dfa
;
2322 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2324 if (cur_idx
> mctx
->state_log_top
)
2326 mctx
->state_log
[cur_idx
] = next_state
;
2327 mctx
->state_log_top
= cur_idx
;
2329 else if (mctx
->state_log
[cur_idx
] == NULL
)
2331 mctx
->state_log
[cur_idx
] = next_state
;
2335 re_dfastate_t
*pstate
;
2336 unsigned int context
;
2337 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2338 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2339 the destination of a multibyte char/collating element/
2340 back reference. Then the next state is the union set of
2341 these destinations and the results of the transition table. */
2342 pstate
= mctx
->state_log
[cur_idx
];
2343 log_nodes
= pstate
->entrance_nodes
;
2344 if (next_state
!= NULL
)
2346 table_nodes
= next_state
->entrance_nodes
;
2347 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2349 if (BE (*err
!= REG_NOERROR
, 0))
2353 next_nodes
= *log_nodes
;
2354 /* Note: We already add the nodes of the initial state,
2355 then we don't need to add them here. */
2357 context
= re_string_context_at (&mctx
->input
,
2358 re_string_cur_idx (&mctx
->input
) - 1,
2360 next_state
= mctx
->state_log
[cur_idx
]
2361 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2362 /* We don't need to check errors here, since the return value of
2363 this function is next_state and ERR is already set. */
2365 if (table_nodes
!= NULL
)
2366 re_node_set_free (&next_nodes
);
2369 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2371 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2372 later. We must check them here, since the back references in the
2373 next state might use them. */
2374 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2376 if (BE (*err
!= REG_NOERROR
, 0))
2379 /* If the next state has back references. */
2380 if (next_state
->has_backref
)
2382 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2383 if (BE (*err
!= REG_NOERROR
, 0))
2385 next_state
= mctx
->state_log
[cur_idx
];
2392 /* Skip bytes in the input that correspond to part of a
2393 multi-byte match, then look in the log for a state
2394 from which to restart matching. */
2395 static re_dfastate_t
*
2397 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2399 re_dfastate_t
*cur_state
;
2402 int max
= mctx
->state_log_top
;
2403 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2407 if (++cur_str_idx
> max
)
2409 re_string_skip_bytes (&mctx
->input
, 1);
2411 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2413 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2415 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2419 /* Helper functions for transit_state. */
2421 /* From the node set CUR_NODES, pick up the nodes whose types are
2422 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2423 expression. And register them to use them later for evaluating the
2424 correspoding back references. */
2426 static reg_errcode_t
2428 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2431 const re_dfa_t
*const dfa
= mctx
->dfa
;
2435 /* TODO: This isn't efficient.
2436 Because there might be more than one nodes whose types are
2437 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2440 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2442 int node
= cur_nodes
->elems
[node_idx
];
2443 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2444 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2445 && (dfa
->used_bkref_map
2446 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2448 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2449 if (BE (err
!= REG_NOERROR
, 0))
2457 /* Return the next state to which the current state STATE will transit by
2458 accepting the current input byte. */
2460 static re_dfastate_t
*
2461 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2462 re_dfastate_t
*state
)
2464 const re_dfa_t
*const dfa
= mctx
->dfa
;
2465 re_node_set next_nodes
;
2466 re_dfastate_t
*next_state
;
2467 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2468 unsigned int context
;
2470 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2471 if (BE (*err
!= REG_NOERROR
, 0))
2473 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2475 int cur_node
= state
->nodes
.elems
[node_cnt
];
2476 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2478 *err
= re_node_set_merge (&next_nodes
,
2479 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2480 if (BE (*err
!= REG_NOERROR
, 0))
2482 re_node_set_free (&next_nodes
);
2487 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2488 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2489 /* We don't need to check errors here, since the return value of
2490 this function is next_state and ERR is already set. */
2492 re_node_set_free (&next_nodes
);
2493 re_string_skip_bytes (&mctx
->input
, 1);
2498 #ifdef RE_ENABLE_I18N
2499 static reg_errcode_t
2501 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2503 const re_dfa_t
*const dfa
= mctx
->dfa
;
2507 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2509 re_node_set dest_nodes
, *new_nodes
;
2510 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2511 int naccepted
, dest_idx
;
2512 unsigned int context
;
2513 re_dfastate_t
*dest_state
;
2515 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2518 if (dfa
->nodes
[cur_node_idx
].constraint
)
2520 context
= re_string_context_at (&mctx
->input
,
2521 re_string_cur_idx (&mctx
->input
),
2523 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2528 /* How many bytes the node can accept? */
2529 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2530 re_string_cur_idx (&mctx
->input
));
2534 /* The node can accepts `naccepted' bytes. */
2535 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2536 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2537 : mctx
->max_mb_elem_len
);
2538 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2539 if (BE (err
!= REG_NOERROR
, 0))
2542 assert (dfa
->nexts
[cur_node_idx
] != -1);
2544 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2546 dest_state
= mctx
->state_log
[dest_idx
];
2547 if (dest_state
== NULL
)
2548 dest_nodes
= *new_nodes
;
2551 err
= re_node_set_init_union (&dest_nodes
,
2552 dest_state
->entrance_nodes
, new_nodes
);
2553 if (BE (err
!= REG_NOERROR
, 0))
2556 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2558 mctx
->state_log
[dest_idx
]
2559 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2560 if (dest_state
!= NULL
)
2561 re_node_set_free (&dest_nodes
);
2562 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2567 #endif /* RE_ENABLE_I18N */
2569 static reg_errcode_t
2571 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2573 const re_dfa_t
*const dfa
= mctx
->dfa
;
2576 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2578 for (i
= 0; i
< nodes
->nelem
; ++i
)
2580 int dest_str_idx
, prev_nelem
, bkc_idx
;
2581 int node_idx
= nodes
->elems
[i
];
2582 unsigned int context
;
2583 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2584 re_node_set
*new_dest_nodes
;
2586 /* Check whether `node' is a backreference or not. */
2587 if (node
->type
!= OP_BACK_REF
)
2590 if (node
->constraint
)
2592 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2594 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2598 /* `node' is a backreference.
2599 Check the substring which the substring matched. */
2600 bkc_idx
= mctx
->nbkref_ents
;
2601 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2602 if (BE (err
!= REG_NOERROR
, 0))
2605 /* And add the epsilon closures (which is `new_dest_nodes') of
2606 the backreference to appropriate state_log. */
2608 assert (dfa
->nexts
[node_idx
] != -1);
2610 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2613 re_dfastate_t
*dest_state
;
2614 struct re_backref_cache_entry
*bkref_ent
;
2615 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2616 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2618 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2619 new_dest_nodes
= (subexp_len
== 0
2620 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2621 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2622 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2623 - bkref_ent
->subexp_from
);
2624 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2626 dest_state
= mctx
->state_log
[dest_str_idx
];
2627 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2628 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2629 /* Add `new_dest_node' to state_log. */
2630 if (dest_state
== NULL
)
2632 mctx
->state_log
[dest_str_idx
]
2633 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2635 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2636 && err
!= REG_NOERROR
, 0))
2641 re_node_set dest_nodes
;
2642 err
= re_node_set_init_union (&dest_nodes
,
2643 dest_state
->entrance_nodes
,
2645 if (BE (err
!= REG_NOERROR
, 0))
2647 re_node_set_free (&dest_nodes
);
2650 mctx
->state_log
[dest_str_idx
]
2651 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2652 re_node_set_free (&dest_nodes
);
2653 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2654 && err
!= REG_NOERROR
, 0))
2657 /* We need to check recursively if the backreference can epsilon
2660 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2662 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2664 if (BE (err
!= REG_NOERROR
, 0))
2666 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2667 if (BE (err
!= REG_NOERROR
, 0))
2677 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2678 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2679 Note that we might collect inappropriate candidates here.
2680 However, the cost of checking them strictly here is too high, then we
2681 delay these checking for prune_impossible_nodes(). */
2683 static reg_errcode_t
2685 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2687 const re_dfa_t
*const dfa
= mctx
->dfa
;
2688 int subexp_num
, sub_top_idx
;
2689 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2690 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2691 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2692 if (cache_idx
!= -1)
2694 const struct re_backref_cache_entry
*entry
2695 = mctx
->bkref_ents
+ cache_idx
;
2697 if (entry
->node
== bkref_node
)
2698 return REG_NOERROR
; /* We already checked it. */
2699 while (entry
++->more
);
2702 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2704 /* For each sub expression */
2705 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2708 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2709 re_sub_match_last_t
*sub_last
;
2710 int sub_last_idx
, sl_str
, bkref_str_off
;
2712 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2713 continue; /* It isn't related. */
2715 sl_str
= sub_top
->str_idx
;
2716 bkref_str_off
= bkref_str_idx
;
2717 /* At first, check the last node of sub expressions we already
2719 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2722 sub_last
= sub_top
->lasts
[sub_last_idx
];
2723 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2724 /* The matched string by the sub expression match with the substring
2725 at the back reference? */
2726 if (sl_str_diff
> 0)
2728 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2730 /* Not enough chars for a successful match. */
2731 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2734 err
= clean_state_log_if_needed (mctx
,
2737 if (BE (err
!= REG_NOERROR
, 0))
2739 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2741 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2742 /* We don't need to search this sub expression any more. */
2745 bkref_str_off
+= sl_str_diff
;
2746 sl_str
+= sl_str_diff
;
2747 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2750 /* Reload buf, since the preceding call might have reallocated
2752 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2754 if (err
== REG_NOMATCH
)
2756 if (BE (err
!= REG_NOERROR
, 0))
2760 if (sub_last_idx
< sub_top
->nlasts
)
2762 if (sub_last_idx
> 0)
2764 /* Then, search for the other last nodes of the sub expression. */
2765 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2767 int cls_node
, sl_str_off
;
2768 const re_node_set
*nodes
;
2769 sl_str_off
= sl_str
- sub_top
->str_idx
;
2770 /* The matched string by the sub expression match with the substring
2771 at the back reference? */
2774 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2776 /* If we are at the end of the input, we cannot match. */
2777 if (bkref_str_off
>= mctx
->input
.len
)
2780 err
= extend_buffers (mctx
);
2781 if (BE (err
!= REG_NOERROR
, 0))
2784 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2786 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2787 break; /* We don't need to search this sub expression
2790 if (mctx
->state_log
[sl_str
] == NULL
)
2792 /* Does this state have a ')' of the sub expression? */
2793 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2794 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2798 if (sub_top
->path
== NULL
)
2800 sub_top
->path
= calloc (sizeof (state_array_t
),
2801 sl_str
- sub_top
->str_idx
+ 1);
2802 if (sub_top
->path
== NULL
)
2805 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2806 in the current context? */
2807 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2808 sub_top
->str_idx
, cls_node
, sl_str
,
2810 if (err
== REG_NOMATCH
)
2812 if (BE (err
!= REG_NOERROR
, 0))
2814 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2815 if (BE (sub_last
== NULL
, 0))
2817 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2819 if (err
== REG_NOMATCH
)
2826 /* Helper functions for get_subexp(). */
2828 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2829 If it can arrive, register the sub expression expressed with SUB_TOP
2832 static reg_errcode_t
2834 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2835 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2839 /* Can the subexpression arrive the back reference? */
2840 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2841 sub_last
->str_idx
, bkref_node
, bkref_str
,
2843 if (err
!= REG_NOERROR
)
2845 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2847 if (BE (err
!= REG_NOERROR
, 0))
2849 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2850 return clean_state_log_if_needed (mctx
, to_idx
);
2853 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2854 Search '(' if FL_OPEN, or search ')' otherwise.
2855 TODO: This function isn't efficient...
2856 Because there might be more than one nodes whose types are
2857 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2863 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2864 int subexp_idx
, int type
)
2867 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2869 int cls_node
= nodes
->elems
[cls_idx
];
2870 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2871 if (node
->type
== type
2872 && node
->opr
.idx
== subexp_idx
)
2878 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2879 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2881 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2883 static reg_errcode_t
2885 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2886 int top_str
, int last_node
, int last_str
, int type
)
2888 const re_dfa_t
*const dfa
= mctx
->dfa
;
2889 reg_errcode_t err
= REG_NOERROR
;
2890 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2891 re_dfastate_t
*cur_state
= NULL
;
2892 re_node_set
*cur_nodes
, next_nodes
;
2893 re_dfastate_t
**backup_state_log
;
2894 unsigned int context
;
2896 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2897 /* Extend the buffer if we need. */
2898 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2900 re_dfastate_t
**new_array
;
2901 int old_alloc
= path
->alloc
;
2902 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2903 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2904 if (BE (new_array
== NULL
, 0))
2906 path
->alloc
= old_alloc
;
2909 path
->array
= new_array
;
2910 memset (new_array
+ old_alloc
, '\0',
2911 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2914 str_idx
= path
->next_idx
? path
->next_idx
: top_str
;
2916 /* Temporary modify MCTX. */
2917 backup_state_log
= mctx
->state_log
;
2918 backup_cur_idx
= mctx
->input
.cur_idx
;
2919 mctx
->state_log
= path
->array
;
2920 mctx
->input
.cur_idx
= str_idx
;
2922 /* Setup initial node set. */
2923 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2924 if (str_idx
== top_str
)
2926 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2927 if (BE (err
!= REG_NOERROR
, 0))
2929 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2930 if (BE (err
!= REG_NOERROR
, 0))
2932 re_node_set_free (&next_nodes
);
2938 cur_state
= mctx
->state_log
[str_idx
];
2939 if (cur_state
&& cur_state
->has_backref
)
2941 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2942 if (BE (err
!= REG_NOERROR
, 0))
2946 re_node_set_init_empty (&next_nodes
);
2948 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2950 if (next_nodes
.nelem
)
2952 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2954 if (BE (err
!= REG_NOERROR
, 0))
2956 re_node_set_free (&next_nodes
);
2960 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2961 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2963 re_node_set_free (&next_nodes
);
2966 mctx
->state_log
[str_idx
] = cur_state
;
2969 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2971 re_node_set_empty (&next_nodes
);
2972 if (mctx
->state_log
[str_idx
+ 1])
2974 err
= re_node_set_merge (&next_nodes
,
2975 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2976 if (BE (err
!= REG_NOERROR
, 0))
2978 re_node_set_free (&next_nodes
);
2984 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2985 &cur_state
->non_eps_nodes
,
2987 if (BE (err
!= REG_NOERROR
, 0))
2989 re_node_set_free (&next_nodes
);
2994 if (next_nodes
.nelem
)
2996 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2997 if (BE (err
!= REG_NOERROR
, 0))
2999 re_node_set_free (&next_nodes
);
3002 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3004 if (BE (err
!= REG_NOERROR
, 0))
3006 re_node_set_free (&next_nodes
);
3010 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3011 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3012 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3014 re_node_set_free (&next_nodes
);
3017 mctx
->state_log
[str_idx
] = cur_state
;
3018 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3020 re_node_set_free (&next_nodes
);
3021 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3022 : &mctx
->state_log
[last_str
]->nodes
);
3023 path
->next_idx
= str_idx
;
3026 mctx
->state_log
= backup_state_log
;
3027 mctx
->input
.cur_idx
= backup_cur_idx
;
3029 /* Then check the current node set has the node LAST_NODE. */
3030 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3036 /* Helper functions for check_arrival. */
3038 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3040 TODO: This function is similar to the functions transit_state*(),
3041 however this function has many additional works.
3042 Can't we unify them? */
3044 static reg_errcode_t
3046 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3047 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3049 const re_dfa_t
*const dfa
= mctx
->dfa
;
3052 #ifdef RE_ENABLE_I18N
3053 reg_errcode_t err
= REG_NOERROR
;
3055 re_node_set union_set
;
3056 re_node_set_init_empty (&union_set
);
3057 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3060 int cur_node
= cur_nodes
->elems
[cur_idx
];
3062 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3063 assert (!IS_EPSILON_NODE (type
));
3065 #ifdef RE_ENABLE_I18N
3066 /* If the node may accept `multi byte'. */
3067 if (dfa
->nodes
[cur_node
].accept_mb
)
3069 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3073 re_dfastate_t
*dest_state
;
3074 int next_node
= dfa
->nexts
[cur_node
];
3075 int next_idx
= str_idx
+ naccepted
;
3076 dest_state
= mctx
->state_log
[next_idx
];
3077 re_node_set_empty (&union_set
);
3080 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3081 if (BE (err
!= REG_NOERROR
, 0))
3083 re_node_set_free (&union_set
);
3087 result
= re_node_set_insert (&union_set
, next_node
);
3088 if (BE (result
< 0, 0))
3090 re_node_set_free (&union_set
);
3093 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3095 if (BE (mctx
->state_log
[next_idx
] == NULL
3096 && err
!= REG_NOERROR
, 0))
3098 re_node_set_free (&union_set
);
3103 #endif /* RE_ENABLE_I18N */
3105 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3107 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3108 if (BE (result
< 0, 0))
3110 re_node_set_free (&union_set
);
3115 re_node_set_free (&union_set
);
3119 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3120 CUR_NODES, however exclude the nodes which are:
3121 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3122 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3125 static reg_errcode_t
3127 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3128 int ex_subexp
, int type
)
3131 int idx
, outside_node
;
3132 re_node_set new_nodes
;
3134 assert (cur_nodes
->nelem
);
3136 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3137 if (BE (err
!= REG_NOERROR
, 0))
3139 /* Create a new node set NEW_NODES with the nodes which are epsilon
3140 closures of the node in CUR_NODES. */
3142 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3144 int cur_node
= cur_nodes
->elems
[idx
];
3145 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3146 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3147 if (outside_node
== -1)
3149 /* There are no problematic nodes, just merge them. */
3150 err
= re_node_set_merge (&new_nodes
, eclosure
);
3151 if (BE (err
!= REG_NOERROR
, 0))
3153 re_node_set_free (&new_nodes
);
3159 /* There are problematic nodes, re-calculate incrementally. */
3160 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3162 if (BE (err
!= REG_NOERROR
, 0))
3164 re_node_set_free (&new_nodes
);
3169 re_node_set_free (cur_nodes
);
3170 *cur_nodes
= new_nodes
;
3174 /* Helper function for check_arrival_expand_ecl.
3175 Check incrementally the epsilon closure of TARGET, and if it isn't
3176 problematic append it to DST_NODES. */
3178 static reg_errcode_t
3180 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3181 int target
, int ex_subexp
, int type
)
3184 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3188 if (dfa
->nodes
[cur_node
].type
== type
3189 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3191 if (type
== OP_CLOSE_SUBEXP
)
3193 err
= re_node_set_insert (dst_nodes
, cur_node
);
3194 if (BE (err
== -1, 0))
3199 err
= re_node_set_insert (dst_nodes
, cur_node
);
3200 if (BE (err
== -1, 0))
3202 if (dfa
->edests
[cur_node
].nelem
== 0)
3204 if (dfa
->edests
[cur_node
].nelem
== 2)
3206 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3207 dfa
->edests
[cur_node
].elems
[1],
3209 if (BE (err
!= REG_NOERROR
, 0))
3212 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3218 /* For all the back references in the current state, calculate the
3219 destination of the back references by the appropriate entry
3220 in MCTX->BKREF_ENTS. */
3222 static reg_errcode_t
3224 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3225 int cur_str
, int subexp_num
, int type
)
3227 const re_dfa_t
*const dfa
= mctx
->dfa
;
3229 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3230 struct re_backref_cache_entry
*ent
;
3232 if (cache_idx_start
== -1)
3236 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3239 int to_idx
, next_node
;
3241 /* Is this entry ENT is appropriate? */
3242 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3245 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3246 /* Calculate the destination of the back reference, and append it
3247 to MCTX->STATE_LOG. */
3248 if (to_idx
== cur_str
)
3250 /* The backreference did epsilon transit, we must re-check all the
3251 node in the current state. */
3252 re_node_set new_dests
;
3253 reg_errcode_t err2
, err3
;
3254 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3255 if (re_node_set_contains (cur_nodes
, next_node
))
3257 err
= re_node_set_init_1 (&new_dests
, next_node
);
3258 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3259 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3260 re_node_set_free (&new_dests
);
3261 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3262 || err3
!= REG_NOERROR
, 0))
3264 err
= (err
!= REG_NOERROR
? err
3265 : (err2
!= REG_NOERROR
? err2
: err3
));
3268 /* TODO: It is still inefficient... */
3273 re_node_set union_set
;
3274 next_node
= dfa
->nexts
[ent
->node
];
3275 if (mctx
->state_log
[to_idx
])
3278 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3281 err
= re_node_set_init_copy (&union_set
,
3282 &mctx
->state_log
[to_idx
]->nodes
);
3283 ret
= re_node_set_insert (&union_set
, next_node
);
3284 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3286 re_node_set_free (&union_set
);
3287 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3293 err
= re_node_set_init_1 (&union_set
, next_node
);
3294 if (BE (err
!= REG_NOERROR
, 0))
3297 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3298 re_node_set_free (&union_set
);
3299 if (BE (mctx
->state_log
[to_idx
] == NULL
3300 && err
!= REG_NOERROR
, 0))
3304 while (ent
++->more
);
3308 /* Build transition table for the state.
3309 Return 1 if succeeded, otherwise return NULL. */
3313 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3316 int i
, j
, ch
, need_word_trtable
= 0;
3317 bitset_word_t elem
, mask
;
3318 bool dests_node_malloced
= false;
3319 bool dest_states_malloced
= false;
3320 int ndests
; /* Number of the destination states from `state'. */
3321 re_dfastate_t
**trtable
;
3322 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3323 re_node_set follows
, *dests_node
;
3325 bitset_t acceptable
;
3329 re_node_set dests_node
[SBC_MAX
];
3330 bitset_t dests_ch
[SBC_MAX
];
3333 /* We build DFA states which corresponds to the destination nodes
3334 from `state'. `dests_node[i]' represents the nodes which i-th
3335 destination state contains, and `dests_ch[i]' represents the
3336 characters which i-th destination state accepts. */
3338 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3339 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3343 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3344 if (BE (dests_alloc
== NULL
, 0))
3346 dests_node_malloced
= true;
3348 dests_node
= dests_alloc
->dests_node
;
3349 dests_ch
= dests_alloc
->dests_ch
;
3351 /* Initialize transiton table. */
3352 state
->word_trtable
= state
->trtable
= NULL
;
3354 /* At first, group all nodes belonging to `state' into several
3356 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3357 if (BE (ndests
<= 0, 0))
3359 if (dests_node_malloced
)
3361 /* Return 0 in case of an error, 1 otherwise. */
3364 state
->trtable
= (re_dfastate_t
**)
3365 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3371 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3372 if (BE (err
!= REG_NOERROR
, 0))
3375 /* Avoid arithmetic overflow in size calculation. */
3376 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3377 / (3 * sizeof (re_dfastate_t
*)))
3383 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3384 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3385 dest_states
= (re_dfastate_t
**)
3386 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3390 dest_states
= (re_dfastate_t
**)
3391 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3392 if (BE (dest_states
== NULL
, 0))
3395 if (dest_states_malloced
)
3397 re_node_set_free (&follows
);
3398 for (i
= 0; i
< ndests
; ++i
)
3399 re_node_set_free (dests_node
+ i
);
3400 if (dests_node_malloced
)
3404 dest_states_malloced
= true;
3406 dest_states_word
= dest_states
+ ndests
;
3407 dest_states_nl
= dest_states_word
+ ndests
;
3408 bitset_empty (acceptable
);
3410 /* Then build the states for all destinations. */
3411 for (i
= 0; i
< ndests
; ++i
)
3414 re_node_set_empty (&follows
);
3415 /* Merge the follows of this destination states. */
3416 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3418 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3419 if (next_node
!= -1)
3421 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3422 if (BE (err
!= REG_NOERROR
, 0))
3426 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3427 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3429 /* If the new state has context constraint,
3430 build appropriate states for these contexts. */
3431 if (dest_states
[i
]->has_constraint
)
3433 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3435 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3438 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3439 need_word_trtable
= 1;
3441 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3443 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3448 dest_states_word
[i
] = dest_states
[i
];
3449 dest_states_nl
[i
] = dest_states
[i
];
3451 bitset_merge (acceptable
, dests_ch
[i
]);
3454 if (!BE (need_word_trtable
, 0))
3456 /* We don't care about whether the following character is a word
3457 character, or we are in a single-byte character set so we can
3458 discern by looking at the character code: allocate a
3459 256-entry transition table. */
3460 trtable
= state
->trtable
=
3461 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3462 if (BE (trtable
== NULL
, 0))
3465 /* For all characters ch...: */
3466 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3467 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3469 mask
<<= 1, elem
>>= 1, ++ch
)
3470 if (BE (elem
& 1, 0))
3472 /* There must be exactly one destination which accepts
3473 character ch. See group_nodes_into_DFAstates. */
3474 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3477 /* j-th destination accepts the word character ch. */
3478 if (dfa
->word_char
[i
] & mask
)
3479 trtable
[ch
] = dest_states_word
[j
];
3481 trtable
[ch
] = dest_states
[j
];
3486 /* We care about whether the following character is a word
3487 character, and we are in a multi-byte character set: discern
3488 by looking at the character code: build two 256-entry
3489 transition tables, one starting at trtable[0] and one
3490 starting at trtable[SBC_MAX]. */
3491 trtable
= state
->word_trtable
=
3492 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3493 if (BE (trtable
== NULL
, 0))
3496 /* For all characters ch...: */
3497 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3498 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3500 mask
<<= 1, elem
>>= 1, ++ch
)
3501 if (BE (elem
& 1, 0))
3503 /* There must be exactly one destination which accepts
3504 character ch. See group_nodes_into_DFAstates. */
3505 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3508 /* j-th destination accepts the word character ch. */
3509 trtable
[ch
] = dest_states
[j
];
3510 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3515 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3517 /* The current state accepts newline character. */
3518 for (j
= 0; j
< ndests
; ++j
)
3519 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3521 /* k-th destination accepts newline character. */
3522 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3523 if (need_word_trtable
)
3524 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3525 /* There must be only one destination which accepts
3526 newline. See group_nodes_into_DFAstates. */
3531 if (dest_states_malloced
)
3534 re_node_set_free (&follows
);
3535 for (i
= 0; i
< ndests
; ++i
)
3536 re_node_set_free (dests_node
+ i
);
3538 if (dests_node_malloced
)
3544 /* Group all nodes belonging to STATE into several destinations.
3545 Then for all destinations, set the nodes belonging to the destination
3546 to DESTS_NODE[i] and set the characters accepted by the destination
3547 to DEST_CH[i]. This function return the number of destinations. */
3551 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3552 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3557 int ndests
; /* Number of the destinations from `state'. */
3558 bitset_t accepts
; /* Characters a node can accept. */
3559 const re_node_set
*cur_nodes
= &state
->nodes
;
3560 bitset_empty (accepts
);
3563 /* For all the nodes belonging to `state', */
3564 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3566 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3567 re_token_type_t type
= node
->type
;
3568 unsigned int constraint
= node
->constraint
;
3570 /* Enumerate all single byte character this node can accept. */
3571 if (type
== CHARACTER
)
3572 bitset_set (accepts
, node
->opr
.c
);
3573 else if (type
== SIMPLE_BRACKET
)
3575 bitset_merge (accepts
, node
->opr
.sbcset
);
3577 else if (type
== OP_PERIOD
)
3579 #ifdef RE_ENABLE_I18N
3580 if (dfa
->mb_cur_max
> 1)
3581 bitset_merge (accepts
, dfa
->sb_char
);
3584 bitset_set_all (accepts
);
3585 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3586 bitset_clear (accepts
, '\n');
3587 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3588 bitset_clear (accepts
, '\0');
3590 #ifdef RE_ENABLE_I18N
3591 else if (type
== OP_UTF8_PERIOD
)
3593 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3594 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3595 bitset_clear (accepts
, '\n');
3596 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3597 bitset_clear (accepts
, '\0');
3603 /* Check the `accepts' and sift the characters which are not
3604 match it the context. */
3607 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3609 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3610 bitset_empty (accepts
);
3611 if (accepts_newline
)
3612 bitset_set (accepts
, NEWLINE_CHAR
);
3616 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3618 bitset_empty (accepts
);
3622 if (constraint
& NEXT_WORD_CONSTRAINT
)
3624 bitset_word_t any_set
= 0;
3625 if (type
== CHARACTER
&& !node
->word_char
)
3627 bitset_empty (accepts
);
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa
->mb_cur_max
> 1)
3632 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3633 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3636 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3637 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3641 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3643 bitset_word_t any_set
= 0;
3644 if (type
== CHARACTER
&& node
->word_char
)
3646 bitset_empty (accepts
);
3649 #ifdef RE_ENABLE_I18N
3650 if (dfa
->mb_cur_max
> 1)
3651 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3652 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3655 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3656 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3662 /* Then divide `accepts' into DFA states, or create a new
3663 state. Above, we make sure that accepts is not empty. */
3664 for (j
= 0; j
< ndests
; ++j
)
3666 bitset_t intersec
; /* Intersection sets, see below. */
3668 /* Flags, see below. */
3669 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3671 /* Optimization, skip if this state doesn't accept the character. */
3672 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3675 /* Enumerate the intersection set of this state and `accepts'. */
3677 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3678 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3679 /* And skip if the intersection set is empty. */
3683 /* Then check if this state is a subset of `accepts'. */
3684 not_subset
= not_consumed
= 0;
3685 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3687 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3688 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3691 /* If this state isn't a subset of `accepts', create a
3692 new group state, which has the `remains'. */
3695 bitset_copy (dests_ch
[ndests
], remains
);
3696 bitset_copy (dests_ch
[j
], intersec
);
3697 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3698 if (BE (err
!= REG_NOERROR
, 0))
3703 /* Put the position in the current group. */
3704 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3705 if (BE (result
< 0, 0))
3708 /* If all characters are consumed, go to next node. */
3712 /* Some characters remain, create a new group. */
3715 bitset_copy (dests_ch
[ndests
], accepts
);
3716 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3717 if (BE (err
!= REG_NOERROR
, 0))
3720 bitset_empty (accepts
);
3725 for (j
= 0; j
< ndests
; ++j
)
3726 re_node_set_free (dests_node
+ j
);
3730 #ifdef RE_ENABLE_I18N
3731 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3732 Return the number of the bytes the node accepts.
3733 STR_IDX is the current index of the input string.
3735 This function handles the nodes which can accept one character, or
3736 one collating element like '.', '[a-z]', opposite to the other nodes
3737 can only accept one byte. */
3741 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3742 const re_string_t
*input
, int str_idx
)
3744 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3745 int char_len
, elem_len
;
3749 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3751 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3752 if (BE (c
< 0xc2, 1))
3755 if (str_idx
+ 2 > input
->len
)
3758 d
= re_string_byte_at (input
, str_idx
+ 1);
3760 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3764 if (c
== 0xe0 && d
< 0xa0)
3770 if (c
== 0xf0 && d
< 0x90)
3776 if (c
== 0xf8 && d
< 0x88)
3782 if (c
== 0xfc && d
< 0x84)
3788 if (str_idx
+ char_len
> input
->len
)
3791 for (i
= 1; i
< char_len
; ++i
)
3793 d
= re_string_byte_at (input
, str_idx
+ i
);
3794 if (d
< 0x80 || d
> 0xbf)
3800 char_len
= re_string_char_size_at (input
, str_idx
);
3801 if (node
->type
== OP_PERIOD
)
3805 /* FIXME: I don't think this if is needed, as both '\n'
3806 and '\0' are char_len == 1. */
3807 /* '.' accepts any one character except the following two cases. */
3808 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3809 re_string_byte_at (input
, str_idx
) == '\n') ||
3810 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3811 re_string_byte_at (input
, str_idx
) == '\0'))
3816 elem_len
= re_string_elem_size_at (input
, str_idx
);
3817 wc
= __btowc(*(input
->mbs
+str_idx
));
3818 if (((elem_len
<= 1 && char_len
<= 1) || char_len
== 0) && (wc
!= WEOF
&& wc
< SBC_MAX
))
3821 if (node
->type
== COMPLEX_BRACKET
)
3823 const re_charset_t
*cset
= node
->opr
.mbcset
;
3825 const unsigned char *pin
3826 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3831 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3832 ? re_string_wchar_at (input
, str_idx
) : 0);
3834 /* match with multibyte character? */
3835 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3836 if (wc
== cset
->mbchars
[i
])
3838 match_len
= char_len
;
3839 goto check_node_accept_bytes_match
;
3841 /* match with character_class? */
3842 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3844 wctype_t wt
= cset
->char_classes
[i
];
3845 if (__iswctype (wc
, wt
))
3847 match_len
= char_len
;
3848 goto check_node_accept_bytes_match
;
3853 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3856 unsigned int in_collseq
= 0;
3857 const int32_t *table
, *indirect
;
3858 const unsigned char *weights
, *extra
;
3859 const char *collseqwc
;
3860 /* This #include defines a local function! */
3861 # include <locale/weight.h>
3863 /* match with collating_symbol? */
3864 if (cset
->ncoll_syms
)
3865 extra
= (const unsigned char *)
3866 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3867 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3869 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3870 /* Compare the length of input collating element and
3871 the length of current collating element. */
3872 if (*coll_sym
!= elem_len
)
3874 /* Compare each bytes. */
3875 for (j
= 0; j
< *coll_sym
; j
++)
3876 if (pin
[j
] != coll_sym
[1 + j
])
3880 /* Match if every bytes is equal. */
3882 goto check_node_accept_bytes_match
;
3888 if (elem_len
<= char_len
)
3890 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3891 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3894 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3896 /* match with range expression? */
3897 for (i
= 0; i
< cset
->nranges
; ++i
)
3898 if (cset
->range_starts
[i
] <= in_collseq
3899 && in_collseq
<= cset
->range_ends
[i
])
3901 match_len
= elem_len
;
3902 goto check_node_accept_bytes_match
;
3905 /* match with equivalence_class? */
3906 if (cset
->nequiv_classes
)
3908 const unsigned char *cp
= pin
;
3909 table
= (const int32_t *)
3910 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3911 weights
= (const unsigned char *)
3912 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3913 extra
= (const unsigned char *)
3914 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3915 indirect
= (const int32_t *)
3916 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3917 int32_t idx
= findidx (&cp
);
3919 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3921 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3922 size_t weight_len
= weights
[idx
& 0xffffff];
3923 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3924 && (idx
>> 24) == (equiv_class_idx
>> 24))
3929 equiv_class_idx
&= 0xffffff;
3931 while (cnt
<= weight_len
3932 && (weights
[equiv_class_idx
+ 1 + cnt
]
3933 == weights
[idx
+ 1 + cnt
]))
3935 if (cnt
> weight_len
)
3937 match_len
= elem_len
;
3938 goto check_node_accept_bytes_match
;
3947 /* match with range expression? */
3949 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3951 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3954 for (i
= 0; i
< cset
->nranges
; ++i
)
3956 cmp_buf
[0] = cset
->range_starts
[i
];
3957 cmp_buf
[4] = cset
->range_ends
[i
];
3958 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3959 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3961 match_len
= char_len
;
3962 goto check_node_accept_bytes_match
;
3966 check_node_accept_bytes_match
:
3967 if (!cset
->non_match
)
3974 return (elem_len
> char_len
) ? elem_len
: char_len
;
3983 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3985 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3990 /* No valid character. Match it as a single byte character. */
3991 const unsigned char *collseq
= (const unsigned char *)
3992 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3993 return collseq
[mbs
[0]];
4000 const unsigned char *extra
= (const unsigned char *)
4001 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4002 int32_t extrasize
= (const unsigned char *)
4003 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4005 for (idx
= 0; idx
< extrasize
;)
4007 int mbs_cnt
, found
= 0;
4008 int32_t elem_mbs_len
;
4009 /* Skip the name of collating element name. */
4010 idx
= idx
+ extra
[idx
] + 1;
4011 elem_mbs_len
= extra
[idx
++];
4012 if (mbs_len
== elem_mbs_len
)
4014 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4015 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4017 if (mbs_cnt
== elem_mbs_len
)
4018 /* Found the entry. */
4021 /* Skip the byte sequence of the collating element. */
4022 idx
+= elem_mbs_len
;
4023 /* Adjust for the alignment. */
4024 idx
= (idx
+ 3) & ~3;
4025 /* Skip the collation sequence value. */
4026 idx
+= sizeof (uint32_t);
4027 /* Skip the wide char sequence of the collating element. */
4028 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
4029 /* If we found the entry, return the sequence value. */
4031 return *(uint32_t *) (extra
+ idx
);
4032 /* Skip the collation sequence value. */
4033 idx
+= sizeof (uint32_t);
4039 #endif /* RE_ENABLE_I18N */
4041 /* Check whether the node accepts the byte which is IDX-th
4042 byte of the INPUT. */
4046 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4050 ch
= re_string_byte_at (&mctx
->input
, idx
);
4054 if (node
->opr
.c
!= ch
)
4058 case SIMPLE_BRACKET
:
4059 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4063 #ifdef RE_ENABLE_I18N
4064 case OP_UTF8_PERIOD
:
4070 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4071 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4079 if (node
->constraint
)
4081 /* The node has constraints. Check whether the current context
4082 satisfies the constraints. */
4083 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4085 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4092 /* Extend the buffers, if the buffers have run out. */
4094 static reg_errcode_t
4096 extend_buffers (re_match_context_t
*mctx
)
4099 re_string_t
*pstr
= &mctx
->input
;
4101 /* Avoid overflow. */
4102 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4105 /* Double the lengthes of the buffers. */
4106 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4107 if (BE (ret
!= REG_NOERROR
, 0))
4110 if (mctx
->state_log
!= NULL
)
4112 /* And double the length of state_log. */
4113 /* XXX We have no indication of the size of this buffer. If this
4114 allocation fail we have no indication that the state_log array
4115 does not have the right size. */
4116 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4117 pstr
->bufs_len
+ 1);
4118 if (BE (new_array
== NULL
, 0))
4120 mctx
->state_log
= new_array
;
4123 /* Then reconstruct the buffers. */
4126 #ifdef RE_ENABLE_I18N
4127 if (pstr
->mb_cur_max
> 1)
4129 ret
= build_wcs_upper_buffer (pstr
);
4130 if (BE (ret
!= REG_NOERROR
, 0))
4134 #endif /* RE_ENABLE_I18N */
4135 build_upper_buffer (pstr
);
4139 #ifdef RE_ENABLE_I18N
4140 if (pstr
->mb_cur_max
> 1)
4141 build_wcs_buffer (pstr
);
4143 #endif /* RE_ENABLE_I18N */
4145 if (pstr
->trans
!= NULL
)
4146 re_string_translate_buffer (pstr
);
4153 /* Functions for matching context. */
4155 /* Initialize MCTX. */
4157 static reg_errcode_t
4159 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4161 mctx
->eflags
= eflags
;
4162 mctx
->match_last
= -1;
4165 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4166 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4167 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4170 /* Already zero-ed by the caller.
4172 mctx->bkref_ents = NULL;
4173 mctx->nbkref_ents = 0;
4174 mctx->nsub_tops = 0; */
4175 mctx
->abkref_ents
= n
;
4176 mctx
->max_mb_elem_len
= 1;
4177 mctx
->asub_tops
= n
;
4181 /* Clean the entries which depend on the current input in MCTX.
4182 This function must be invoked when the matcher changes the start index
4183 of the input, or changes the input string. */
4187 match_ctx_clean (re_match_context_t
*mctx
)
4190 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4193 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4194 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4196 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4197 re_free (last
->path
.array
);
4200 re_free (top
->lasts
);
4203 re_free (top
->path
->array
);
4204 re_free (top
->path
);
4209 mctx
->nsub_tops
= 0;
4210 mctx
->nbkref_ents
= 0;
4213 /* Free all the memory associated with MCTX. */
4217 match_ctx_free (re_match_context_t
*mctx
)
4219 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4220 match_ctx_clean (mctx
);
4221 re_free (mctx
->sub_tops
);
4222 re_free (mctx
->bkref_ents
);
4225 /* Add a new backreference entry to MCTX.
4226 Note that we assume that caller never call this function with duplicate
4227 entry, and call with STR_IDX which isn't smaller than any existing entry.
4230 static reg_errcode_t
4232 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4235 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4237 struct re_backref_cache_entry
* new_entry
;
4238 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4239 mctx
->abkref_ents
* 2);
4240 if (BE (new_entry
== NULL
, 0))
4242 re_free (mctx
->bkref_ents
);
4245 mctx
->bkref_ents
= new_entry
;
4246 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4247 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4248 mctx
->abkref_ents
*= 2;
4250 if (mctx
->nbkref_ents
> 0
4251 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4252 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4254 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4255 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4256 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4257 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4259 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4260 If bit N is clear, means that this entry won't epsilon-transition to
4261 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4262 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4265 A backreference does not epsilon-transition unless it is empty, so set
4266 to all zeros if FROM != TO. */
4267 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4268 = (from
== to
? ~0 : 0);
4270 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4271 if (mctx
->max_mb_elem_len
< to
- from
)
4272 mctx
->max_mb_elem_len
= to
- from
;
4276 /* Search for the first entry which has the same str_idx, or -1 if none is
4277 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4281 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4283 int left
, right
, mid
, last
;
4284 last
= right
= mctx
->nbkref_ents
;
4285 for (left
= 0; left
< right
;)
4287 mid
= (left
+ right
) / 2;
4288 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4293 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4299 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4302 static reg_errcode_t
4304 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4307 assert (mctx
->sub_tops
!= NULL
);
4308 assert (mctx
->asub_tops
> 0);
4310 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4312 int new_asub_tops
= mctx
->asub_tops
* 2;
4313 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4314 re_sub_match_top_t
*,
4316 if (BE (new_array
== NULL
, 0))
4318 mctx
->sub_tops
= new_array
;
4319 mctx
->asub_tops
= new_asub_tops
;
4321 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4322 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4324 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4325 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4329 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4330 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4332 static re_sub_match_last_t
*
4334 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4336 re_sub_match_last_t
*new_entry
;
4337 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4339 int new_alasts
= 2 * subtop
->alasts
+ 1;
4340 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4341 re_sub_match_last_t
*,
4343 if (BE (new_array
== NULL
, 0))
4345 subtop
->lasts
= new_array
;
4346 subtop
->alasts
= new_alasts
;
4348 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4349 if (BE (new_entry
!= NULL
, 1))
4351 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4352 new_entry
->node
= node
;
4353 new_entry
->str_idx
= str_idx
;
4361 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4362 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4364 sctx
->sifted_states
= sifted_sts
;
4365 sctx
->limited_states
= limited_sts
;
4366 sctx
->last_node
= last_node
;
4367 sctx
->last_str_idx
= last_str_idx
;
4368 re_node_set_init_empty (&sctx
->limits
);