1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 #if defined HAVE_WCHAR_H || defined _LIBC
29 #endif /* HAVE_WCHAR_H || _LIBC */
30 #if defined HAVE_WCTYPE_H || defined _LIBC
32 #endif /* HAVE_WCTYPE_H || _LIBC */
35 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
36 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
37 # include <locale/localeinfo.h>
38 # include <locale/elem-hash.h>
39 # include <locale/coll-lookup.h>
44 #include "regex_internal.h"
46 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
47 re_string_t
*input
, int n
);
48 static void match_ctx_free (re_match_context_t
*cache
);
49 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
50 int str_idx
, int from
, int to
);
51 static void match_ctx_clear_flag (re_match_context_t
*mctx
);
52 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
53 re_dfastate_t
**limited_sts
, int last_node
,
54 int last_str_idx
, int check_subexp
);
55 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
56 const char *string
, int length
,
57 int start
, int range
, int stop
,
58 size_t nmatch
, regmatch_t pmatch
[],
60 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
61 const char *string1
, int length1
,
62 const char *string2
, int length2
,
63 int start
, int range
, struct re_registers
*regs
,
64 int stop
, int ret_len
);
65 static int re_search_stub (struct re_pattern_buffer
*bufp
,
66 const char *string
, int length
, int start
,
67 int range
, int stop
, struct re_registers
*regs
,
69 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
70 int nregs
, int regs_allocated
);
71 static inline re_dfastate_t
*acquire_init_state_context (reg_errcode_t
*err
,
73 const re_match_context_t
*mctx
,
75 static int check_matching (const regex_t
*preg
, re_match_context_t
*mctx
,
76 int fl_search
, int fl_longest_match
);
77 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
78 unsigned int context
);
79 static int check_halt_state_context (const regex_t
*preg
,
80 const re_dfastate_t
*state
,
81 const re_match_context_t
*mctx
, int idx
);
82 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
, int cur_node
,
83 int cur_idx
, int nmatch
);
84 static int proceed_next_node (const regex_t
*preg
, int nregs
, regmatch_t
*regs
,
85 const re_match_context_t
*mctx
,
86 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
87 struct re_fail_stack_t
*fs
);
88 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
89 int str_idx
, int *dests
, int nregs
,
91 re_node_set
*eps_via_nodes
);
92 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
93 regmatch_t
*regs
, re_node_set
*eps_via_nodes
);
94 static reg_errcode_t
set_regs (const regex_t
*preg
,
95 const re_match_context_t
*mctx
,
96 size_t nmatch
, regmatch_t
*pmatch
,
99 static int sift_states_iter_mb (const regex_t
*preg
,
100 const re_match_context_t
*mctx
,
101 re_sift_context_t
*sctx
,
102 int node_idx
, int str_idx
, int max_str_idx
);
103 #endif /* RE_ENABLE_I18N */
104 static reg_errcode_t
sift_states_backward (const regex_t
*preg
,
105 re_match_context_t
*mctx
,
106 re_sift_context_t
*sctx
);
107 static reg_errcode_t
update_cur_sifted_state (const regex_t
*preg
,
108 re_match_context_t
*mctx
,
109 re_sift_context_t
*sctx
,
111 re_node_set
*dest_nodes
);
112 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
113 re_node_set
*dest_nodes
,
114 const re_node_set
*candidates
);
115 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
116 re_node_set
*dest_nodes
,
117 const re_node_set
*and_nodes
);
118 static int check_dst_limits (re_dfa_t
*dfa
, re_node_set
*limits
,
119 re_match_context_t
*mctx
, int dst_node
,
120 int dst_idx
, int src_node
, int src_idx
);
121 static int check_dst_limits_calc_pos (re_dfa_t
*dfa
, re_match_context_t
*mctx
,
122 int limit
, re_node_set
*eclosures
,
123 int subexp_idx
, int node
, int str_idx
);
124 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
125 re_node_set
*dest_nodes
,
126 const re_node_set
*candidates
,
128 struct re_backref_cache_entry
*bkref_ents
,
130 static reg_errcode_t
search_subexp (const regex_t
*preg
,
131 re_match_context_t
*mctx
,
132 re_sift_context_t
*sctx
, int str_idx
,
133 re_node_set
*dest_nodes
);
134 static reg_errcode_t
sift_states_bkref (const regex_t
*preg
,
135 re_match_context_t
*mctx
,
136 re_sift_context_t
*sctx
,
137 int str_idx
, re_node_set
*dest_nodes
);
138 static reg_errcode_t
clean_state_log_if_need (re_match_context_t
*mctx
,
139 int next_state_log_idx
);
140 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
141 re_dfastate_t
**src
, int num
);
142 static re_dfastate_t
*transit_state (reg_errcode_t
*err
, const regex_t
*preg
,
143 re_match_context_t
*mctx
,
144 re_dfastate_t
*state
, int fl_search
);
145 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
, const regex_t
*preg
,
146 re_dfastate_t
*pstate
,
148 re_match_context_t
*mctx
);
149 #ifdef RE_ENABLE_I18N
150 static reg_errcode_t
transit_state_mb (const regex_t
*preg
,
151 re_dfastate_t
*pstate
,
152 re_match_context_t
*mctx
);
153 #endif /* RE_ENABLE_I18N */
154 static reg_errcode_t
transit_state_bkref (const regex_t
*preg
,
155 re_dfastate_t
*pstate
,
156 re_match_context_t
*mctx
);
157 static reg_errcode_t
transit_state_bkref_loop (const regex_t
*preg
,
159 re_dfastate_t
**work_state_log
,
160 re_match_context_t
*mctx
);
161 static re_dfastate_t
**build_trtable (const regex_t
*dfa
,
162 const re_dfastate_t
*state
,
164 #ifdef RE_ENABLE_I18N
165 static int check_node_accept_bytes (const regex_t
*preg
, int node_idx
,
166 const re_string_t
*input
, int idx
);
168 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
171 #endif /* RE_ENABLE_I18N */
172 static int group_nodes_into_DFAstates (const regex_t
*dfa
,
173 const re_dfastate_t
*state
,
174 re_node_set
*states_node
,
176 static int check_node_accept (const regex_t
*preg
, const re_token_t
*node
,
177 const re_match_context_t
*mctx
, int idx
);
178 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
);
180 /* Entry point for POSIX code. */
182 /* regexec searches for a given pattern, specified by PREG, in the
185 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
186 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
187 least NMATCH elements, and we set them to the offsets of the
188 corresponding matched substrings.
190 EFLAGS specifies `execution flags' which affect matching: if
191 REG_NOTBOL is set, then ^ does not match at the beginning of the
192 string; if REG_NOTEOL is set, then $ does not match at the end.
194 We return 0 if we find a match and REG_NOMATCH if not. */
197 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
198 const regex_t
*__restrict preg
;
199 const char *__restrict string
;
205 int length
= strlen (string
);
207 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
210 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
212 return err
!= REG_NOERROR
;
215 weak_alias (__regexec
, regexec
)
218 /* Entry points for GNU code. */
220 /* re_match, re_search, re_match_2, re_search_2
222 The former two functions operate on STRING with length LENGTH,
223 while the later two operate on concatenation of STRING1 and STRING2
224 with lengths LENGTH1 and LENGTH2, respectively.
226 re_match() matches the compiled pattern in BUFP against the string,
227 starting at index START.
229 re_search() first tries matching at index START, then it tries to match
230 starting from index START + 1, and so on. The last start position tried
231 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
234 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
235 the first STOP characters of the concatenation of the strings should be
238 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
239 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
240 computed relative to the concatenation, not relative to the individual
243 On success, re_match* functions return the length of the match, re_search*
244 return the position of the start of the match. Return value -1 means no
245 match was found and -2 indicates an internal error. */
248 re_match (bufp
, string
, length
, start
, regs
)
249 struct re_pattern_buffer
*bufp
;
252 struct re_registers
*regs
;
254 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
257 weak_alias (__re_match
, re_match
)
261 re_search (bufp
, string
, length
, start
, range
, regs
)
262 struct re_pattern_buffer
*bufp
;
264 int length
, start
, range
;
265 struct re_registers
*regs
;
267 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
270 weak_alias (__re_search
, re_search
)
274 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
275 struct re_pattern_buffer
*bufp
;
276 const char *string1
, *string2
;
277 int length1
, length2
, start
, stop
;
278 struct re_registers
*regs
;
280 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
281 start
, 0, regs
, stop
, 1);
284 weak_alias (__re_match_2
, re_match_2
)
288 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
289 struct re_pattern_buffer
*bufp
;
290 const char *string1
, *string2
;
291 int length1
, length2
, start
, range
, stop
;
292 struct re_registers
*regs
;
294 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
295 start
, range
, regs
, stop
, 0);
298 weak_alias (__re_search_2
, re_search_2
)
302 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
304 struct re_pattern_buffer
*bufp
;
305 const char *string1
, *string2
;
306 int length1
, length2
, start
, range
, stop
, ret_len
;
307 struct re_registers
*regs
;
311 int len
= length1
+ length2
;
314 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
317 /* Concatenate the strings. */
321 char *s
= re_malloc (char, len
);
323 if (BE (s
== NULL
, 0))
325 memcpy (s
, string1
, length1
);
326 memcpy (s
+ length1
, string2
, length2
);
335 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
338 re_free ((char *) str
);
342 /* The parameters have the same meaning as those of re_search.
343 Additional parameters:
344 If RET_LEN is nonzero the length of the match is returned (re_match style);
345 otherwise the position of the match is returned. */
348 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
349 struct re_pattern_buffer
*bufp
;
351 int length
, start
, range
, stop
, ret_len
;
352 struct re_registers
*regs
;
354 reg_errcode_t result
;
359 /* Check for out-of-range. */
360 if (BE (start
< 0 || start
> length
|| range
< 0, 0))
362 if (BE (start
+ range
> length
, 0))
363 range
= length
- start
;
365 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
366 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
368 /* Compile fastmap if we haven't yet. */
369 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
370 re_compile_fastmap (bufp
);
372 if (BE (bufp
->no_sub
, 0))
375 /* We need at least 1 register. */
378 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
379 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
381 nregs
= regs
->num_regs
;
382 if (BE (nregs
< 1, 0))
384 /* Nothing can be copied to regs. */
390 nregs
= bufp
->re_nsub
+ 1;
391 pmatch
= re_malloc (regmatch_t
, nregs
);
392 if (BE (pmatch
== NULL
, 0))
395 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
396 nregs
, pmatch
, eflags
);
400 /* I hope we needn't fill ther regs with -1's when no match was found. */
401 if (result
!= REG_NOERROR
)
403 else if (regs
!= NULL
)
405 /* If caller wants register contents data back, copy them. */
406 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
407 bufp
->regs_allocated
);
408 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
412 if (BE (rval
== 0, 1))
416 assert (pmatch
[0].rm_so
== start
);
417 rval
= pmatch
[0].rm_eo
- start
;
420 rval
= pmatch
[0].rm_so
;
427 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
428 struct re_registers
*regs
;
430 int nregs
, regs_allocated
;
432 int rval
= REGS_REALLOCATE
;
434 int need_regs
= nregs
+ 1;
435 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
438 /* Have the register data arrays been allocated? */
439 if (regs_allocated
== REGS_UNALLOCATED
)
440 { /* No. So allocate them with malloc. */
441 regs
->start
= re_malloc (regoff_t
, need_regs
);
442 if (BE (regs
->start
== NULL
, 0))
443 return REGS_UNALLOCATED
;
444 regs
->end
= re_malloc (regoff_t
, need_regs
);
445 if (BE (regs
->end
== NULL
, 0))
447 re_free (regs
->start
);
448 return REGS_UNALLOCATED
;
450 regs
->num_regs
= need_regs
;
452 else if (regs_allocated
== REGS_REALLOCATE
)
453 { /* Yes. If we need more elements than were already
454 allocated, reallocate them. If we need fewer, just
456 if (need_regs
> regs
->num_regs
)
458 regs
->start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
459 if (BE (regs
->start
== NULL
, 0))
461 if (regs
->end
!= NULL
)
463 return REGS_UNALLOCATED
;
465 regs
->end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
466 if (BE (regs
->end
== NULL
, 0))
468 re_free (regs
->start
);
469 return REGS_UNALLOCATED
;
471 regs
->num_regs
= need_regs
;
476 assert (regs_allocated
== REGS_FIXED
);
477 /* This function may not be called with REGS_FIXED and nregs too big. */
478 assert (regs
->num_regs
>= nregs
);
483 for (i
= 0; i
< nregs
; ++i
)
485 regs
->start
[i
] = pmatch
[i
].rm_so
;
486 regs
->end
[i
] = pmatch
[i
].rm_eo
;
488 for ( ; i
< regs
->num_regs
; ++i
)
489 regs
->start
[i
] = regs
->end
[i
] = -1;
494 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
495 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
496 this memory for recording register information. STARTS and ENDS
497 must be allocated using the malloc library routine, and must each
498 be at least NUM_REGS * sizeof (regoff_t) bytes long.
500 If NUM_REGS == 0, then subsequent matches should allocate their own
503 Unless this function is called, the first search or match using
504 PATTERN_BUFFER will allocate its own register data, without
505 freeing the old data. */
508 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
509 struct re_pattern_buffer
*bufp
;
510 struct re_registers
*regs
;
512 regoff_t
*starts
, *ends
;
516 bufp
->regs_allocated
= REGS_REALLOCATE
;
517 regs
->num_regs
= num_regs
;
518 regs
->start
= starts
;
523 bufp
->regs_allocated
= REGS_UNALLOCATED
;
525 regs
->start
= regs
->end
= (regoff_t
*) 0;
529 weak_alias (__re_set_registers
, re_set_registers
)
532 /* Entry points compatible with 4.2 BSD regex library. We don't define
533 them unless specifically requested. */
535 #if defined _REGEX_RE_COMP || defined _LIBC
543 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
545 #endif /* _REGEX_RE_COMP */
547 static re_node_set empty_set
;
549 /* Internal entry point. */
551 /* Searches for a compiled pattern PREG in the string STRING, whose
552 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
553 mingings with regexec. START, and RANGE have the same meanings
555 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
556 otherwise return the error code.
557 Note: We assume front end functions already check ranges.
558 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
561 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
565 int length
, start
, range
, stop
, eflags
;
570 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
572 int left_lim
, right_lim
, incr
;
573 int fl_longest_match
, match_first
, match_last
= -1;
574 re_match_context_t mctx
;
575 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
)
576 ? preg
->fastmap
: NULL
);
578 /* Check if the DFA haven't been compiled. */
579 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
580 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
581 || dfa
->init_state_begbuf
== NULL
, 0))
584 re_node_set_init_empty (&empty_set
);
586 /* We must check the longest matching, if nmatch > 0. */
587 fl_longest_match
= (nmatch
!= 0);
589 err
= re_string_allocate (&input
, string
, length
, dfa
->nodes_len
+ 1,
590 preg
->translate
, preg
->syntax
& RE_ICASE
);
591 if (BE (err
!= REG_NOERROR
, 0))
595 err
= match_ctx_init (&mctx
, eflags
, &input
, dfa
->nbackref
* 2);
596 if (BE (err
!= REG_NOERROR
, 0))
599 /* We will log all the DFA states through which the dfa pass,
600 if nmatch > 1, or this dfa has "multibyte node", which is a
601 back-reference or a node which can accept multibyte character or
602 multi character collating element. */
603 if (nmatch
> 1 || dfa
->has_mb_node
)
605 mctx
.state_log
= re_malloc (re_dfastate_t
*, dfa
->nodes_len
+ 1);
606 if (BE (mctx
.state_log
== NULL
, 0))
610 mctx
.state_log
= NULL
;
613 /* We assume front-end functions already check them. */
614 assert (start
+ range
>= 0 && start
+ range
<= length
);
618 input
.tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
619 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
621 /* Check incrementally whether of not the input string match. */
622 incr
= (range
< 0) ? -1 : 1;
623 left_lim
= (range
< 0) ? start
+ range
: start
;
624 right_lim
= (range
< 0) ? start
: start
+ range
;
628 /* At first get the current byte from input string. */
630 if (MB_CUR_MAX
> 1 && (preg
->syntax
& RE_ICASE
|| preg
->translate
))
632 /* In this case, we can't determin easily the current byte,
633 since it might be a component byte of a multibyte character.
634 Then we use the constructed buffer instead. */
635 /* If MATCH_FIRST is out of the valid range, reconstruct the
637 if (input
.raw_mbs_idx
+ input
.valid_len
<= match_first
)
638 re_string_reconstruct (&input
, match_first
, eflags
,
639 preg
->newline_anchor
);
640 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
641 Note that MATCH_FIRST must not be smaller than 0. */
642 ch
= ((match_first
>= length
) ? 0
643 : re_string_byte_at (&input
, match_first
- input
.raw_mbs_idx
));
647 /* We apply translate/conversion manually, since it is trivial
649 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
650 Note that MATCH_FIRST must not be smaller than 0. */
651 ch
= (match_first
< length
) ? (unsigned char)string
[match_first
] : 0;
652 /* Apply translation if we need. */
653 ch
= preg
->translate
? preg
->translate
[ch
] : ch
;
654 /* In case of case insensitive mode, convert to upper case. */
655 ch
= ((preg
->syntax
& RE_ICASE
) && islower (ch
)) ? toupper (ch
) : ch
;
658 /* Eliminate inappropriate one by fastmap. */
659 if (preg
->can_be_null
|| fastmap
== NULL
|| fastmap
[ch
])
661 /* Reconstruct the buffers so that the matcher can assume that
662 the matching starts from the begining of the buffer. */
663 re_string_reconstruct (&input
, match_first
, eflags
,
664 preg
->newline_anchor
);
665 #ifdef RE_ENABLE_I18N
666 /* Eliminate it when it is a component of a multibyte character
667 and isn't the head of a multibyte character. */
668 if (MB_CUR_MAX
== 1 || re_string_first_byte (&input
, 0))
671 /* It seems to be appropriate one, then use the matcher. */
672 /* We assume that the matching starts from 0. */
673 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
674 match_last
= check_matching (preg
, &mctx
, 0, fl_longest_match
);
675 if (match_last
!= -1)
677 if (BE (match_last
== -2, 0))
680 break; /* We found a matching. */
684 /* Update counter. */
686 if (match_first
< left_lim
|| right_lim
< match_first
)
690 /* Set pmatch[] if we need. */
691 if (match_last
!= -1 && nmatch
> 0)
695 /* Initialize registers. */
696 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
697 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
699 /* Set the points where matching start/end. */
701 mctx
.match_last
= pmatch
[0].rm_eo
= match_last
;
703 if (!preg
->no_sub
&& nmatch
> 1)
705 /* We need the ranges of all the subexpressions. */
707 re_dfastate_t
**sifted_states
;
708 re_dfastate_t
**lim_states
= NULL
;
709 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
710 re_sift_context_t sctx
;
712 assert (mctx
.state_log
!= NULL
);
714 halt_node
= check_halt_state_context (preg
, pstate
, &mctx
,
716 if (dfa
->has_plural_match
)
718 match_ctx_clear_flag (&mctx
);
719 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
720 if (BE (sifted_states
== NULL
, 0))
724 lim_states
= calloc (sizeof (re_dfastate_t
*),
726 if (BE (lim_states
== NULL
, 0))
729 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
731 err
= sift_states_backward (preg
, &mctx
, &sctx
);
732 if (BE (err
!= REG_NOERROR
, 0))
734 if (lim_states
!= NULL
)
736 err
= merge_state_array (dfa
, sifted_states
, lim_states
,
738 if (BE (err
!= REG_NOERROR
, 0))
740 re_free (lim_states
);
742 re_node_set_free (&sctx
.limits
);
743 re_free (mctx
.state_log
);
744 mctx
.state_log
= sifted_states
;
746 mctx
.last_node
= halt_node
;
747 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
748 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
749 if (BE (err
!= REG_NOERROR
, 0))
753 /* At last, add the offset to the each registers, since we slided
754 the buffers so that We can assume that the matching starts from 0. */
755 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
756 if (pmatch
[reg_idx
].rm_so
!= -1)
758 pmatch
[reg_idx
].rm_so
+= match_first
;
759 pmatch
[reg_idx
].rm_eo
+= match_first
;
763 re_free (mctx
.state_log
);
765 match_ctx_free (&mctx
);
766 re_string_destruct (&input
);
768 return (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
771 /* Acquire an initial state and return it.
772 We must select appropriate initial state depending on the context,
773 since initial states may have constraints like "\<", "^", etc.. */
775 static inline re_dfastate_t
*
776 acquire_init_state_context (err
, preg
, mctx
, idx
)
779 const re_match_context_t
*mctx
;
782 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
785 if (dfa
->init_state
->has_constraint
)
787 unsigned int context
;
788 context
= re_string_context_at (mctx
->input
, idx
- 1, mctx
->eflags
,
789 preg
->newline_anchor
);
790 if (IS_WORD_CONTEXT (context
))
791 return dfa
->init_state_word
;
792 else if (IS_ORDINARY_CONTEXT (context
))
793 return dfa
->init_state
;
794 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
795 return dfa
->init_state_begbuf
;
796 else if (IS_NEWLINE_CONTEXT (context
))
797 return dfa
->init_state_nl
;
798 else if (IS_BEGBUF_CONTEXT (context
))
800 /* It is relatively rare case, then calculate on demand. */
801 return re_acquire_state_context (err
, dfa
,
802 dfa
->init_state
->entrance_nodes
,
806 /* Must not happen? */
807 return dfa
->init_state
;
810 return dfa
->init_state
;
813 /* Check whether the regular expression match input string INPUT or not,
814 and return the index where the matching end, return -1 if not match,
815 or return -2 in case of an error.
816 FL_SEARCH means we must search where the matching starts,
817 FL_LONGEST_MATCH means we want the POSIX longest matching.
818 Note that the matcher assume that the maching starts from the current
819 index of the buffer. */
822 check_matching (preg
, mctx
, fl_search
, fl_longest_match
)
824 re_match_context_t
*mctx
;
825 int fl_search
, fl_longest_match
;
830 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
831 re_dfastate_t
*cur_state
;
833 cur_state
= acquire_init_state_context (&err
, preg
, mctx
, cur_str_idx
);
834 /* An initial state must not be NULL(invalid state). */
835 if (BE (cur_state
== NULL
, 0))
837 if (mctx
->state_log
!= NULL
)
838 mctx
->state_log
[cur_str_idx
] = cur_state
;
840 if (cur_state
->has_backref
)
843 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
844 for (i
= 0; i
< cur_state
->nodes
.nelem
; ++i
)
846 re_token_type_t type
;
847 int node
= cur_state
->nodes
.elems
[i
];
848 int entity
= (dfa
->nodes
[node
].type
!= OP_CONTEXT_NODE
? node
849 : dfa
->nodes
[node
].opr
.ctx_info
->entity
);
850 type
= dfa
->nodes
[entity
].type
;
851 if (type
== OP_BACK_REF
)
854 for (clexp_idx
= 0; clexp_idx
< cur_state
->nodes
.nelem
;
857 re_token_t
*clexp_node
;
858 clexp_node
= dfa
->nodes
+ cur_state
->nodes
.elems
[clexp_idx
];
859 if (clexp_node
->type
== OP_CLOSE_SUBEXP
860 && clexp_node
->opr
.idx
+ 1== dfa
->nodes
[entity
].opr
.idx
)
862 err
= match_ctx_add_entry (mctx
, node
, 0, 0, 0);
863 if (BE (err
!= REG_NOERROR
, 0))
872 /* If the RE accepts NULL string. */
875 if (!cur_state
->has_constraint
876 || check_halt_state_context (preg
, cur_state
, mctx
, cur_str_idx
))
878 if (!fl_longest_match
)
882 match_last
= cur_str_idx
;
888 while (!re_string_eoi (mctx
->input
))
890 cur_state
= transit_state (&err
, preg
, mctx
, cur_state
,
891 fl_search
&& !match
);
892 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
894 cur_str_idx
= re_string_cur_idx (mctx
->input
);
895 if (BE (err
!= REG_NOERROR
, 0))
897 if (fl_search
&& !match
)
899 /* Restart from initial state, since we are searching
900 the point from where matching start. */
901 #ifdef RE_ENABLE_I18N
903 || re_string_first_byte (mctx
->input
, cur_str_idx
))
904 #endif /* RE_ENABLE_I18N */
905 cur_state
= acquire_init_state_context (&err
, preg
, mctx
,
907 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
909 if (mctx
->state_log
!= NULL
)
910 mctx
->state_log
[cur_str_idx
] = cur_state
;
912 else if (!fl_longest_match
&& match
)
914 else /* (fl_longest_match && match) || (!fl_search && !match) */
916 if (mctx
->state_log
== NULL
)
920 int max
= mctx
->state_log_top
;
921 for (; cur_str_idx
<= max
; ++cur_str_idx
)
922 if (mctx
->state_log
[cur_str_idx
] != NULL
)
924 if (cur_str_idx
> max
)
930 if (cur_state
!= NULL
&& cur_state
->halt
)
932 /* Reached at a halt state.
933 Check the halt state can satisfy the current context. */
934 if (!cur_state
->has_constraint
935 || check_halt_state_context (preg
, cur_state
, mctx
,
936 re_string_cur_idx (mctx
->input
)))
938 /* We found an appropriate halt state. */
939 match_last
= re_string_cur_idx (mctx
->input
);
941 if (!fl_longest_match
)
949 /* Check NODE match the current context. */
951 static int check_halt_node_context (dfa
, node
, context
)
954 unsigned int context
;
957 re_token_type_t type
= dfa
->nodes
[node
].type
;
958 if (type
== END_OF_RE
)
960 if (type
!= OP_CONTEXT_NODE
)
962 entity
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
963 if (dfa
->nodes
[entity
].type
!= END_OF_RE
964 || NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[node
].constraint
, context
))
969 /* Check the halt state STATE match the current context.
970 Return 0 if not match, if the node, STATE has, is a halt node and
971 match the context, return the node. */
974 check_halt_state_context (preg
, state
, mctx
, idx
)
976 const re_dfastate_t
*state
;
977 const re_match_context_t
*mctx
;
980 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
982 unsigned int context
;
984 assert (state
->halt
);
986 context
= re_string_context_at (mctx
->input
, idx
, mctx
->eflags
,
987 preg
->newline_anchor
);
988 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
989 if (check_halt_node_context (dfa
, state
->nodes
.elems
[i
], context
))
990 return state
->nodes
.elems
[i
];
994 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
995 corresponding to the DFA).
996 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1000 proceed_next_node (preg
, nregs
, regs
, mctx
, pidx
, node
, eps_via_nodes
, fs
)
1001 const regex_t
*preg
;
1003 const re_match_context_t
*mctx
;
1004 int nregs
, *pidx
, node
;
1005 re_node_set
*eps_via_nodes
;
1006 struct re_fail_stack_t
*fs
;
1008 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1009 int i
, err
, dest_node
, cur_entity
;
1011 cur_entity
= ((dfa
->nodes
[node
].type
== OP_CONTEXT_NODE
)
1012 ? dfa
->nodes
[node
].opr
.ctx_info
->entity
: node
);
1013 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1015 int ndest
, dest_nodes
[2], dest_entities
[2];
1016 err
= re_node_set_insert (eps_via_nodes
, node
);
1017 if (BE (err
< 0, 0))
1019 /* Pick up valid destinations. */
1020 for (ndest
= 0, i
= 0; i
< mctx
->state_log
[*pidx
]->nodes
.nelem
; ++i
)
1022 int candidate
= mctx
->state_log
[*pidx
]->nodes
.elems
[i
];
1024 entity
= ((dfa
->nodes
[candidate
].type
== OP_CONTEXT_NODE
)
1025 ? dfa
->nodes
[candidate
].opr
.ctx_info
->entity
: candidate
);
1026 if (!re_node_set_contains (dfa
->edests
+ node
, entity
))
1028 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1029 dest_entities
[0] = (ndest
== 0) ? entity
: dest_entities
[0];
1030 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1031 dest_entities
[1] = (ndest
== 1) ? entity
: dest_entities
[1];
1035 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1036 if (dest_entities
[0] > dest_entities
[1])
1038 int swap_work
= dest_nodes
[0];
1039 dest_nodes
[0] = dest_nodes
[1];
1040 dest_nodes
[1] = swap_work
;
1042 /* In order to avoid infinite loop like "(a*)*". */
1043 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1044 return dest_nodes
[1];
1046 push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
, eps_via_nodes
);
1047 return dest_nodes
[0];
1051 int naccepted
= 0, entity
= node
;
1052 re_token_type_t type
= dfa
->nodes
[node
].type
;
1053 if (type
== OP_CONTEXT_NODE
)
1055 entity
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
1056 type
= dfa
->nodes
[entity
].type
;
1059 #ifdef RE_ENABLE_I18N
1060 if (ACCEPT_MB_NODE (type
))
1061 naccepted
= check_node_accept_bytes (preg
, entity
, mctx
->input
, *pidx
);
1063 #endif /* RE_ENABLE_I18N */
1064 if (type
== OP_BACK_REF
)
1066 int subexp_idx
= dfa
->nodes
[entity
].opr
.idx
;
1067 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1070 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1074 char *buf
= re_string_get_buffer (mctx
->input
);
1075 if (strncmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1083 err
= re_node_set_insert (eps_via_nodes
, node
);
1084 if (BE (err
< 0, 0))
1086 dest_node
= dfa
->nexts
[node
];
1087 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1090 for (i
= 0; i
< mctx
->state_log
[*pidx
]->nodes
.nelem
; ++i
)
1092 dest_node
= mctx
->state_log
[*pidx
]->nodes
.elems
[i
];
1093 if ((dfa
->nodes
[dest_node
].type
== OP_CONTEXT_NODE
1094 && (dfa
->nexts
[node
]
1095 == dfa
->nodes
[dest_node
].opr
.ctx_info
->entity
)))
1102 || check_node_accept (preg
, dfa
->nodes
+ node
, mctx
, *pidx
))
1104 dest_node
= dfa
->nexts
[node
];
1105 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1106 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1107 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1110 re_node_set_empty (eps_via_nodes
);
1117 static reg_errcode_t
1118 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1119 struct re_fail_stack_t
*fs
;
1120 int str_idx
, *dests
, nregs
;
1122 re_node_set
*eps_via_nodes
;
1125 int num
= fs
->num
++;
1126 if (fs
->num
== fs
->alloc
)
1129 fs
->stack
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1131 if (fs
->stack
== NULL
)
1134 fs
->stack
[num
].idx
= str_idx
;
1135 fs
->stack
[num
].node
= dests
[1];
1136 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1137 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1138 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1143 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1144 struct re_fail_stack_t
*fs
;
1147 re_node_set
*eps_via_nodes
;
1149 int num
= --fs
->num
;
1151 *pidx
= fs
->stack
[num
].idx
;
1152 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1153 re_node_set_free (eps_via_nodes
);
1154 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1155 return fs
->stack
[num
].node
;
1158 /* Set the positions where the subexpressions are starts/ends to registers
1160 Note: We assume that pmatch[0] is already set, and
1161 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1163 static reg_errcode_t
1164 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1165 const regex_t
*preg
;
1166 const re_match_context_t
*mctx
;
1171 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1172 int idx
, cur_node
, real_nmatch
;
1173 re_node_set eps_via_nodes
;
1174 struct re_fail_stack_t
*fs
;
1175 struct re_fail_stack_t fs_body
= {0, 2, NULL
};
1177 assert (nmatch
> 1);
1178 assert (mctx
->state_log
!= NULL
);
1183 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1187 cur_node
= dfa
->init_node
;
1188 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1189 re_node_set_init_empty (&eps_via_nodes
);
1190 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1192 update_regs (dfa
, pmatch
, cur_node
, idx
, real_nmatch
);
1193 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1198 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1199 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1201 if (reg_idx
== nmatch
)
1203 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1210 /* Proceed to next node. */
1211 cur_node
= proceed_next_node (preg
, nmatch
, pmatch
, mctx
, &idx
, cur_node
,
1212 &eps_via_nodes
, fs
);
1214 if (BE (cur_node
< 0, 0))
1219 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1225 re_node_set_free (&eps_via_nodes
);
1230 update_regs (dfa
, pmatch
, cur_node
, cur_idx
, nmatch
)
1233 int cur_node
, cur_idx
, nmatch
;
1235 int type
= dfa
->nodes
[cur_node
].type
;
1237 if (type
!= OP_OPEN_SUBEXP
&& type
!= OP_CLOSE_SUBEXP
)
1239 reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1240 if (reg_num
>= nmatch
)
1242 if (type
== OP_OPEN_SUBEXP
)
1244 /* We are at the first node of this sub expression. */
1245 pmatch
[reg_num
].rm_so
= cur_idx
;
1246 pmatch
[reg_num
].rm_eo
= -1;
1248 else if (type
== OP_CLOSE_SUBEXP
)
1249 /* We are at the first node of this sub expression. */
1250 pmatch
[reg_num
].rm_eo
= cur_idx
;
1253 #define NUMBER_OF_STATE 1
1255 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1256 and sift the nodes in each states according to the following rules.
1257 Updated state_log will be wrote to STATE_LOG.
1259 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1260 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1261 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1262 the LAST_NODE, we throw away the node `a'.
1263 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1264 string `s' and transit to `b':
1265 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1267 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1268 throwed away, we throw away the node `a'.
1269 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1270 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1272 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1273 we throw away the node `a'. */
1275 #define STATE_NODE_CONTAINS(state,node) \
1276 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1278 static reg_errcode_t
1279 sift_states_backward (preg
, mctx
, sctx
)
1280 const regex_t
*preg
;
1281 re_match_context_t
*mctx
;
1282 re_sift_context_t
*sctx
;
1285 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1287 int str_idx
= sctx
->last_str_idx
;
1288 re_node_set cur_dest
;
1289 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1292 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1294 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1296 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1297 transit to the last_node and the last_node itself. */
1298 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1299 if (BE (err
!= REG_NOERROR
, 0))
1301 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1302 if (BE (err
!= REG_NOERROR
, 0))
1305 /* Then check each states in the state_log. */
1309 /* Update counters. */
1310 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1311 if (null_cnt
> mctx
->max_mb_elem_len
)
1313 memset (sctx
->sifted_states
, '\0',
1314 sizeof (re_dfastate_t
*) * str_idx
);
1317 re_node_set_empty (&cur_dest
);
1319 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1320 : &mctx
->state_log
[str_idx
]->nodes
);
1322 /* Then build the next sifted state.
1323 We build the next sifted state on `cur_dest', and update
1324 `sifted_states[str_idx]' with `cur_dest'.
1326 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1327 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1328 for (i
= 0; i
< cur_src
->nelem
; i
++)
1330 int prev_node
= cur_src
->elems
[i
];
1331 int entity
= prev_node
;
1333 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1335 if (IS_EPSILON_NODE(type
))
1337 if (type
== OP_CONTEXT_NODE
)
1339 entity
= dfa
->nodes
[prev_node
].opr
.ctx_info
->entity
;
1340 type
= dfa
->nodes
[entity
].type
;
1342 #ifdef RE_ENABLE_I18N
1343 /* If the node may accept `multi byte'. */
1344 if (ACCEPT_MB_NODE (type
))
1345 naccepted
= sift_states_iter_mb (preg
, mctx
, sctx
, entity
, str_idx
,
1346 sctx
->last_str_idx
);
1348 #endif /* RE_ENABLE_I18N */
1349 /* We don't check backreferences here.
1350 See update_cur_sifted_state(). */
1353 && check_node_accept (preg
, dfa
->nodes
+ prev_node
, mctx
,
1355 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1356 dfa
->nexts
[prev_node
]))
1362 if (sctx
->limits
.nelem
)
1364 int to_idx
= str_idx
+ naccepted
;
1365 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
,
1366 dfa
->nexts
[prev_node
], to_idx
,
1367 prev_node
, str_idx
))
1370 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1371 if (BE (ret
== -1, 0))
1375 /* Add all the nodes which satisfy the following conditions:
1376 - It can epsilon transit to a node in CUR_DEST.
1378 And update state_log. */
1379 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1380 if (BE (err
!= REG_NOERROR
, 0))
1384 re_node_set_free (&cur_dest
);
1388 /* Helper functions. */
1390 static inline reg_errcode_t
1391 clean_state_log_if_need (mctx
, next_state_log_idx
)
1392 re_match_context_t
*mctx
;
1393 int next_state_log_idx
;
1395 int top
= mctx
->state_log_top
;
1397 if (next_state_log_idx
>= mctx
->input
->bufs_len
1398 || (next_state_log_idx
>= mctx
->input
->valid_len
1399 && mctx
->input
->valid_len
< mctx
->input
->len
))
1402 err
= extend_buffers (mctx
);
1403 if (BE (err
!= REG_NOERROR
, 0))
1407 if (top
< next_state_log_idx
)
1409 memset (mctx
->state_log
+ top
+ 1, '\0',
1410 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1411 mctx
->state_log_top
= next_state_log_idx
;
1416 static reg_errcode_t
merge_state_array (dfa
, dst
, src
, num
)
1418 re_dfastate_t
**dst
;
1419 re_dfastate_t
**src
;
1424 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1426 if (dst
[st_idx
] == NULL
)
1427 dst
[st_idx
] = src
[st_idx
];
1428 else if (src
[st_idx
] != NULL
)
1430 re_node_set merged_set
;
1431 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1432 &src
[st_idx
]->nodes
);
1433 if (BE (err
!= REG_NOERROR
, 0))
1435 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1436 if (BE (err
!= REG_NOERROR
, 0))
1438 re_node_set_free (&merged_set
);
1444 static reg_errcode_t
1445 update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1446 const regex_t
*preg
;
1447 re_match_context_t
*mctx
;
1448 re_sift_context_t
*sctx
;
1450 re_node_set
*dest_nodes
;
1453 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1454 const re_node_set
*candidates
;
1455 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1456 : &mctx
->state_log
[str_idx
]->nodes
);
1458 /* At first, add the nodes which can epsilon transit to a node in
1460 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1461 if (BE (err
!= REG_NOERROR
, 0))
1464 /* Then, check the limitations in the current sift_context. */
1465 if (sctx
->limits
.nelem
)
1467 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1468 mctx
->bkref_ents
, str_idx
);
1469 if (BE (err
!= REG_NOERROR
, 0))
1473 /* Update state_log. */
1474 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1475 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1478 /* If we are searching for the subexpression candidates.
1479 Note that we were from transit_state_bkref_loop() in this case. */
1480 if (sctx
->check_subexp
)
1482 err
= search_subexp (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1483 if (BE (err
!= REG_NOERROR
, 0))
1487 if ((mctx
->state_log
[str_idx
] != NULL
1488 && mctx
->state_log
[str_idx
]->has_backref
))
1490 err
= sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1491 if (BE (err
!= REG_NOERROR
, 0))
1497 static reg_errcode_t
1498 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1500 re_node_set
*dest_nodes
;
1501 const re_node_set
*candidates
;
1505 re_node_set src_copy
;
1507 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1508 if (BE (err
!= REG_NOERROR
, 0))
1510 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1512 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1514 + src_copy
.elems
[src_idx
]);
1515 if (BE (err
!= REG_NOERROR
, 0))
1518 re_node_set_free (&src_copy
);
1522 static reg_errcode_t
1523 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1526 re_node_set
*dest_nodes
;
1527 const re_node_set
*candidates
;
1531 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1532 re_node_set except_nodes
;
1533 re_node_set_init_empty (&except_nodes
);
1534 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1536 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1537 if (cur_node
== node
)
1539 if (dfa
->edests
[cur_node
].nelem
)
1541 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1542 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1543 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1544 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1545 && re_node_set_contains (dest_nodes
, edst1
))
1547 && !re_node_set_contains (inv_eclosure
, edst2
)
1548 && re_node_set_contains (dest_nodes
, edst2
)))
1550 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1551 dfa
->inveclosures
+ cur_node
);
1552 if (BE (err
!= REG_NOERROR
, 0))
1557 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1559 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1560 if (!re_node_set_contains (&except_nodes
, cur_node
))
1562 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1563 re_node_set_remove_at (dest_nodes
, idx
);
1566 re_node_set_free (&except_nodes
);
1571 check_dst_limits (dfa
, limits
, mctx
, dst_node
, dst_idx
, src_node
, src_idx
)
1573 re_node_set
*limits
;
1574 re_match_context_t
*mctx
;
1575 int dst_node
, dst_idx
, src_node
, src_idx
;
1577 int lim_idx
, src_pos
, dst_pos
;
1579 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1581 int bkref
, subexp_idx
/*, node_idx, cls_node*/;
1582 struct re_backref_cache_entry
*ent
;
1583 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1584 bkref
= (dfa
->nodes
[ent
->node
].type
== OP_CONTEXT_NODE
1585 ? dfa
->nodes
[ent
->node
].opr
.ctx_info
->entity
: ent
->node
);
1586 subexp_idx
= dfa
->nodes
[bkref
].opr
.idx
- 1;
1588 dst_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1589 dfa
->eclosures
+ dst_node
,
1590 subexp_idx
, dst_node
, dst_idx
);
1591 src_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1592 dfa
->eclosures
+ src_node
,
1593 subexp_idx
, src_node
, src_idx
);
1596 <src> <dst> ( <subexp> )
1597 ( <subexp> ) <src> <dst>
1598 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1599 if (src_pos
== dst_pos
)
1600 continue; /* This is unrelated limitation. */
1608 check_dst_limits_calc_pos (dfa
, mctx
, limit
, eclosures
, subexp_idx
, node
,
1611 re_match_context_t
*mctx
;
1612 re_node_set
*eclosures
;
1613 int limit
, subexp_idx
, node
, str_idx
;
1615 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1616 int pos
= (str_idx
< lim
->subexp_from
? -1
1617 : (lim
->subexp_to
< str_idx
? 1 : 0));
1619 && (str_idx
== lim
->subexp_from
|| str_idx
== lim
->subexp_to
))
1622 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1624 int node
= eclosures
->elems
[node_idx
];
1626 re_token_type_t type
= dfa
->nodes
[node
].type
;
1627 if (type
== OP_CONTEXT_NODE
)
1629 entity
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
1630 type
= dfa
->nodes
[entity
].type
;
1632 if (type
== OP_BACK_REF
)
1635 for (bi
= 0; bi
< mctx
->nbkref_ents
; ++bi
)
1637 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1638 if (ent
->node
== node
&& ent
->subexp_from
== ent
->subexp_to
1639 && ent
->str_idx
== str_idx
)
1642 dst
= dfa
->nexts
[node
];
1643 cpos
= check_dst_limits_calc_pos (dfa
, mctx
, limit
,
1644 dfa
->eclosures
+ dst
,
1647 if ((str_idx
== lim
->subexp_from
&& cpos
== -1)
1648 || (str_idx
== lim
->subexp_to
&& cpos
== 0))
1653 if (type
== OP_OPEN_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1654 && str_idx
== lim
->subexp_from
)
1659 if (type
== OP_CLOSE_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1660 && str_idx
== lim
->subexp_to
)
1663 if (node_idx
== eclosures
->nelem
&& str_idx
== lim
->subexp_to
)
1669 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1670 which are against limitations from DEST_NODES. */
1672 static reg_errcode_t
1673 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1675 re_node_set
*dest_nodes
;
1676 const re_node_set
*candidates
;
1677 re_node_set
*limits
;
1678 struct re_backref_cache_entry
*bkref_ents
;
1682 int node_idx
, lim_idx
;
1684 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1686 int bkref
, subexp_idx
;
1687 struct re_backref_cache_entry
*ent
;
1688 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1690 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1691 continue; /* This is unrelated limitation. */
1693 bkref
= (dfa
->nodes
[ent
->node
].type
== OP_CONTEXT_NODE
1694 ? dfa
->nodes
[ent
->node
].opr
.ctx_info
->entity
: ent
->node
);
1695 subexp_idx
= dfa
->nodes
[bkref
].opr
.idx
- 1;
1697 if (ent
->subexp_to
== str_idx
)
1701 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1703 int node
= dest_nodes
->elems
[node_idx
];
1704 re_token_type_t type
= dfa
->nodes
[node
].type
;
1705 if (type
== OP_OPEN_SUBEXP
1706 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1708 else if (type
== OP_CLOSE_SUBEXP
1709 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1713 /* Check the limitation of the open subexpression. */
1714 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1717 err
= sub_epsilon_src_nodes(dfa
, ops_node
, dest_nodes
,
1719 if (BE (err
!= REG_NOERROR
, 0))
1722 /* Check the limitation of the close subexpression. */
1723 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1725 int node
= dest_nodes
->elems
[node_idx
];
1726 if (!re_node_set_contains (dfa
->inveclosures
+ node
, cls_node
)
1727 && !re_node_set_contains (dfa
->eclosures
+ node
, cls_node
))
1729 /* It is against this limitation.
1730 Remove it form the current sifted state. */
1731 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1733 if (BE (err
!= REG_NOERROR
, 0))
1739 else /* (ent->subexp_to != str_idx) */
1741 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1743 int node
= dest_nodes
->elems
[node_idx
];
1744 re_token_type_t type
= dfa
->nodes
[node
].type
;
1745 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1747 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1749 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1750 || (type
== OP_OPEN_SUBEXP
))
1752 /* It is against this limitation.
1753 Remove it form the current sifted state. */
1754 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1756 if (BE (err
!= REG_NOERROR
, 0))
1766 /* Search for the top (in case of sctx->check_subexp < 0) or the
1767 bottom (in case of sctx->check_subexp > 0) of the subexpressions
1768 which the backreference sctx->cur_bkref can match. */
1770 static reg_errcode_t
1771 search_subexp (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1772 const regex_t
*preg
;
1773 re_match_context_t
*mctx
;
1774 re_sift_context_t
*sctx
;
1776 re_node_set
*dest_nodes
;
1779 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1780 re_sift_context_t local_sctx
;
1782 const re_node_set
*candidates
;
1783 re_dfastate_t
**lim_states
= NULL
;
1784 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1785 : &mctx
->state_log
[str_idx
]->nodes
);
1786 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1788 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1790 re_token_type_t type
;
1792 node
= dest_nodes
->elems
[node_idx
];
1793 type
= dfa
->nodes
[node
].type
;
1794 entity
= (type
!= OP_CONTEXT_NODE
? node
1795 : dfa
->nodes
[node
].opr
.ctx_info
->entity
);
1796 type
= (type
!= OP_CONTEXT_NODE
? type
: dfa
->nodes
[entity
].type
);
1798 if (type
== OP_CLOSE_SUBEXP
1799 && sctx
->check_subexp
== dfa
->nodes
[node
].opr
.idx
+ 1)
1801 re_dfastate_t
*cur_state
;
1802 /* Found the bottom of the subexpression, then search for the
1804 if (local_sctx
.sifted_states
== NULL
)
1806 /* It hasn't been initialized yet, initialize it now. */
1808 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
1809 if (BE (err
!= REG_NOERROR
, 0))
1812 local_sctx
.check_subexp
= -sctx
->check_subexp
;
1813 local_sctx
.limited_states
= sctx
->limited_states
;
1814 local_sctx
.last_node
= node
;
1815 local_sctx
.last_str_idx
= local_sctx
.cls_subexp_idx
= str_idx
;
1816 cur_state
= local_sctx
.sifted_states
[str_idx
];
1817 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
1818 local_sctx
.sifted_states
[str_idx
] = cur_state
;
1819 if (BE (err
!= REG_NOERROR
, 0))
1821 /* There must not 2 same node in a node set. */
1824 else if (type
== OP_OPEN_SUBEXP
1825 && -sctx
->check_subexp
== dfa
->nodes
[node
].opr
.idx
+ 1)
1827 /* Found the top of the subexpression, check that the
1828 backreference can match the input string. */
1831 int bkref_str_idx
= re_string_cur_idx (mctx
->input
);
1832 int subexp_len
= sctx
->cls_subexp_idx
- str_idx
;
1833 if (subexp_len
< 0 || bkref_str_idx
+ subexp_len
> mctx
->input
->len
)
1836 if (bkref_str_idx
+ subexp_len
> mctx
->input
->valid_len
1837 && mctx
->input
->valid_len
< mctx
->input
->len
)
1840 err
= extend_buffers (mctx
);
1841 if (BE (err
!= REG_NOERROR
, 0))
1844 buf
= (char *) re_string_get_buffer (mctx
->input
);
1845 if (strncmp (buf
+ str_idx
, buf
+ bkref_str_idx
, subexp_len
) != 0)
1848 if (sctx
->limits
.nelem
&& str_idx
> 0)
1850 re_dfastate_t
*cur_state
= sctx
->sifted_states
[str_idx
];
1851 if (lim_states
== NULL
)
1853 lim_states
= re_malloc (re_dfastate_t
*, str_idx
+ 1);
1855 if (local_sctx
.sifted_states
== NULL
)
1857 /* It hasn't been initialized yet, initialize it now. */
1859 if (BE (lim_states
== NULL
, 0))
1861 err
= re_node_set_init_copy (&local_sctx
.limits
,
1863 if (BE (err
!= REG_NOERROR
, 0))
1866 local_sctx
.check_subexp
= 0;
1867 local_sctx
.last_node
= node
;
1868 local_sctx
.last_str_idx
= str_idx
;
1869 local_sctx
.limited_states
= lim_states
;
1870 memset (lim_states
, '\0',
1871 sizeof (re_dfastate_t
*) * (str_idx
+ 1));
1872 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
1873 if (BE (err
!= REG_NOERROR
, 0))
1875 if (local_sctx
.sifted_states
[0] == NULL
1876 && local_sctx
.limited_states
[0] == NULL
)
1878 sctx
->sifted_states
[str_idx
] = cur_state
;
1881 sctx
->sifted_states
[str_idx
] = cur_state
;
1883 /* Successfully matched, add a new cache entry. */
1884 dest_str_idx
= bkref_str_idx
+ subexp_len
;
1885 err
= match_ctx_add_entry (mctx
, sctx
->cur_bkref
, bkref_str_idx
,
1886 str_idx
, sctx
->cls_subexp_idx
);
1887 if (BE (err
!= REG_NOERROR
, 0))
1889 err
= clean_state_log_if_need (mctx
, dest_str_idx
);
1890 if (BE (err
!= REG_NOERROR
, 0))
1896 /* Remove the top/bottom of the sub expression we processed. */
1897 if (node_idx
< dest_nodes
->nelem
)
1899 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
, candidates
);
1900 if (BE (err
!= REG_NOERROR
, 0))
1902 /* Update state_log. */
1903 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1904 if (BE (err
!= REG_NOERROR
, 0))
1908 if (local_sctx
.sifted_states
!= NULL
)
1909 re_node_set_free (&local_sctx
.limits
);
1910 if (lim_states
!= NULL
)
1911 re_free (lim_states
);
1915 static reg_errcode_t
1916 sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1917 const regex_t
*preg
;
1918 re_match_context_t
*mctx
;
1919 re_sift_context_t
*sctx
;
1921 re_node_set
*dest_nodes
;
1924 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1926 re_sift_context_t local_sctx
;
1927 const re_node_set
*candidates
;
1928 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1929 : &mctx
->state_log
[str_idx
]->nodes
);
1930 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1932 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
1935 int cur_bkref_idx
= re_string_cur_idx (mctx
->input
);
1936 re_token_type_t type
;
1937 node
= candidates
->elems
[node_idx
];
1938 type
= dfa
->nodes
[node
].type
;
1939 entity
= (type
!= OP_CONTEXT_NODE
? node
1940 : dfa
->nodes
[node
].opr
.ctx_info
->entity
);
1941 type
= (type
!= OP_CONTEXT_NODE
? type
: dfa
->nodes
[entity
].type
);
1942 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
1944 /* Avoid infinite loop for the REs like "()\1+". */
1945 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
1947 if (type
== OP_BACK_REF
)
1950 for (enabled_idx
= 0; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
1952 int disabled_idx
, subexp_len
, to_idx
;
1953 struct re_backref_cache_entry
*entry
;
1954 entry
= mctx
->bkref_ents
+ enabled_idx
;
1955 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
1956 to_idx
= str_idx
+ subexp_len
;
1958 if (entry
->node
!= node
|| entry
->str_idx
!= str_idx
1959 || to_idx
> sctx
->last_str_idx
1960 || sctx
->sifted_states
[to_idx
] == NULL
)
1962 if (!STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
],
1966 re_node_set
*dsts
= &sctx
->sifted_states
[to_idx
]->nodes
;
1967 for (dst_idx
= 0; dst_idx
< dsts
->nelem
; ++dst_idx
)
1969 int dst_node
= dsts
->elems
[dst_idx
];
1970 if (dfa
->nodes
[dst_node
].type
== OP_CONTEXT_NODE
1971 && (dfa
->nodes
[dst_node
].opr
.ctx_info
->entity
1972 == dfa
->nexts
[node
]))
1975 if (dst_idx
== dsts
->nelem
)
1979 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
, node
,
1980 str_idx
, dfa
->nexts
[node
], to_idx
))
1982 if (sctx
->check_subexp
== dfa
->nodes
[entity
].opr
.idx
)
1985 buf
= (char *) re_string_get_buffer (mctx
->input
);
1986 if (strncmp (buf
+ entry
->subexp_from
,
1987 buf
+ cur_bkref_idx
, subexp_len
) != 0)
1989 err
= match_ctx_add_entry (mctx
, sctx
->cur_bkref
,
1990 cur_bkref_idx
, entry
->subexp_from
,
1992 if (BE (err
!= REG_NOERROR
, 0))
1994 err
= clean_state_log_if_need (mctx
, cur_bkref_idx
1996 if (BE (err
!= REG_NOERROR
, 0))
2001 re_dfastate_t
*cur_state
;
2003 for (disabled_idx
= enabled_idx
+ 1;
2004 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
2006 struct re_backref_cache_entry
*entry2
;
2007 entry2
= mctx
->bkref_ents
+ disabled_idx
;
2008 if (entry2
->node
!= node
|| entry2
->str_idx
!= str_idx
)
2013 if (local_sctx
.sifted_states
== NULL
)
2016 err
= re_node_set_init_copy (&local_sctx
.limits
,
2018 if (BE (err
!= REG_NOERROR
, 0))
2021 local_sctx
.last_node
= node
;
2022 local_sctx
.last_str_idx
= str_idx
;
2023 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2024 if (BE (err
< 0, 0))
2026 cur_state
= local_sctx
.sifted_states
[str_idx
];
2027 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
2028 if (BE (err
!= REG_NOERROR
, 0))
2030 if (sctx
->limited_states
!= NULL
)
2032 err
= merge_state_array (dfa
, sctx
->limited_states
,
2033 local_sctx
.sifted_states
,
2035 if (BE (err
!= REG_NOERROR
, 0))
2038 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2039 re_node_set_remove_at (&local_sctx
.limits
,
2040 local_sctx
.limits
.nelem
- 1);
2044 for (enabled_idx
= 0; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2046 struct re_backref_cache_entry
*entry
;
2047 entry
= mctx
->bkref_ents
+ enabled_idx
;
2048 if (entry
->node
== node
&& entry
->str_idx
== str_idx
)
2053 if (local_sctx
.sifted_states
!= NULL
)
2055 re_node_set_free (&local_sctx
.limits
);
2062 #ifdef RE_ENABLE_I18N
2064 sift_states_iter_mb (preg
, mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2065 const regex_t
*preg
;
2066 const re_match_context_t
*mctx
;
2067 re_sift_context_t
*sctx
;
2068 int node_idx
, str_idx
, max_str_idx
;
2070 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2072 /* Check the node can accept `multi byte'. */
2073 naccepted
= check_node_accept_bytes (preg
, node_idx
, mctx
->input
, str_idx
);
2074 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2075 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2076 dfa
->nexts
[node_idx
]))
2077 /* The node can't accept the `multi byte', or the
2078 destination was already throwed away, then the node
2079 could't accept the current input `multi byte'. */
2081 /* Otherwise, it is sure that the node could accept
2082 `naccepted' bytes input. */
2085 #endif /* RE_ENABLE_I18N */
2088 /* Functions for state transition. */
2090 /* Return the next state to which the current state STATE will transit by
2091 accepting the current input byte, and update STATE_LOG if necessary.
2092 If STATE can accept a multibyte char/collating element/back reference
2093 update the destination of STATE_LOG. */
2095 static re_dfastate_t
*
2096 transit_state (err
, preg
, mctx
, state
, fl_search
)
2098 const regex_t
*preg
;
2099 re_match_context_t
*mctx
;
2100 re_dfastate_t
*state
;
2103 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2104 re_dfastate_t
**trtable
, *next_state
;
2107 if (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->bufs_len
2108 || (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->valid_len
2109 && mctx
->input
->valid_len
< mctx
->input
->len
))
2111 *err
= extend_buffers (mctx
);
2112 if (BE (*err
!= REG_NOERROR
, 0))
2120 re_string_skip_bytes (mctx
->input
, 1);
2124 #ifdef RE_ENABLE_I18N
2125 /* If the current state can accept multibyte. */
2126 if (state
->accept_mb
)
2128 *err
= transit_state_mb (preg
, state
, mctx
);
2129 if (BE (*err
!= REG_NOERROR
, 0))
2132 #endif /* RE_ENABLE_I18N */
2134 /* Then decide the next state with the single byte. */
2137 /* Use transition table */
2138 ch
= re_string_fetch_byte (mctx
->input
);
2139 trtable
= fl_search
? state
->trtable_search
: state
->trtable
;
2140 if (trtable
== NULL
)
2142 trtable
= build_trtable (preg
, state
, fl_search
);
2144 state
->trtable_search
= trtable
;
2146 state
->trtable
= trtable
;
2148 next_state
= trtable
[ch
];
2152 /* don't use transition table */
2153 next_state
= transit_state_sb (err
, preg
, state
, fl_search
, mctx
);
2154 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
2159 /* Update the state_log if we need. */
2160 if (mctx
->state_log
!= NULL
)
2162 int cur_idx
= re_string_cur_idx (mctx
->input
);
2163 if (cur_idx
> mctx
->state_log_top
)
2165 mctx
->state_log
[cur_idx
] = next_state
;
2166 mctx
->state_log_top
= cur_idx
;
2168 else if (mctx
->state_log
[cur_idx
] == 0)
2170 mctx
->state_log
[cur_idx
] = next_state
;
2174 re_dfastate_t
*pstate
;
2175 unsigned int context
;
2176 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2177 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2178 the destination of a multibyte char/collating element/
2179 back reference. Then the next state is the union set of
2180 these destinations and the results of the transition table. */
2181 pstate
= mctx
->state_log
[cur_idx
];
2182 log_nodes
= pstate
->entrance_nodes
;
2183 if (next_state
!= NULL
)
2185 table_nodes
= next_state
->entrance_nodes
;
2186 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2188 if (BE (*err
!= REG_NOERROR
, 0))
2192 next_nodes
= *log_nodes
;
2193 /* Note: We already add the nodes of the initial state,
2194 then we don't need to add them here. */
2196 context
= re_string_context_at (mctx
->input
,
2197 re_string_cur_idx (mctx
->input
) - 1,
2198 mctx
->eflags
, preg
->newline_anchor
);
2199 next_state
= mctx
->state_log
[cur_idx
]
2200 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2201 /* We don't need to check errors here, since the return value of
2202 this function is next_state and ERR is already set. */
2204 if (table_nodes
!= NULL
)
2205 re_node_set_free (&next_nodes
);
2207 /* If the next state has back references. */
2208 if (next_state
!= NULL
&& next_state
->has_backref
)
2210 *err
= transit_state_bkref (preg
, next_state
, mctx
);
2211 if (BE (*err
!= REG_NOERROR
, 0))
2213 next_state
= mctx
->state_log
[cur_idx
];
2219 /* Helper functions for transit_state. */
2221 /* Return the next state to which the current state STATE will transit by
2222 accepting the current input byte. */
2224 static re_dfastate_t
*
2225 transit_state_sb (err
, preg
, state
, fl_search
, mctx
)
2227 const regex_t
*preg
;
2228 re_dfastate_t
*state
;
2230 re_match_context_t
*mctx
;
2232 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2233 re_node_set next_nodes
;
2234 re_dfastate_t
*next_state
;
2235 int node_cnt
, cur_str_idx
= re_string_cur_idx (mctx
->input
);
2236 unsigned int context
;
2238 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2239 if (BE (*err
!= REG_NOERROR
, 0))
2241 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2243 int cur_node
= state
->nodes
.elems
[node_cnt
];
2244 if (check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
, cur_str_idx
))
2246 *err
= re_node_set_merge (&next_nodes
,
2247 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2248 if (BE (*err
!= REG_NOERROR
, 0))
2254 #ifdef RE_ENABLE_I18N
2255 int not_initial
= 0;
2257 for (node_cnt
= 0; node_cnt
< next_nodes
.nelem
; ++node_cnt
)
2258 if (dfa
->nodes
[next_nodes
.elems
[node_cnt
]].type
== CHARACTER
)
2260 not_initial
= dfa
->nodes
[next_nodes
.elems
[node_cnt
]].mb_partial
;
2266 *err
= re_node_set_merge (&next_nodes
,
2267 dfa
->init_state
->entrance_nodes
);
2268 if (BE (*err
!= REG_NOERROR
, 0))
2272 context
= re_string_context_at (mctx
->input
, cur_str_idx
, mctx
->eflags
,
2273 preg
->newline_anchor
);
2274 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2275 /* We don't need to check errors here, since the return value of
2276 this function is next_state and ERR is already set. */
2278 re_node_set_free (&next_nodes
);
2279 re_string_skip_bytes (mctx
->input
, 1);
2283 #ifdef RE_ENABLE_I18N
2284 static reg_errcode_t
2285 transit_state_mb (preg
, pstate
, mctx
)
2286 const regex_t
*preg
;
2287 re_dfastate_t
*pstate
;
2288 re_match_context_t
*mctx
;
2291 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2294 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2296 re_node_set dest_nodes
, *new_nodes
;
2297 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2298 int naccepted
= 0, dest_idx
;
2299 unsigned int context
;
2300 re_dfastate_t
*dest_state
;
2302 if (dfa
->nodes
[cur_node_idx
].type
== OP_CONTEXT_NODE
)
2304 context
= re_string_context_at (mctx
->input
,
2305 re_string_cur_idx (mctx
->input
),
2306 mctx
->eflags
, preg
->newline_anchor
);
2307 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2310 cur_node_idx
= dfa
->nodes
[cur_node_idx
].opr
.ctx_info
->entity
;
2313 /* How many bytes the node can accepts? */
2314 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2315 naccepted
= check_node_accept_bytes (preg
, cur_node_idx
, mctx
->input
,
2316 re_string_cur_idx (mctx
->input
));
2320 /* The node can accepts `naccepted' bytes. */
2321 dest_idx
= re_string_cur_idx (mctx
->input
) + naccepted
;
2322 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2323 : mctx
->max_mb_elem_len
);
2324 err
= clean_state_log_if_need (mctx
, dest_idx
);
2325 if (BE (err
!= REG_NOERROR
, 0))
2328 assert (dfa
->nexts
[cur_node_idx
] != -1);
2330 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2331 then we use pstate->nodes.elems[i] instead. */
2332 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2334 dest_state
= mctx
->state_log
[dest_idx
];
2335 if (dest_state
== NULL
)
2336 dest_nodes
= *new_nodes
;
2339 err
= re_node_set_init_union (&dest_nodes
,
2340 dest_state
->entrance_nodes
, new_nodes
);
2341 if (BE (err
!= REG_NOERROR
, 0))
2344 context
= re_string_context_at (mctx
->input
, dest_idx
- 1, mctx
->eflags
,
2345 preg
->newline_anchor
);
2346 mctx
->state_log
[dest_idx
]
2347 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2348 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2350 if (dest_state
!= NULL
)
2351 re_node_set_free (&dest_nodes
);
2355 #endif /* RE_ENABLE_I18N */
2357 static reg_errcode_t
2358 transit_state_bkref (preg
, pstate
, mctx
)
2359 const regex_t
*preg
;
2360 re_dfastate_t
*pstate
;
2361 re_match_context_t
*mctx
;
2364 re_dfastate_t
**work_state_log
;
2366 work_state_log
= re_malloc (re_dfastate_t
*,
2367 re_string_cur_idx (mctx
->input
) + 1);
2368 if (BE (work_state_log
== NULL
, 0))
2371 err
= transit_state_bkref_loop (preg
, &pstate
->nodes
, work_state_log
, mctx
);
2372 re_free (work_state_log
);
2376 /* Caller must allocate `work_state_log'. */
2378 static reg_errcode_t
2379 transit_state_bkref_loop (preg
, nodes
, work_state_log
, mctx
)
2380 const regex_t
*preg
;
2382 re_dfastate_t
**work_state_log
;
2383 re_match_context_t
*mctx
;
2386 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2388 regmatch_t
*cur_regs
= re_malloc (regmatch_t
, preg
->re_nsub
+ 1);
2389 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
2390 if (BE (cur_regs
== NULL
, 0))
2393 for (i
= 0; i
< nodes
->nelem
; ++i
)
2395 int dest_str_idx
, subexp_idx
, prev_nelem
, bkc_idx
;
2396 int node_idx
= nodes
->elems
[i
];
2397 unsigned int context
;
2398 re_token_t
*node
= dfa
->nodes
+ node_idx
;
2399 re_node_set
*new_dest_nodes
;
2400 re_sift_context_t sctx
;
2402 /* Check whether `node' is a backreference or not. */
2403 if (node
->type
== OP_BACK_REF
)
2404 subexp_idx
= node
->opr
.idx
;
2405 else if (node
->type
== OP_CONTEXT_NODE
&&
2406 dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
== OP_BACK_REF
)
2408 context
= re_string_context_at (mctx
->input
, cur_str_idx
,
2409 mctx
->eflags
, preg
->newline_anchor
);
2410 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2412 subexp_idx
= dfa
->nodes
[node
->opr
.ctx_info
->entity
].opr
.idx
;
2417 /* `node' is a backreference.
2418 Check the substring which the substring matched. */
2419 sift_ctx_init (&sctx
, work_state_log
, NULL
, node_idx
, cur_str_idx
,
2421 sctx
.cur_bkref
= node_idx
;
2422 match_ctx_clear_flag (mctx
);
2423 err
= sift_states_backward (preg
, mctx
, &sctx
);
2424 if (BE (err
!= REG_NOERROR
, 0))
2427 /* And add the epsilon closures (which is `new_dest_nodes') of
2428 the backreference to appropriate state_log. */
2430 assert (dfa
->nexts
[node_idx
] != -1);
2432 for (bkc_idx
= 0; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2435 re_dfastate_t
*dest_state
;
2436 struct re_backref_cache_entry
*bkref_ent
;
2437 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2438 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2440 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2441 new_dest_nodes
= ((node
->type
== OP_CONTEXT_NODE
&& subexp_len
== 0)
2442 ? dfa
->nodes
[node_idx
].opr
.ctx_info
->bkref_eclosure
2443 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2444 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2445 - bkref_ent
->subexp_from
);
2446 context
= (IS_WORD_CHAR (re_string_byte_at (mctx
->input
,
2448 ? CONTEXT_WORD
: 0);
2449 dest_state
= mctx
->state_log
[dest_str_idx
];
2450 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2451 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2452 /* Add `new_dest_node' to state_log. */
2453 if (dest_state
== NULL
)
2455 mctx
->state_log
[dest_str_idx
]
2456 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2458 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2459 && err
!= REG_NOERROR
, 0))
2464 re_node_set dest_nodes
;
2465 err
= re_node_set_init_union (&dest_nodes
,
2466 dest_state
->entrance_nodes
,
2468 if (BE (err
!= REG_NOERROR
, 0))
2470 mctx
->state_log
[dest_str_idx
]
2471 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2472 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2473 && err
!= REG_NOERROR
, 0))
2475 re_node_set_free (&dest_nodes
);
2477 /* We need to check recursively if the backreference can epsilon
2480 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2482 err
= transit_state_bkref_loop (preg
, new_dest_nodes
,
2483 work_state_log
, mctx
);
2484 if (BE (err
!= REG_NOERROR
, 0))
2493 /* Build transition table for the state.
2494 Return the new table if succeeded, otherwise return NULL. */
2496 static re_dfastate_t
**
2497 build_trtable (preg
, state
, fl_search
)
2498 const regex_t
*preg
;
2499 const re_dfastate_t
*state
;
2503 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2505 int ndests
; /* Number of the destination states from `state'. */
2506 re_dfastate_t
**trtable
, **dest_states
, **dest_states_word
, **dest_states_nl
;
2507 re_node_set follows
, *dests_node
;
2511 /* We build DFA states which corresponds to the destination nodes
2512 from `state'. `dests_node[i]' represents the nodes which i-th
2513 destination state contains, and `dests_ch[i]' represents the
2514 characters which i-th destination state accepts. */
2515 dests_node
= re_malloc (re_node_set
, SBC_MAX
);
2516 dests_ch
= re_malloc (bitset
, SBC_MAX
);
2518 /* Initialize transiton table. */
2519 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
2520 if (BE (dests_node
== NULL
|| dests_ch
== NULL
|| trtable
== NULL
, 0))
2523 /* At first, group all nodes belonging to `state' into several
2525 ndests
= group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
);
2526 if (BE (ndests
<= 0, 0))
2528 re_free (dests_node
);
2530 /* Return NULL in case of an error, trtable otherwise. */
2531 return (ndests
< 0) ? NULL
: trtable
;
2534 dest_states
= re_malloc (re_dfastate_t
*, ndests
);
2535 dest_states_word
= re_malloc (re_dfastate_t
*, ndests
);
2536 dest_states_nl
= re_malloc (re_dfastate_t
*, ndests
);
2537 bitset_empty (acceptable
);
2539 err
= re_node_set_alloc (&follows
, ndests
+ 1);
2540 if (BE (dest_states
== NULL
|| dest_states_word
== NULL
2541 || dest_states_nl
== NULL
|| err
!= REG_NOERROR
, 0))
2544 /* Then build the states for all destinations. */
2545 for (i
= 0; i
< ndests
; ++i
)
2548 re_node_set_empty (&follows
);
2549 /* Merge the follows of this destination states. */
2550 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
2552 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
2553 if (next_node
!= -1)
2555 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
2556 if (BE (err
!= REG_NOERROR
, 0))
2560 /* If search flag is set, merge the initial state. */
2563 #ifdef RE_ENABLE_I18N
2564 int not_initial
= 0;
2565 for (j
= 0; j
< follows
.nelem
; ++j
)
2566 if (dfa
->nodes
[follows
.elems
[j
]].type
== CHARACTER
)
2568 not_initial
= dfa
->nodes
[follows
.elems
[j
]].mb_partial
;
2574 err
= re_node_set_merge (&follows
,
2575 dfa
->init_state
->entrance_nodes
);
2576 if (BE (err
!= REG_NOERROR
, 0))
2580 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
2581 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2583 /* If the new state has context constraint,
2584 build appropriate states for these contexts. */
2585 if (dest_states
[i
]->has_constraint
)
2587 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
2589 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2591 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
2593 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2598 dest_states_word
[i
] = dest_states
[i
];
2599 dest_states_nl
[i
] = dest_states
[i
];
2601 bitset_merge (acceptable
, dests_ch
[i
]);
2604 /* Update the transition table. */
2605 /* For all characters ch...: */
2606 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
2607 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
2608 if ((acceptable
[i
] >> j
) & 1)
2610 /* The current state accepts the character ch. */
2611 if (IS_WORD_CHAR (ch
))
2613 for (k
= 0; k
< ndests
; ++k
)
2614 if ((dests_ch
[k
][i
] >> j
) & 1)
2616 /* k-th destination accepts the word character ch. */
2617 trtable
[ch
] = dest_states_word
[k
];
2618 /* There must be only one destination which accepts
2619 character ch. See group_nodes_into_DFAstates. */
2623 else /* not WORD_CHAR */
2625 for (k
= 0; k
< ndests
; ++k
)
2626 if ((dests_ch
[k
][i
] >> j
) & 1)
2628 /* k-th destination accepts the non-word character ch. */
2629 trtable
[ch
] = dest_states
[k
];
2630 /* There must be only one destination which accepts
2631 character ch. See group_nodes_into_DFAstates. */
2637 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
2639 /* The current state accepts newline character. */
2640 for (k
= 0; k
< ndests
; ++k
)
2641 if (bitset_contain (dests_ch
[k
], NEWLINE_CHAR
))
2643 /* k-th destination accepts newline character. */
2644 trtable
[NEWLINE_CHAR
] = dest_states_nl
[k
];
2645 /* There must be only one destination which accepts
2646 newline. See group_nodes_into_DFAstates. */
2651 re_free (dest_states_nl
);
2652 re_free (dest_states_word
);
2653 re_free (dest_states
);
2655 re_node_set_free (&follows
);
2656 for (i
= 0; i
< ndests
; ++i
)
2657 re_node_set_free (dests_node
+ i
);
2660 re_free (dests_node
);
2665 /* Group all nodes belonging to STATE into several destinations.
2666 Then for all destinations, set the nodes belonging to the destination
2667 to DESTS_NODE[i] and set the characters accepted by the destination
2668 to DEST_CH[i]. This function return the number of destinations. */
2671 group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
)
2672 const regex_t
*preg
;
2673 const re_dfastate_t
*state
;
2674 re_node_set
*dests_node
;
2678 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2680 int ndests
; /* Number of the destinations from `state'. */
2681 bitset accepts
; /* Characters a node can accept. */
2682 const re_node_set
*cur_nodes
= &state
->nodes
;
2683 bitset_empty (accepts
);
2686 /* For all the nodes belonging to `state', */
2687 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
2689 unsigned int constraint
= 0;
2690 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
2691 re_token_type_t type
= node
->type
;
2693 if (type
== OP_CONTEXT_NODE
)
2695 constraint
= node
->constraint
;
2696 node
= dfa
->nodes
+ node
->opr
.ctx_info
->entity
;
2700 /* Enumerate all single byte character this node can accept. */
2701 if (type
== CHARACTER
)
2702 bitset_set (accepts
, node
->opr
.c
);
2703 else if (type
== SIMPLE_BRACKET
)
2705 bitset_merge (accepts
, node
->opr
.sbcset
);
2707 else if (type
== OP_PERIOD
)
2709 bitset_set_all (accepts
);
2710 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
2711 bitset_clear (accepts
, '\n');
2712 if (preg
->syntax
& RE_DOT_NOT_NULL
)
2713 bitset_clear (accepts
, '\0');
2718 /* Check the `accepts' and sift the characters which are not
2719 match it the context. */
2722 if (constraint
& NEXT_WORD_CONSTRAINT
)
2723 for (j
= 0; j
< BITSET_UINTS
; ++j
)
2724 accepts
[j
] &= dfa
->word_char
[j
];
2725 else if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
2726 for (j
= 0; j
< BITSET_UINTS
; ++j
)
2727 accepts
[j
] &= ~dfa
->word_char
[j
];
2728 else if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
2730 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
2731 bitset_empty (accepts
);
2732 if (accepts_newline
)
2733 bitset_set (accepts
, NEWLINE_CHAR
);
2739 /* Then divide `accepts' into DFA states, or create a new
2741 for (j
= 0; j
< ndests
; ++j
)
2743 bitset intersec
; /* Intersection sets, see below. */
2745 /* Flags, see below. */
2746 int has_intersec
, not_subset
, not_consumed
;
2748 /* Optimization, skip if this state doesn't accept the character. */
2749 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
2752 /* Enumerate the intersection set of this state and `accepts'. */
2754 for (k
= 0; k
< BITSET_UINTS
; ++k
)
2755 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
2756 /* And skip if the intersection set is empty. */
2760 /* Then check if this state is a subset of `accepts'. */
2761 not_subset
= not_consumed
= 0;
2762 for (k
= 0; k
< BITSET_UINTS
; ++k
)
2764 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
2765 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
2768 /* If this state isn't a subset of `accepts', create a
2769 new group state, which has the `remains'. */
2772 bitset_copy (dests_ch
[ndests
], remains
);
2773 bitset_copy (dests_ch
[j
], intersec
);
2774 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
2775 if (BE (err
!= REG_NOERROR
, 0))
2780 /* Put the position in the current group. */
2781 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
2782 if (BE (err
< 0, 0))
2785 /* If all characters are consumed, go to next node. */
2789 /* Some characters remain, create a new group. */
2792 bitset_copy (dests_ch
[ndests
], accepts
);
2793 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
2794 if (BE (err
!= REG_NOERROR
, 0))
2797 bitset_empty (accepts
);
2803 #ifdef RE_ENABLE_I18N
2804 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2805 Return the number of the bytes the node accepts.
2806 STR_IDX is the current index of the input string.
2808 This function handles the nodes which can accept one character, or
2809 one collating element like '.', '[a-z]', opposite to the other nodes
2810 can only accept one byte. */
2813 check_node_accept_bytes (preg
, node_idx
, input
, str_idx
)
2814 const regex_t
*preg
;
2815 int node_idx
, str_idx
;
2816 const re_string_t
*input
;
2818 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2819 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2820 int elem_len
= re_string_elem_size_at (input
, str_idx
);
2821 int char_len
= re_string_char_size_at (input
, str_idx
);
2825 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2827 if (elem_len
<= 1 && char_len
<= 1)
2829 if (node
->type
== OP_PERIOD
)
2831 /* '.' accepts any one character except the following two cases. */
2832 if ((!(preg
->syntax
& RE_DOT_NEWLINE
) &&
2833 re_string_byte_at (input
, str_idx
) == '\n') ||
2834 ((preg
->syntax
& RE_DOT_NOT_NULL
) &&
2835 re_string_byte_at (input
, str_idx
) == '\0'))
2839 else if (node
->type
== COMPLEX_BRACKET
)
2841 const re_charset_t
*cset
= node
->opr
.mbcset
;
2843 const unsigned char *pin
= re_string_get_buffer (input
) + str_idx
;
2846 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
2847 ? re_string_wchar_at (input
, str_idx
) : 0);
2849 /* match with multibyte character? */
2850 for (i
= 0; i
< cset
->nmbchars
; ++i
)
2851 if (wc
== cset
->mbchars
[i
])
2853 match_len
= char_len
;
2854 goto check_node_accept_bytes_match
;
2856 /* match with character_class? */
2857 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
2859 wctype_t wt
= cset
->char_classes
[i
];
2860 if (__iswctype (wc
, wt
))
2862 match_len
= char_len
;
2863 goto check_node_accept_bytes_match
;
2870 unsigned int in_collseq
= 0;
2871 const int32_t *table
, *indirect
;
2872 const unsigned char *weights
, *extra
;
2873 const char *collseqwc
;
2875 /* This #include defines a local function! */
2876 # include <locale/weight.h>
2878 /* match with collating_symbol? */
2879 if (cset
->ncoll_syms
)
2880 extra
= (const unsigned char *)
2881 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
2882 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
2884 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
2885 /* Compare the length of input collating element and
2886 the length of current collating element. */
2887 if (*coll_sym
!= elem_len
)
2889 /* Compare each bytes. */
2890 for (j
= 0; j
< *coll_sym
; j
++)
2891 if (pin
[j
] != coll_sym
[1 + j
])
2895 /* Match if every bytes is equal. */
2897 goto check_node_accept_bytes_match
;
2903 if (elem_len
<= char_len
)
2905 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2906 in_collseq
= collseq_table_lookup (collseqwc
, wc
);
2909 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
2911 /* match with range expression? */
2912 for (i
= 0; i
< cset
->nranges
; ++i
)
2913 if (cset
->range_starts
[i
] <= in_collseq
2914 && in_collseq
<= cset
->range_ends
[i
])
2916 match_len
= elem_len
;
2917 goto check_node_accept_bytes_match
;
2920 /* match with equivalence_class? */
2921 if (cset
->nequiv_classes
)
2923 const unsigned char *cp
= pin
;
2924 table
= (const int32_t *)
2925 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
2926 weights
= (const unsigned char *)
2927 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
2928 extra
= (const unsigned char *)
2929 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
2930 indirect
= (const int32_t *)
2931 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
2932 idx
= findidx (&cp
);
2934 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
2936 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
2937 size_t weight_len
= weights
[idx
];
2938 if (weight_len
== weights
[equiv_class_idx
])
2941 while (cnt
<= weight_len
2942 && (weights
[equiv_class_idx
+ 1 + cnt
]
2943 == weights
[idx
+ 1 + cnt
]))
2945 if (cnt
> weight_len
)
2947 match_len
= elem_len
;
2948 goto check_node_accept_bytes_match
;
2957 /* match with range expression? */
2959 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
2961 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2964 for (i
= 0; i
< cset
->nranges
; ++i
)
2966 cmp_buf
[0] = cset
->range_starts
[i
];
2967 cmp_buf
[4] = cset
->range_ends
[i
];
2968 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2969 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2971 match_len
= char_len
;
2972 goto check_node_accept_bytes_match
;
2976 check_node_accept_bytes_match
:
2977 if (!cset
->non_match
)
2984 return (elem_len
> char_len
) ? elem_len
: char_len
;
2992 find_collation_sequence_value (mbs
, mbs_len
)
2993 const unsigned char *mbs
;
2996 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3001 /* No valid character. Match it as a single byte character. */
3002 const unsigned char *collseq
= (const unsigned char *)
3003 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3004 return collseq
[mbs
[0]];
3011 const unsigned char *extra
= (const unsigned char *)
3012 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3016 int mbs_cnt
, found
= 0;
3017 int32_t elem_mbs_len
;
3018 /* Skip the name of collating element name. */
3019 idx
= idx
+ extra
[idx
] + 1;
3020 elem_mbs_len
= extra
[idx
++];
3021 if (mbs_len
== elem_mbs_len
)
3023 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3024 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3026 if (mbs_cnt
== elem_mbs_len
)
3027 /* Found the entry. */
3030 /* Skip the byte sequence of the collating element. */
3031 idx
+= elem_mbs_len
;
3032 /* Adjust for the alignment. */
3033 idx
= (idx
+ 3) & ~3;
3034 /* Skip the collation sequence value. */
3035 idx
+= sizeof (uint32_t);
3036 /* Skip the wide char sequence of the collating element. */
3037 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3038 /* If we found the entry, return the sequence value. */
3040 return *(uint32_t *) (extra
+ idx
);
3041 /* Skip the collation sequence value. */
3042 idx
+= sizeof (uint32_t);
3047 #endif /* RE_ENABLE_I18N */
3049 /* Check whether the node accepts the byte which is IDX-th
3050 byte of the INPUT. */
3053 check_node_accept (preg
, node
, mctx
, idx
)
3054 const regex_t
*preg
;
3055 const re_token_t
*node
;
3056 const re_match_context_t
*mctx
;
3059 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3060 const re_token_t
*cur_node
;
3062 if (node
->type
== OP_CONTEXT_NODE
)
3064 /* The node has constraints. Check whether the current context
3065 satisfies the constraints. */
3066 unsigned int context
= re_string_context_at (mctx
->input
, idx
,
3068 preg
->newline_anchor
);
3069 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3071 cur_node
= dfa
->nodes
+ node
->opr
.ctx_info
->entity
;
3076 ch
= re_string_byte_at (mctx
->input
, idx
);
3077 if (cur_node
->type
== CHARACTER
)
3078 return cur_node
->opr
.c
== ch
;
3079 else if (cur_node
->type
== SIMPLE_BRACKET
)
3080 return bitset_contain (cur_node
->opr
.sbcset
, ch
);
3081 else if (cur_node
->type
== OP_PERIOD
)
3082 return !((ch
== '\n' && !(preg
->syntax
& RE_DOT_NEWLINE
))
3083 || (ch
== '\0' && (preg
->syntax
& RE_DOT_NOT_NULL
)));
3088 /* Extend the buffers, if the buffers have run out. */
3090 static reg_errcode_t
3091 extend_buffers (mctx
)
3092 re_match_context_t
*mctx
;
3095 re_string_t
*pstr
= mctx
->input
;
3097 /* Double the lengthes of the buffers. */
3098 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3099 if (BE (ret
!= REG_NOERROR
, 0))
3102 if (mctx
->state_log
!= NULL
)
3104 /* And double the length of state_log. */
3105 mctx
->state_log
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3106 pstr
->bufs_len
* 2);
3107 if (BE (mctx
->state_log
== NULL
, 0))
3111 /* Then reconstruct the buffers. */
3114 #ifdef RE_ENABLE_I18N
3116 build_wcs_upper_buffer (pstr
);
3118 #endif /* RE_ENABLE_I18N */
3119 build_upper_buffer (pstr
);
3123 #ifdef RE_ENABLE_I18N
3125 build_wcs_buffer (pstr
);
3127 #endif /* RE_ENABLE_I18N */
3129 if (pstr
->trans
!= NULL
)
3130 re_string_translate_buffer (pstr
);
3132 pstr
->valid_len
= pstr
->bufs_len
;
3139 /* Functions for matching context. */
3141 static reg_errcode_t
3142 match_ctx_init (mctx
, eflags
, input
, n
)
3143 re_match_context_t
*mctx
;
3147 mctx
->eflags
= eflags
;
3148 mctx
->input
= input
;
3149 mctx
->match_last
= -1;
3152 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
3153 if (BE (mctx
->bkref_ents
== NULL
, 0))
3157 mctx
->bkref_ents
= NULL
;
3158 mctx
->nbkref_ents
= 0;
3159 mctx
->abkref_ents
= n
;
3160 mctx
->max_mb_elem_len
= 0;
3165 match_ctx_free (mctx
)
3166 re_match_context_t
*mctx
;
3168 re_free (mctx
->bkref_ents
);
3171 /* Add a new backreference entry to the cache. */
3173 static reg_errcode_t
3174 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
3175 re_match_context_t
*mctx
;
3176 int node
, str_idx
, from
, to
;
3178 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
3180 mctx
->bkref_ents
= re_realloc (mctx
->bkref_ents
,
3181 struct re_backref_cache_entry
,
3182 mctx
->abkref_ents
* 2);
3183 if (BE (mctx
->bkref_ents
== NULL
, 0))
3185 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
3186 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
3187 mctx
->abkref_ents
*= 2;
3189 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
3190 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
3191 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
3192 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
3193 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
3194 if (mctx
->max_mb_elem_len
< to
- from
)
3195 mctx
->max_mb_elem_len
= to
- from
;
3200 match_ctx_clear_flag (mctx
)
3201 re_match_context_t
*mctx
;
3204 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
3206 mctx
->bkref_ents
[i
].flag
= 0;
3211 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
3213 re_sift_context_t
*sctx
;
3214 re_dfastate_t
**sifted_sts
, **limited_sts
;
3215 int last_node
, last_str_idx
, check_subexp
;
3217 sctx
->sifted_states
= sifted_sts
;
3218 sctx
->limited_states
= limited_sts
;
3219 sctx
->last_node
= last_node
;
3220 sctx
->last_str_idx
= last_str_idx
;
3221 sctx
->check_subexp
= check_subexp
;
3222 re_node_set_init_empty (&sctx
->limits
);