1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 #if defined HAVE_WCHAR_H || defined _LIBC
29 #endif /* HAVE_WCHAR_H || _LIBC */
30 #if defined HAVE_WCTYPE_H || defined _LIBC
32 #endif /* HAVE_WCTYPE_H || _LIBC */
35 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
36 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
37 # include <locale/localeinfo.h>
38 # include <locale/elem-hash.h>
39 # include <locale/coll-lookup.h>
44 #include "regex_internal.h"
46 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
47 re_string_t
*input
, int n
);
48 static void match_ctx_free (re_match_context_t
*cache
);
49 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
50 int str_idx
, int from
, int to
);
51 static void match_ctx_clear_flag (re_match_context_t
*mctx
);
52 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
53 re_dfastate_t
**limited_sts
, int last_node
,
54 int last_str_idx
, int check_subexp
);
55 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
56 const char *string
, int length
,
57 int start
, int range
, int stop
,
58 size_t nmatch
, regmatch_t pmatch
[],
60 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
61 const char *string1
, int length1
,
62 const char *string2
, int length2
,
63 int start
, int range
, struct re_registers
*regs
,
64 int stop
, int ret_len
);
65 static int re_search_stub (struct re_pattern_buffer
*bufp
,
66 const char *string
, int length
, int start
,
67 int range
, int stop
, struct re_registers
*regs
,
69 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
70 int nregs
, int regs_allocated
);
71 static inline re_dfastate_t
*acquire_init_state_context (reg_errcode_t
*err
,
73 const re_match_context_t
*mctx
,
75 static int check_matching (const regex_t
*preg
, re_match_context_t
*mctx
,
76 int fl_search
, int fl_longest_match
);
77 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
78 unsigned int context
);
79 static int check_halt_state_context (const regex_t
*preg
,
80 const re_dfastate_t
*state
,
81 const re_match_context_t
*mctx
, int idx
);
82 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
, int cur_node
,
83 int cur_idx
, int nmatch
);
84 static int proceed_next_node (const regex_t
*preg
, int nregs
, regmatch_t
*regs
,
85 const re_match_context_t
*mctx
,
86 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
87 struct re_fail_stack_t
*fs
);
88 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
89 int str_idx
, int *dests
, int nregs
,
91 re_node_set
*eps_via_nodes
);
92 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
93 regmatch_t
*regs
, re_node_set
*eps_via_nodes
);
94 static reg_errcode_t
set_regs (const regex_t
*preg
,
95 const re_match_context_t
*mctx
,
96 size_t nmatch
, regmatch_t
*pmatch
,
98 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
);
100 #ifdef RE_ENABLE_I18N
101 static int sift_states_iter_mb (const regex_t
*preg
,
102 const re_match_context_t
*mctx
,
103 re_sift_context_t
*sctx
,
104 int node_idx
, int str_idx
, int max_str_idx
);
105 #endif /* RE_ENABLE_I18N */
106 static reg_errcode_t
sift_states_backward (const regex_t
*preg
,
107 re_match_context_t
*mctx
,
108 re_sift_context_t
*sctx
);
109 static reg_errcode_t
update_cur_sifted_state (const regex_t
*preg
,
110 re_match_context_t
*mctx
,
111 re_sift_context_t
*sctx
,
113 re_node_set
*dest_nodes
);
114 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
115 re_node_set
*dest_nodes
,
116 const re_node_set
*candidates
);
117 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
118 re_node_set
*dest_nodes
,
119 const re_node_set
*and_nodes
);
120 static int check_dst_limits (re_dfa_t
*dfa
, re_node_set
*limits
,
121 re_match_context_t
*mctx
, int dst_node
,
122 int dst_idx
, int src_node
, int src_idx
);
123 static int check_dst_limits_calc_pos (re_dfa_t
*dfa
, re_match_context_t
*mctx
,
124 int limit
, re_node_set
*eclosures
,
125 int subexp_idx
, int node
, int str_idx
);
126 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
127 re_node_set
*dest_nodes
,
128 const re_node_set
*candidates
,
130 struct re_backref_cache_entry
*bkref_ents
,
132 static reg_errcode_t
search_subexp (const regex_t
*preg
,
133 re_match_context_t
*mctx
,
134 re_sift_context_t
*sctx
, int str_idx
,
135 re_node_set
*dest_nodes
);
136 static reg_errcode_t
sift_states_bkref (const regex_t
*preg
,
137 re_match_context_t
*mctx
,
138 re_sift_context_t
*sctx
,
139 int str_idx
, re_node_set
*dest_nodes
);
140 static reg_errcode_t
clean_state_log_if_need (re_match_context_t
*mctx
,
141 int next_state_log_idx
);
142 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
143 re_dfastate_t
**src
, int num
);
144 static re_dfastate_t
*transit_state (reg_errcode_t
*err
, const regex_t
*preg
,
145 re_match_context_t
*mctx
,
146 re_dfastate_t
*state
, int fl_search
);
147 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
, const regex_t
*preg
,
148 re_dfastate_t
*pstate
,
150 re_match_context_t
*mctx
);
151 #ifdef RE_ENABLE_I18N
152 static reg_errcode_t
transit_state_mb (const regex_t
*preg
,
153 re_dfastate_t
*pstate
,
154 re_match_context_t
*mctx
);
155 #endif /* RE_ENABLE_I18N */
156 static reg_errcode_t
transit_state_bkref (const regex_t
*preg
,
157 re_dfastate_t
*pstate
,
158 re_match_context_t
*mctx
);
159 static reg_errcode_t
transit_state_bkref_loop (const regex_t
*preg
,
161 re_dfastate_t
**work_state_log
,
162 re_match_context_t
*mctx
);
163 static re_dfastate_t
**build_trtable (const regex_t
*dfa
,
164 const re_dfastate_t
*state
,
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (const regex_t
*preg
, int node_idx
,
168 const re_string_t
*input
, int idx
);
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
173 #endif /* RE_ENABLE_I18N */
174 static int group_nodes_into_DFAstates (const regex_t
*dfa
,
175 const re_dfastate_t
*state
,
176 re_node_set
*states_node
,
178 static int check_node_accept (const regex_t
*preg
, const re_token_t
*node
,
179 const re_match_context_t
*mctx
, int idx
);
180 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
);
182 /* Entry point for POSIX code. */
184 /* regexec searches for a given pattern, specified by PREG, in the
187 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
188 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
189 least NMATCH elements, and we set them to the offsets of the
190 corresponding matched substrings.
192 EFLAGS specifies `execution flags' which affect matching: if
193 REG_NOTBOL is set, then ^ does not match at the beginning of the
194 string; if REG_NOTEOL is set, then $ does not match at the end.
196 We return 0 if we find a match and REG_NOMATCH if not. */
199 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
200 const regex_t
*__restrict preg
;
201 const char *__restrict string
;
207 int length
= strlen (string
);
209 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
212 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
214 return err
!= REG_NOERROR
;
217 weak_alias (__regexec
, regexec
)
220 /* Entry points for GNU code. */
222 /* re_match, re_search, re_match_2, re_search_2
224 The former two functions operate on STRING with length LENGTH,
225 while the later two operate on concatenation of STRING1 and STRING2
226 with lengths LENGTH1 and LENGTH2, respectively.
228 re_match() matches the compiled pattern in BUFP against the string,
229 starting at index START.
231 re_search() first tries matching at index START, then it tries to match
232 starting from index START + 1, and so on. The last start position tried
233 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
236 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
237 the first STOP characters of the concatenation of the strings should be
240 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
241 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
242 computed relative to the concatenation, not relative to the individual
245 On success, re_match* functions return the length of the match, re_search*
246 return the position of the start of the match. Return value -1 means no
247 match was found and -2 indicates an internal error. */
250 re_match (bufp
, string
, length
, start
, regs
)
251 struct re_pattern_buffer
*bufp
;
254 struct re_registers
*regs
;
256 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
259 weak_alias (__re_match
, re_match
)
263 re_search (bufp
, string
, length
, start
, range
, regs
)
264 struct re_pattern_buffer
*bufp
;
266 int length
, start
, range
;
267 struct re_registers
*regs
;
269 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
272 weak_alias (__re_search
, re_search
)
276 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
277 struct re_pattern_buffer
*bufp
;
278 const char *string1
, *string2
;
279 int length1
, length2
, start
, stop
;
280 struct re_registers
*regs
;
282 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
283 start
, 0, regs
, stop
, 1);
286 weak_alias (__re_match_2
, re_match_2
)
290 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
291 struct re_pattern_buffer
*bufp
;
292 const char *string1
, *string2
;
293 int length1
, length2
, start
, range
, stop
;
294 struct re_registers
*regs
;
296 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
297 start
, range
, regs
, stop
, 0);
300 weak_alias (__re_search_2
, re_search_2
)
304 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
306 struct re_pattern_buffer
*bufp
;
307 const char *string1
, *string2
;
308 int length1
, length2
, start
, range
, stop
, ret_len
;
309 struct re_registers
*regs
;
313 int len
= length1
+ length2
;
316 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
319 /* Concatenate the strings. */
323 char *s
= re_malloc (char, len
);
325 if (BE (s
== NULL
, 0))
327 memcpy (s
, string1
, length1
);
328 memcpy (s
+ length1
, string2
, length2
);
337 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
340 re_free ((char *) str
);
344 /* The parameters have the same meaning as those of re_search.
345 Additional parameters:
346 If RET_LEN is nonzero the length of the match is returned (re_match style);
347 otherwise the position of the match is returned. */
350 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
351 struct re_pattern_buffer
*bufp
;
353 int length
, start
, range
, stop
, ret_len
;
354 struct re_registers
*regs
;
356 reg_errcode_t result
;
361 /* Check for out-of-range. */
362 if (BE (start
< 0 || start
> length
, 0))
364 if (BE (start
+ range
> length
, 0))
365 range
= length
- start
;
366 else if (BE (start
+ range
< 0, 0))
369 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
370 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
372 /* Compile fastmap if we haven't yet. */
373 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
374 re_compile_fastmap (bufp
);
376 if (BE (bufp
->no_sub
, 0))
379 /* We need at least 1 register. */
382 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
383 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
385 nregs
= regs
->num_regs
;
386 if (BE (nregs
< 1, 0))
388 /* Nothing can be copied to regs. */
394 nregs
= bufp
->re_nsub
+ 1;
395 pmatch
= re_malloc (regmatch_t
, nregs
);
396 if (BE (pmatch
== NULL
, 0))
399 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
400 nregs
, pmatch
, eflags
);
404 /* I hope we needn't fill ther regs with -1's when no match was found. */
405 if (result
!= REG_NOERROR
)
407 else if (regs
!= NULL
)
409 /* If caller wants register contents data back, copy them. */
410 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
411 bufp
->regs_allocated
);
412 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
416 if (BE (rval
== 0, 1))
420 assert (pmatch
[0].rm_so
== start
);
421 rval
= pmatch
[0].rm_eo
- start
;
424 rval
= pmatch
[0].rm_so
;
431 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
432 struct re_registers
*regs
;
434 int nregs
, regs_allocated
;
436 int rval
= REGS_REALLOCATE
;
438 int need_regs
= nregs
+ 1;
439 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
442 /* Have the register data arrays been allocated? */
443 if (regs_allocated
== REGS_UNALLOCATED
)
444 { /* No. So allocate them with malloc. */
445 regs
->start
= re_malloc (regoff_t
, need_regs
);
446 if (BE (regs
->start
== NULL
, 0))
447 return REGS_UNALLOCATED
;
448 regs
->end
= re_malloc (regoff_t
, need_regs
);
449 if (BE (regs
->end
== NULL
, 0))
451 re_free (regs
->start
);
452 return REGS_UNALLOCATED
;
454 regs
->num_regs
= need_regs
;
456 else if (regs_allocated
== REGS_REALLOCATE
)
457 { /* Yes. If we need more elements than were already
458 allocated, reallocate them. If we need fewer, just
460 if (need_regs
> regs
->num_regs
)
462 regs
->start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
463 if (BE (regs
->start
== NULL
, 0))
465 if (regs
->end
!= NULL
)
467 return REGS_UNALLOCATED
;
469 regs
->end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
470 if (BE (regs
->end
== NULL
, 0))
472 re_free (regs
->start
);
473 return REGS_UNALLOCATED
;
475 regs
->num_regs
= need_regs
;
480 assert (regs_allocated
== REGS_FIXED
);
481 /* This function may not be called with REGS_FIXED and nregs too big. */
482 assert (regs
->num_regs
>= nregs
);
487 for (i
= 0; i
< nregs
; ++i
)
489 regs
->start
[i
] = pmatch
[i
].rm_so
;
490 regs
->end
[i
] = pmatch
[i
].rm_eo
;
492 for ( ; i
< regs
->num_regs
; ++i
)
493 regs
->start
[i
] = regs
->end
[i
] = -1;
498 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
499 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
500 this memory for recording register information. STARTS and ENDS
501 must be allocated using the malloc library routine, and must each
502 be at least NUM_REGS * sizeof (regoff_t) bytes long.
504 If NUM_REGS == 0, then subsequent matches should allocate their own
507 Unless this function is called, the first search or match using
508 PATTERN_BUFFER will allocate its own register data, without
509 freeing the old data. */
512 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
513 struct re_pattern_buffer
*bufp
;
514 struct re_registers
*regs
;
516 regoff_t
*starts
, *ends
;
520 bufp
->regs_allocated
= REGS_REALLOCATE
;
521 regs
->num_regs
= num_regs
;
522 regs
->start
= starts
;
527 bufp
->regs_allocated
= REGS_UNALLOCATED
;
529 regs
->start
= regs
->end
= (regoff_t
*) 0;
533 weak_alias (__re_set_registers
, re_set_registers
)
536 /* Entry points compatible with 4.2 BSD regex library. We don't define
537 them unless specifically requested. */
539 #if defined _REGEX_RE_COMP || defined _LIBC
547 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
549 #endif /* _REGEX_RE_COMP */
551 static re_node_set empty_set
;
553 /* Internal entry point. */
555 /* Searches for a compiled pattern PREG in the string STRING, whose
556 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
557 mingings with regexec. START, and RANGE have the same meanings
559 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
560 otherwise return the error code.
561 Note: We assume front end functions already check ranges.
562 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
565 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
569 int length
, start
, range
, stop
, eflags
;
574 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
576 int left_lim
, right_lim
, incr
;
577 int fl_longest_match
, match_first
, match_last
= -1;
578 int fast_translate
, sb
;
579 re_match_context_t mctx
;
580 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
581 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
583 /* Check if the DFA haven't been compiled. */
584 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
585 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
586 || dfa
->init_state_begbuf
== NULL
, 0))
589 re_node_set_init_empty (&empty_set
);
590 memset (&mctx
, '\0', sizeof (re_match_context_t
));
592 /* We must check the longest matching, if nmatch > 0. */
593 fl_longest_match
= (nmatch
!= 0);
595 err
= re_string_allocate (&input
, string
, length
, dfa
->nodes_len
+ 1,
596 preg
->translate
, preg
->syntax
& RE_ICASE
);
597 if (BE (err
!= REG_NOERROR
, 0))
601 err
= match_ctx_init (&mctx
, eflags
, &input
, dfa
->nbackref
* 2);
602 if (BE (err
!= REG_NOERROR
, 0))
605 /* We will log all the DFA states through which the dfa pass,
606 if nmatch > 1, or this dfa has "multibyte node", which is a
607 back-reference or a node which can accept multibyte character or
608 multi character collating element. */
609 if (nmatch
> 1 || dfa
->has_mb_node
)
611 mctx
.state_log
= re_malloc (re_dfastate_t
*, dfa
->nodes_len
+ 1);
612 if (BE (mctx
.state_log
== NULL
, 0))
619 mctx
.state_log
= NULL
;
622 /* We assume front-end functions already check them. */
623 assert (start
+ range
>= 0 && start
+ range
<= length
);
627 input
.tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
628 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
630 /* Check incrementally whether of not the input string match. */
631 incr
= (range
< 0) ? -1 : 1;
632 left_lim
= (range
< 0) ? start
+ range
: start
;
633 right_lim
= (range
< 0) ? start
: start
+ range
;
634 sb
= MB_CUR_MAX
== 1;
635 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
639 /* At first get the current byte from input string. */
642 if (BE (fast_translate
, 1))
644 unsigned RE_TRANSLATE_TYPE t
645 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
646 if (BE (range
>= 0, 1))
648 if (BE (t
!= NULL
, 0))
650 while (BE (match_first
< right_lim
, 1)
651 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
656 while (BE (match_first
< right_lim
, 1)
657 && !fastmap
[(unsigned char) string
[match_first
]])
660 if (BE (match_first
== right_lim
, 0))
662 int ch
= match_first
>= length
663 ? 0 : (unsigned char) string
[match_first
];
664 if (!fastmap
[t
? t
[ch
] : ch
])
670 while (match_first
>= left_lim
)
672 int ch
= match_first
>= length
673 ? 0 : (unsigned char) string
[match_first
];
674 if (fastmap
[t
? t
[ch
] : ch
])
678 if (match_first
< left_lim
)
688 /* In this case, we can't determine easily the current byte,
689 since it might be a component byte of a multibyte
690 character. Then we use the constructed buffer
692 /* If MATCH_FIRST is out of the valid range, reconstruct the
694 if (input
.raw_mbs_idx
+ input
.valid_len
<= match_first
695 || match_first
< input
.raw_mbs_idx
)
697 err
= re_string_reconstruct (&input
, match_first
, eflags
,
698 preg
->newline_anchor
);
699 if (BE (err
!= REG_NOERROR
, 0))
702 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
703 Note that MATCH_FIRST must not be smaller than 0. */
704 ch
= ((match_first
>= length
) ? 0
705 : re_string_byte_at (&input
,
706 match_first
- input
.raw_mbs_idx
));
711 while (match_first
>= left_lim
&& match_first
<= right_lim
);
717 /* Reconstruct the buffers so that the matcher can assume that
718 the matching starts from the begining of the buffer. */
719 err
= re_string_reconstruct (&input
, match_first
, eflags
,
720 preg
->newline_anchor
);
721 if (BE (err
!= REG_NOERROR
, 0))
723 #ifdef RE_ENABLE_I18N
724 /* Eliminate it when it is a component of a multibyte character
725 and isn't the head of a multibyte character. */
726 if (sb
|| re_string_first_byte (&input
, 0))
729 /* It seems to be appropriate one, then use the matcher. */
730 /* We assume that the matching starts from 0. */
731 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
732 match_last
= check_matching (preg
, &mctx
, 0, fl_longest_match
);
733 if (match_last
!= -1)
735 if (BE (match_last
== -2, 0))
741 break; /* We found a matching. */
745 /* Update counter. */
747 if (match_first
< left_lim
|| right_lim
< match_first
)
751 /* Set pmatch[] if we need. */
752 if (match_last
!= -1 && nmatch
> 0)
756 /* Initialize registers. */
757 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
758 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
760 /* Set the points where matching start/end. */
762 mctx
.match_last
= pmatch
[0].rm_eo
= match_last
;
764 if (!preg
->no_sub
&& nmatch
> 1)
766 /* We need the ranges of all the subexpressions. */
768 re_dfastate_t
**sifted_states
;
769 re_dfastate_t
**lim_states
= NULL
;
770 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
771 re_sift_context_t sctx
;
773 assert (mctx
.state_log
!= NULL
);
775 halt_node
= check_halt_state_context (preg
, pstate
, &mctx
,
777 if (dfa
->has_plural_match
)
779 match_ctx_clear_flag (&mctx
);
780 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
781 if (BE (sifted_states
== NULL
, 0))
788 lim_states
= calloc (sizeof (re_dfastate_t
*),
790 if (BE (lim_states
== NULL
, 0))
792 re_free (sifted_states
);
797 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
799 err
= sift_states_backward (preg
, &mctx
, &sctx
);
800 re_node_set_free (&sctx
.limits
);
801 if (BE (err
!= REG_NOERROR
, 0))
803 re_free (sifted_states
);
804 re_free (lim_states
);
807 if (lim_states
!= NULL
)
809 err
= merge_state_array (dfa
, sifted_states
, lim_states
,
811 re_free (lim_states
);
812 if (BE (err
!= REG_NOERROR
, 0))
814 re_free (sifted_states
);
818 re_free (mctx
.state_log
);
819 mctx
.state_log
= sifted_states
;
821 mctx
.last_node
= halt_node
;
822 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
823 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
824 if (BE (err
!= REG_NOERROR
, 0))
828 /* At last, add the offset to the each registers, since we slided
829 the buffers so that We can assume that the matching starts from 0. */
830 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
831 if (pmatch
[reg_idx
].rm_so
!= -1)
833 pmatch
[reg_idx
].rm_so
+= match_first
;
834 pmatch
[reg_idx
].rm_eo
+= match_first
;
837 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
839 re_free (mctx
.state_log
);
841 match_ctx_free (&mctx
);
842 re_string_destruct (&input
);
846 /* Acquire an initial state and return it.
847 We must select appropriate initial state depending on the context,
848 since initial states may have constraints like "\<", "^", etc.. */
850 static inline re_dfastate_t
*
851 acquire_init_state_context (err
, preg
, mctx
, idx
)
854 const re_match_context_t
*mctx
;
857 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
860 if (dfa
->init_state
->has_constraint
)
862 unsigned int context
;
863 context
= re_string_context_at (mctx
->input
, idx
- 1, mctx
->eflags
,
864 preg
->newline_anchor
);
865 if (IS_WORD_CONTEXT (context
))
866 return dfa
->init_state_word
;
867 else if (IS_ORDINARY_CONTEXT (context
))
868 return dfa
->init_state
;
869 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
870 return dfa
->init_state_begbuf
;
871 else if (IS_NEWLINE_CONTEXT (context
))
872 return dfa
->init_state_nl
;
873 else if (IS_BEGBUF_CONTEXT (context
))
875 /* It is relatively rare case, then calculate on demand. */
876 return re_acquire_state_context (err
, dfa
,
877 dfa
->init_state
->entrance_nodes
,
881 /* Must not happen? */
882 return dfa
->init_state
;
885 return dfa
->init_state
;
888 /* Check whether the regular expression match input string INPUT or not,
889 and return the index where the matching end, return -1 if not match,
890 or return -2 in case of an error.
891 FL_SEARCH means we must search where the matching starts,
892 FL_LONGEST_MATCH means we want the POSIX longest matching.
893 Note that the matcher assume that the maching starts from the current
894 index of the buffer. */
897 check_matching (preg
, mctx
, fl_search
, fl_longest_match
)
899 re_match_context_t
*mctx
;
900 int fl_search
, fl_longest_match
;
905 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
906 re_dfastate_t
*cur_state
;
908 cur_state
= acquire_init_state_context (&err
, preg
, mctx
, cur_str_idx
);
909 /* An initial state must not be NULL(invalid state). */
910 if (BE (cur_state
== NULL
, 0))
912 if (mctx
->state_log
!= NULL
)
913 mctx
->state_log
[cur_str_idx
] = cur_state
;
915 if (cur_state
->has_backref
)
918 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
919 for (i
= 0; i
< cur_state
->nodes
.nelem
; ++i
)
921 int node
= cur_state
->nodes
.elems
[i
];
922 re_token_type_t type
= dfa
->nodes
[node
].type
;
923 if (type
== OP_BACK_REF
)
926 for (clexp_idx
= 0; clexp_idx
< cur_state
->nodes
.nelem
;
929 re_token_t
*clexp_node
;
930 clexp_node
= dfa
->nodes
+ cur_state
->nodes
.elems
[clexp_idx
];
931 if (clexp_node
->type
== OP_CLOSE_SUBEXP
932 && clexp_node
->opr
.idx
+ 1== dfa
->nodes
[node
].opr
.idx
)
934 err
= match_ctx_add_entry (mctx
, node
, 0, 0, 0);
935 if (BE (err
!= REG_NOERROR
, 0))
944 /* If the RE accepts NULL string. */
947 if (!cur_state
->has_constraint
948 || check_halt_state_context (preg
, cur_state
, mctx
, cur_str_idx
))
950 if (!fl_longest_match
)
954 match_last
= cur_str_idx
;
960 while (!re_string_eoi (mctx
->input
))
962 cur_state
= transit_state (&err
, preg
, mctx
, cur_state
,
963 fl_search
&& !match
);
964 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
966 cur_str_idx
= re_string_cur_idx (mctx
->input
);
967 if (BE (err
!= REG_NOERROR
, 0))
969 if (fl_search
&& !match
)
971 /* Restart from initial state, since we are searching
972 the point from where matching start. */
973 #ifdef RE_ENABLE_I18N
975 || re_string_first_byte (mctx
->input
, cur_str_idx
))
976 #endif /* RE_ENABLE_I18N */
977 cur_state
= acquire_init_state_context (&err
, preg
, mctx
,
979 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
981 if (mctx
->state_log
!= NULL
)
982 mctx
->state_log
[cur_str_idx
] = cur_state
;
984 else if (!fl_longest_match
&& match
)
986 else /* (fl_longest_match && match) || (!fl_search && !match) */
988 if (mctx
->state_log
== NULL
)
992 int max
= mctx
->state_log_top
;
993 for (; cur_str_idx
<= max
; ++cur_str_idx
)
994 if (mctx
->state_log
[cur_str_idx
] != NULL
)
996 if (cur_str_idx
> max
)
1002 if (cur_state
!= NULL
&& cur_state
->halt
)
1004 /* Reached at a halt state.
1005 Check the halt state can satisfy the current context. */
1006 if (!cur_state
->has_constraint
1007 || check_halt_state_context (preg
, cur_state
, mctx
,
1008 re_string_cur_idx (mctx
->input
)))
1010 /* We found an appropriate halt state. */
1011 match_last
= re_string_cur_idx (mctx
->input
);
1013 if (!fl_longest_match
)
1021 /* Check NODE match the current context. */
1023 static int check_halt_node_context (dfa
, node
, context
)
1024 const re_dfa_t
*dfa
;
1026 unsigned int context
;
1028 re_token_type_t type
= dfa
->nodes
[node
].type
;
1029 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1030 if (type
!= END_OF_RE
)
1034 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1039 /* Check the halt state STATE match the current context.
1040 Return 0 if not match, if the node, STATE has, is a halt node and
1041 match the context, return the node. */
1044 check_halt_state_context (preg
, state
, mctx
, idx
)
1045 const regex_t
*preg
;
1046 const re_dfastate_t
*state
;
1047 const re_match_context_t
*mctx
;
1050 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1052 unsigned int context
;
1054 assert (state
->halt
);
1056 context
= re_string_context_at (mctx
->input
, idx
, mctx
->eflags
,
1057 preg
->newline_anchor
);
1058 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1059 if (check_halt_node_context (dfa
, state
->nodes
.elems
[i
], context
))
1060 return state
->nodes
.elems
[i
];
1064 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1065 corresponding to the DFA).
1066 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1070 proceed_next_node (preg
, nregs
, regs
, mctx
, pidx
, node
, eps_via_nodes
, fs
)
1071 const regex_t
*preg
;
1073 const re_match_context_t
*mctx
;
1074 int nregs
, *pidx
, node
;
1075 re_node_set
*eps_via_nodes
;
1076 struct re_fail_stack_t
*fs
;
1078 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1079 int i
, err
, dest_node
;
1081 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1083 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1084 int ndest
, dest_nodes
[2];
1085 err
= re_node_set_insert (eps_via_nodes
, node
);
1086 if (BE (err
< 0, 0))
1088 /* Pick up valid destinations. */
1089 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1091 int candidate
= dfa
->edests
[node
].elems
[i
];
1092 if (!re_node_set_contains (cur_nodes
, candidate
))
1094 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1095 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1099 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1100 /* In order to avoid infinite loop like "(a*)*". */
1101 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1102 return dest_nodes
[1];
1104 push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
, eps_via_nodes
);
1105 return dest_nodes
[0];
1110 re_token_type_t type
= dfa
->nodes
[node
].type
;
1112 #ifdef RE_ENABLE_I18N
1113 if (ACCEPT_MB_NODE (type
))
1114 naccepted
= check_node_accept_bytes (preg
, node
, mctx
->input
, *pidx
);
1116 #endif /* RE_ENABLE_I18N */
1117 if (type
== OP_BACK_REF
)
1119 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1120 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1123 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1127 char *buf
= re_string_get_buffer (mctx
->input
);
1128 if (strncmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1136 err
= re_node_set_insert (eps_via_nodes
, node
);
1137 if (BE (err
< 0, 0))
1139 dest_node
= dfa
->edests
[node
].elems
[0];
1140 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1147 || check_node_accept (preg
, dfa
->nodes
+ node
, mctx
, *pidx
))
1149 dest_node
= dfa
->nexts
[node
];
1150 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1151 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1152 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1155 re_node_set_empty (eps_via_nodes
);
1162 static reg_errcode_t
1163 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1164 struct re_fail_stack_t
*fs
;
1165 int str_idx
, *dests
, nregs
;
1167 re_node_set
*eps_via_nodes
;
1170 int num
= fs
->num
++;
1171 if (fs
->num
== fs
->alloc
)
1173 struct re_fail_stack_ent_t
*new_array
;
1175 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1177 if (new_array
== NULL
)
1179 fs
->stack
= new_array
;
1181 fs
->stack
[num
].idx
= str_idx
;
1182 fs
->stack
[num
].node
= dests
[1];
1183 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1184 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1185 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1190 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1191 struct re_fail_stack_t
*fs
;
1194 re_node_set
*eps_via_nodes
;
1196 int num
= --fs
->num
;
1198 *pidx
= fs
->stack
[num
].idx
;
1199 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1200 re_node_set_free (eps_via_nodes
);
1201 re_free (fs
->stack
[num
].regs
);
1202 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1203 return fs
->stack
[num
].node
;
1206 /* Set the positions where the subexpressions are starts/ends to registers
1208 Note: We assume that pmatch[0] is already set, and
1209 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1211 static reg_errcode_t
1212 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1213 const regex_t
*preg
;
1214 const re_match_context_t
*mctx
;
1219 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1220 int idx
, cur_node
, real_nmatch
;
1221 re_node_set eps_via_nodes
;
1222 struct re_fail_stack_t
*fs
;
1223 struct re_fail_stack_t fs_body
= {0, 2, NULL
};
1225 assert (nmatch
> 1);
1226 assert (mctx
->state_log
!= NULL
);
1231 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1235 cur_node
= dfa
->init_node
;
1236 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1237 re_node_set_init_empty (&eps_via_nodes
);
1238 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1240 update_regs (dfa
, pmatch
, cur_node
, idx
, real_nmatch
);
1241 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1246 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1247 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1249 if (reg_idx
== nmatch
)
1251 re_node_set_free (&eps_via_nodes
);
1252 return free_fail_stack_return (fs
);
1254 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1259 re_node_set_free (&eps_via_nodes
);
1264 /* Proceed to next node. */
1265 cur_node
= proceed_next_node (preg
, nmatch
, pmatch
, mctx
, &idx
, cur_node
,
1266 &eps_via_nodes
, fs
);
1268 if (BE (cur_node
< 0, 0))
1273 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1277 re_node_set_free (&eps_via_nodes
);
1282 re_node_set_free (&eps_via_nodes
);
1283 return free_fail_stack_return (fs
);
1286 static reg_errcode_t
1287 free_fail_stack_return (fs
)
1288 struct re_fail_stack_t
*fs
;
1293 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1295 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1296 re_free (fs
->stack
[fs_idx
].regs
);
1298 re_free (fs
->stack
);
1304 update_regs (dfa
, pmatch
, cur_node
, cur_idx
, nmatch
)
1307 int cur_node
, cur_idx
, nmatch
;
1309 int type
= dfa
->nodes
[cur_node
].type
;
1311 if (type
!= OP_OPEN_SUBEXP
&& type
!= OP_CLOSE_SUBEXP
)
1313 reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1314 if (reg_num
>= nmatch
)
1316 if (type
== OP_OPEN_SUBEXP
)
1318 /* We are at the first node of this sub expression. */
1319 pmatch
[reg_num
].rm_so
= cur_idx
;
1320 pmatch
[reg_num
].rm_eo
= -1;
1322 else if (type
== OP_CLOSE_SUBEXP
)
1323 /* We are at the first node of this sub expression. */
1324 pmatch
[reg_num
].rm_eo
= cur_idx
;
1327 #define NUMBER_OF_STATE 1
1329 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1330 and sift the nodes in each states according to the following rules.
1331 Updated state_log will be wrote to STATE_LOG.
1333 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1334 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1335 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1336 the LAST_NODE, we throw away the node `a'.
1337 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1338 string `s' and transit to `b':
1339 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1341 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1342 throwed away, we throw away the node `a'.
1343 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1344 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1346 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1347 we throw away the node `a'. */
1349 #define STATE_NODE_CONTAINS(state,node) \
1350 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1352 static reg_errcode_t
1353 sift_states_backward (preg
, mctx
, sctx
)
1354 const regex_t
*preg
;
1355 re_match_context_t
*mctx
;
1356 re_sift_context_t
*sctx
;
1359 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1361 int str_idx
= sctx
->last_str_idx
;
1362 re_node_set cur_dest
;
1363 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1366 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1368 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1370 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1371 transit to the last_node and the last_node itself. */
1372 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1373 if (BE (err
!= REG_NOERROR
, 0))
1375 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1376 if (BE (err
!= REG_NOERROR
, 0))
1379 /* Then check each states in the state_log. */
1383 /* Update counters. */
1384 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1385 if (null_cnt
> mctx
->max_mb_elem_len
)
1387 memset (sctx
->sifted_states
, '\0',
1388 sizeof (re_dfastate_t
*) * str_idx
);
1389 re_node_set_free (&cur_dest
);
1392 re_node_set_empty (&cur_dest
);
1394 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1395 : &mctx
->state_log
[str_idx
]->nodes
);
1397 /* Then build the next sifted state.
1398 We build the next sifted state on `cur_dest', and update
1399 `sifted_states[str_idx]' with `cur_dest'.
1401 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1402 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1403 for (i
= 0; i
< cur_src
->nelem
; i
++)
1405 int prev_node
= cur_src
->elems
[i
];
1407 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1409 if (IS_EPSILON_NODE(type
))
1411 #ifdef RE_ENABLE_I18N
1412 /* If the node may accept `multi byte'. */
1413 if (ACCEPT_MB_NODE (type
))
1414 naccepted
= sift_states_iter_mb (preg
, mctx
, sctx
, prev_node
,
1415 str_idx
, sctx
->last_str_idx
);
1417 #endif /* RE_ENABLE_I18N */
1418 /* We don't check backreferences here.
1419 See update_cur_sifted_state(). */
1422 && check_node_accept (preg
, dfa
->nodes
+ prev_node
, mctx
,
1424 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1425 dfa
->nexts
[prev_node
]))
1431 if (sctx
->limits
.nelem
)
1433 int to_idx
= str_idx
+ naccepted
;
1434 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
,
1435 dfa
->nexts
[prev_node
], to_idx
,
1436 prev_node
, str_idx
))
1439 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1440 if (BE (ret
== -1, 0))
1447 /* Add all the nodes which satisfy the following conditions:
1448 - It can epsilon transit to a node in CUR_DEST.
1450 And update state_log. */
1451 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1452 if (BE (err
!= REG_NOERROR
, 0))
1457 re_node_set_free (&cur_dest
);
1461 /* Helper functions. */
1463 static inline reg_errcode_t
1464 clean_state_log_if_need (mctx
, next_state_log_idx
)
1465 re_match_context_t
*mctx
;
1466 int next_state_log_idx
;
1468 int top
= mctx
->state_log_top
;
1470 if (next_state_log_idx
>= mctx
->input
->bufs_len
1471 || (next_state_log_idx
>= mctx
->input
->valid_len
1472 && mctx
->input
->valid_len
< mctx
->input
->len
))
1475 err
= extend_buffers (mctx
);
1476 if (BE (err
!= REG_NOERROR
, 0))
1480 if (top
< next_state_log_idx
)
1482 memset (mctx
->state_log
+ top
+ 1, '\0',
1483 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1484 mctx
->state_log_top
= next_state_log_idx
;
1489 static reg_errcode_t
1490 merge_state_array (dfa
, dst
, src
, num
)
1492 re_dfastate_t
**dst
;
1493 re_dfastate_t
**src
;
1498 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1500 if (dst
[st_idx
] == NULL
)
1501 dst
[st_idx
] = src
[st_idx
];
1502 else if (src
[st_idx
] != NULL
)
1504 re_node_set merged_set
;
1505 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1506 &src
[st_idx
]->nodes
);
1507 if (BE (err
!= REG_NOERROR
, 0))
1509 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1510 re_node_set_free (&merged_set
);
1511 if (BE (err
!= REG_NOERROR
, 0))
1518 static reg_errcode_t
1519 update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1520 const regex_t
*preg
;
1521 re_match_context_t
*mctx
;
1522 re_sift_context_t
*sctx
;
1524 re_node_set
*dest_nodes
;
1527 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1528 const re_node_set
*candidates
;
1529 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1530 : &mctx
->state_log
[str_idx
]->nodes
);
1532 /* At first, add the nodes which can epsilon transit to a node in
1534 if (dest_nodes
->nelem
)
1536 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1537 if (BE (err
!= REG_NOERROR
, 0))
1541 /* Then, check the limitations in the current sift_context. */
1542 if (dest_nodes
->nelem
&& sctx
->limits
.nelem
)
1544 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1545 mctx
->bkref_ents
, str_idx
);
1546 if (BE (err
!= REG_NOERROR
, 0))
1550 /* Update state_log. */
1551 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1552 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1555 /* If we are searching for the subexpression candidates.
1556 Note that we were from transit_state_bkref_loop() in this case. */
1557 if (dest_nodes
->nelem
&& sctx
->check_subexp
)
1559 err
= search_subexp (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1560 if (BE (err
!= REG_NOERROR
, 0))
1564 if ((mctx
->state_log
[str_idx
] != NULL
1565 && mctx
->state_log
[str_idx
]->has_backref
))
1567 err
= sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1568 if (BE (err
!= REG_NOERROR
, 0))
1574 static reg_errcode_t
1575 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1577 re_node_set
*dest_nodes
;
1578 const re_node_set
*candidates
;
1582 re_node_set src_copy
;
1584 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1585 if (BE (err
!= REG_NOERROR
, 0))
1587 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1589 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1591 + src_copy
.elems
[src_idx
]);
1592 if (BE (err
!= REG_NOERROR
, 0))
1594 re_node_set_free (&src_copy
);
1598 re_node_set_free (&src_copy
);
1602 static reg_errcode_t
1603 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1606 re_node_set
*dest_nodes
;
1607 const re_node_set
*candidates
;
1611 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1612 re_node_set except_nodes
;
1613 re_node_set_init_empty (&except_nodes
);
1614 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1616 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1617 if (cur_node
== node
)
1619 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1621 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1622 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1623 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1624 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1625 && re_node_set_contains (dest_nodes
, edst1
))
1627 && !re_node_set_contains (inv_eclosure
, edst2
)
1628 && re_node_set_contains (dest_nodes
, edst2
)))
1630 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1631 dfa
->inveclosures
+ cur_node
);
1632 if (BE (err
!= REG_NOERROR
, 0))
1634 re_node_set_free (&except_nodes
);
1640 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1642 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1643 if (!re_node_set_contains (&except_nodes
, cur_node
))
1645 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1646 re_node_set_remove_at (dest_nodes
, idx
);
1649 re_node_set_free (&except_nodes
);
1654 check_dst_limits (dfa
, limits
, mctx
, dst_node
, dst_idx
, src_node
, src_idx
)
1656 re_node_set
*limits
;
1657 re_match_context_t
*mctx
;
1658 int dst_node
, dst_idx
, src_node
, src_idx
;
1660 int lim_idx
, src_pos
, dst_pos
;
1662 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1665 struct re_backref_cache_entry
*ent
;
1666 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1667 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1669 dst_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1670 dfa
->eclosures
+ dst_node
,
1671 subexp_idx
, dst_node
, dst_idx
);
1672 src_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1673 dfa
->eclosures
+ src_node
,
1674 subexp_idx
, src_node
, src_idx
);
1677 <src> <dst> ( <subexp> )
1678 ( <subexp> ) <src> <dst>
1679 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1680 if (src_pos
== dst_pos
)
1681 continue; /* This is unrelated limitation. */
1689 check_dst_limits_calc_pos (dfa
, mctx
, limit
, eclosures
, subexp_idx
, node
,
1692 re_match_context_t
*mctx
;
1693 re_node_set
*eclosures
;
1694 int limit
, subexp_idx
, node
, str_idx
;
1696 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1697 int pos
= (str_idx
< lim
->subexp_from
? -1
1698 : (lim
->subexp_to
< str_idx
? 1 : 0));
1700 && (str_idx
== lim
->subexp_from
|| str_idx
== lim
->subexp_to
))
1703 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1705 int node
= eclosures
->elems
[node_idx
];
1706 re_token_type_t type
= dfa
->nodes
[node
].type
;
1707 if (type
== OP_BACK_REF
)
1710 for (bi
= 0; bi
< mctx
->nbkref_ents
; ++bi
)
1712 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1713 if (ent
->node
== node
&& ent
->subexp_from
== ent
->subexp_to
1714 && ent
->str_idx
== str_idx
)
1717 dst
= dfa
->edests
[node
].elems
[0];
1718 cpos
= check_dst_limits_calc_pos (dfa
, mctx
, limit
,
1719 dfa
->eclosures
+ dst
,
1722 if ((str_idx
== lim
->subexp_from
&& cpos
== -1)
1723 || (str_idx
== lim
->subexp_to
&& cpos
== 0))
1728 if (type
== OP_OPEN_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1729 && str_idx
== lim
->subexp_from
)
1734 if (type
== OP_CLOSE_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1735 && str_idx
== lim
->subexp_to
)
1738 if (node_idx
== eclosures
->nelem
&& str_idx
== lim
->subexp_to
)
1744 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1745 which are against limitations from DEST_NODES. */
1747 static reg_errcode_t
1748 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1750 re_node_set
*dest_nodes
;
1751 const re_node_set
*candidates
;
1752 re_node_set
*limits
;
1753 struct re_backref_cache_entry
*bkref_ents
;
1757 int node_idx
, lim_idx
;
1759 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1762 struct re_backref_cache_entry
*ent
;
1763 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1765 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1766 continue; /* This is unrelated limitation. */
1768 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1769 if (ent
->subexp_to
== str_idx
)
1773 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1775 int node
= dest_nodes
->elems
[node_idx
];
1776 re_token_type_t type
= dfa
->nodes
[node
].type
;
1777 if (type
== OP_OPEN_SUBEXP
1778 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1780 else if (type
== OP_CLOSE_SUBEXP
1781 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1785 /* Check the limitation of the open subexpression. */
1786 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1789 err
= sub_epsilon_src_nodes(dfa
, ops_node
, dest_nodes
,
1791 if (BE (err
!= REG_NOERROR
, 0))
1794 /* Check the limitation of the close subexpression. */
1795 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1797 int node
= dest_nodes
->elems
[node_idx
];
1798 if (!re_node_set_contains (dfa
->inveclosures
+ node
, cls_node
)
1799 && !re_node_set_contains (dfa
->eclosures
+ node
, cls_node
))
1801 /* It is against this limitation.
1802 Remove it form the current sifted state. */
1803 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1805 if (BE (err
!= REG_NOERROR
, 0))
1811 else /* (ent->subexp_to != str_idx) */
1813 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1815 int node
= dest_nodes
->elems
[node_idx
];
1816 re_token_type_t type
= dfa
->nodes
[node
].type
;
1817 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1819 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1821 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1822 || (type
== OP_OPEN_SUBEXP
))
1824 /* It is against this limitation.
1825 Remove it form the current sifted state. */
1826 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1828 if (BE (err
!= REG_NOERROR
, 0))
1838 /* Search for the top (in case of sctx->check_subexp < 0) or the
1839 bottom (in case of sctx->check_subexp > 0) of the subexpressions
1840 which the backreference sctx->cur_bkref can match. */
1842 static reg_errcode_t
1843 search_subexp (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1844 const regex_t
*preg
;
1845 re_match_context_t
*mctx
;
1846 re_sift_context_t
*sctx
;
1848 re_node_set
*dest_nodes
;
1851 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1852 re_sift_context_t local_sctx
;
1854 const re_node_set
*candidates
;
1855 re_dfastate_t
**lim_states
= NULL
;
1856 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1857 : &mctx
->state_log
[str_idx
]->nodes
);
1858 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1860 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1862 re_token_type_t type
;
1863 node
= dest_nodes
->elems
[node_idx
];
1864 type
= dfa
->nodes
[node
].type
;
1866 if (type
== OP_CLOSE_SUBEXP
1867 && sctx
->check_subexp
== dfa
->nodes
[node
].opr
.idx
+ 1)
1869 re_dfastate_t
*cur_state
;
1870 /* Found the bottom of the subexpression, then search for the
1872 if (local_sctx
.sifted_states
== NULL
)
1874 /* It hasn't been initialized yet, initialize it now. */
1876 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
1877 if (BE (err
!= REG_NOERROR
, 0))
1880 local_sctx
.check_subexp
= -sctx
->check_subexp
;
1881 local_sctx
.limited_states
= sctx
->limited_states
;
1882 local_sctx
.last_node
= node
;
1883 local_sctx
.last_str_idx
= local_sctx
.cls_subexp_idx
= str_idx
;
1884 cur_state
= local_sctx
.sifted_states
[str_idx
];
1885 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
1886 local_sctx
.sifted_states
[str_idx
] = cur_state
;
1887 if (BE (err
!= REG_NOERROR
, 0))
1889 /* There must not 2 same node in a node set. */
1892 else if (type
== OP_OPEN_SUBEXP
1893 && -sctx
->check_subexp
== dfa
->nodes
[node
].opr
.idx
+ 1)
1895 /* Found the top of the subexpression, check that the
1896 backreference can match the input string. */
1899 int bkref_str_idx
= re_string_cur_idx (mctx
->input
);
1900 int subexp_len
= sctx
->cls_subexp_idx
- str_idx
;
1901 if (subexp_len
< 0 || bkref_str_idx
+ subexp_len
> mctx
->input
->len
)
1904 if (bkref_str_idx
+ subexp_len
> mctx
->input
->valid_len
1905 && mctx
->input
->valid_len
< mctx
->input
->len
)
1908 err
= extend_buffers (mctx
);
1909 if (BE (err
!= REG_NOERROR
, 0))
1912 buf
= (char *) re_string_get_buffer (mctx
->input
);
1913 if (strncmp (buf
+ str_idx
, buf
+ bkref_str_idx
, subexp_len
) != 0)
1916 if (sctx
->limits
.nelem
&& str_idx
> 0)
1918 re_dfastate_t
*cur_state
= sctx
->sifted_states
[str_idx
];
1919 if (lim_states
== NULL
)
1921 lim_states
= re_malloc (re_dfastate_t
*, str_idx
+ 1);
1922 if (BE (lim_states
== NULL
, 0))
1928 if (local_sctx
.sifted_states
== NULL
)
1930 /* It hasn't been initialized yet, initialize it now. */
1932 err
= re_node_set_init_copy (&local_sctx
.limits
,
1934 if (BE (err
!= REG_NOERROR
, 0))
1937 local_sctx
.check_subexp
= 0;
1938 local_sctx
.last_node
= node
;
1939 local_sctx
.last_str_idx
= str_idx
;
1940 local_sctx
.limited_states
= lim_states
;
1941 memset (lim_states
, '\0',
1942 sizeof (re_dfastate_t
*) * (str_idx
+ 1));
1943 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
1944 if (BE (err
!= REG_NOERROR
, 0))
1946 if (local_sctx
.sifted_states
[0] == NULL
1947 && local_sctx
.limited_states
[0] == NULL
)
1949 sctx
->sifted_states
[str_idx
] = cur_state
;
1952 sctx
->sifted_states
[str_idx
] = cur_state
;
1954 /* Successfully matched, add a new cache entry. */
1955 dest_str_idx
= bkref_str_idx
+ subexp_len
;
1956 err
= match_ctx_add_entry (mctx
, sctx
->cur_bkref
, bkref_str_idx
,
1957 str_idx
, sctx
->cls_subexp_idx
);
1958 if (BE (err
!= REG_NOERROR
, 0))
1960 err
= clean_state_log_if_need (mctx
, dest_str_idx
);
1961 if (BE (err
!= REG_NOERROR
, 0))
1967 /* Remove the top/bottom of the sub expression we processed. */
1968 if (node_idx
< dest_nodes
->nelem
)
1970 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
, candidates
);
1971 if (BE (err
!= REG_NOERROR
, 0))
1973 /* Update state_log. */
1974 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1975 if (BE (err
!= REG_NOERROR
, 0))
1980 if (local_sctx
.sifted_states
!= NULL
)
1981 re_node_set_free (&local_sctx
.limits
);
1982 if (lim_states
!= NULL
)
1983 re_free (lim_states
);
1987 static reg_errcode_t
1988 sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1989 const regex_t
*preg
;
1990 re_match_context_t
*mctx
;
1991 re_sift_context_t
*sctx
;
1993 re_node_set
*dest_nodes
;
1996 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1998 re_sift_context_t local_sctx
;
1999 const re_node_set
*candidates
;
2000 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
2001 : &mctx
->state_log
[str_idx
]->nodes
);
2002 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2004 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2006 int cur_bkref_idx
= re_string_cur_idx (mctx
->input
);
2007 re_token_type_t type
;
2008 node
= candidates
->elems
[node_idx
];
2009 type
= dfa
->nodes
[node
].type
;
2010 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
2012 /* Avoid infinite loop for the REs like "()\1+". */
2013 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2015 if (type
== OP_BACK_REF
)
2018 for (enabled_idx
= 0; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2020 int disabled_idx
, subexp_len
, to_idx
, dst_node
;
2021 struct re_backref_cache_entry
*entry
;
2022 entry
= mctx
->bkref_ents
+ enabled_idx
;
2023 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2024 to_idx
= str_idx
+ subexp_len
;
2025 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2026 : dfa
->edests
[node
].elems
[0]);
2028 if (entry
->node
!= node
|| entry
->str_idx
!= str_idx
2029 || to_idx
> sctx
->last_str_idx
2030 || sctx
->sifted_states
[to_idx
] == NULL
)
2032 if (!STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
))
2035 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
, node
,
2036 str_idx
, dst_node
, to_idx
))
2038 if (sctx
->check_subexp
== dfa
->nodes
[node
].opr
.idx
)
2041 buf
= (char *) re_string_get_buffer (mctx
->input
);
2042 if (strncmp (buf
+ entry
->subexp_from
,
2043 buf
+ cur_bkref_idx
, subexp_len
) != 0)
2045 err
= match_ctx_add_entry (mctx
, sctx
->cur_bkref
,
2046 cur_bkref_idx
, entry
->subexp_from
,
2048 if (BE (err
!= REG_NOERROR
, 0))
2050 err
= clean_state_log_if_need (mctx
, cur_bkref_idx
2052 if (BE (err
!= REG_NOERROR
, 0))
2057 re_dfastate_t
*cur_state
;
2059 for (disabled_idx
= enabled_idx
+ 1;
2060 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
2062 struct re_backref_cache_entry
*entry2
;
2063 entry2
= mctx
->bkref_ents
+ disabled_idx
;
2064 if (entry2
->node
!= node
|| entry2
->str_idx
!= str_idx
)
2069 if (local_sctx
.sifted_states
== NULL
)
2072 err
= re_node_set_init_copy (&local_sctx
.limits
,
2074 if (BE (err
!= REG_NOERROR
, 0))
2077 local_sctx
.last_node
= node
;
2078 local_sctx
.last_str_idx
= str_idx
;
2079 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2080 if (BE (err
< 0, 0))
2085 cur_state
= local_sctx
.sifted_states
[str_idx
];
2086 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
2087 if (BE (err
!= REG_NOERROR
, 0))
2089 if (sctx
->limited_states
!= NULL
)
2091 err
= merge_state_array (dfa
, sctx
->limited_states
,
2092 local_sctx
.sifted_states
,
2094 if (BE (err
!= REG_NOERROR
, 0))
2097 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2098 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2099 /* We must not use the variable entry here, since
2100 mctx->bkref_ents might be realloced. */
2101 mctx
->bkref_ents
[enabled_idx
].flag
= 1;
2104 for (enabled_idx
= 0; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2106 struct re_backref_cache_entry
*entry
;
2107 entry
= mctx
->bkref_ents
+ enabled_idx
;
2108 if (entry
->node
== node
&& entry
->str_idx
== str_idx
)
2115 if (local_sctx
.sifted_states
!= NULL
)
2117 re_node_set_free (&local_sctx
.limits
);
2124 #ifdef RE_ENABLE_I18N
2126 sift_states_iter_mb (preg
, mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2127 const regex_t
*preg
;
2128 const re_match_context_t
*mctx
;
2129 re_sift_context_t
*sctx
;
2130 int node_idx
, str_idx
, max_str_idx
;
2132 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2134 /* Check the node can accept `multi byte'. */
2135 naccepted
= check_node_accept_bytes (preg
, node_idx
, mctx
->input
, str_idx
);
2136 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2137 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2138 dfa
->nexts
[node_idx
]))
2139 /* The node can't accept the `multi byte', or the
2140 destination was already throwed away, then the node
2141 could't accept the current input `multi byte'. */
2143 /* Otherwise, it is sure that the node could accept
2144 `naccepted' bytes input. */
2147 #endif /* RE_ENABLE_I18N */
2150 /* Functions for state transition. */
2152 /* Return the next state to which the current state STATE will transit by
2153 accepting the current input byte, and update STATE_LOG if necessary.
2154 If STATE can accept a multibyte char/collating element/back reference
2155 update the destination of STATE_LOG. */
2157 static re_dfastate_t
*
2158 transit_state (err
, preg
, mctx
, state
, fl_search
)
2160 const regex_t
*preg
;
2161 re_match_context_t
*mctx
;
2162 re_dfastate_t
*state
;
2165 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2166 re_dfastate_t
**trtable
, *next_state
;
2169 if (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->bufs_len
2170 || (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->valid_len
2171 && mctx
->input
->valid_len
< mctx
->input
->len
))
2173 *err
= extend_buffers (mctx
);
2174 if (BE (*err
!= REG_NOERROR
, 0))
2182 re_string_skip_bytes (mctx
->input
, 1);
2186 #ifdef RE_ENABLE_I18N
2187 /* If the current state can accept multibyte. */
2188 if (state
->accept_mb
)
2190 *err
= transit_state_mb (preg
, state
, mctx
);
2191 if (BE (*err
!= REG_NOERROR
, 0))
2194 #endif /* RE_ENABLE_I18N */
2196 /* Then decide the next state with the single byte. */
2199 /* Use transition table */
2200 ch
= re_string_fetch_byte (mctx
->input
);
2201 trtable
= fl_search
? state
->trtable_search
: state
->trtable
;
2202 if (trtable
== NULL
)
2204 trtable
= build_trtable (preg
, state
, fl_search
);
2206 state
->trtable_search
= trtable
;
2208 state
->trtable
= trtable
;
2210 next_state
= trtable
[ch
];
2214 /* don't use transition table */
2215 next_state
= transit_state_sb (err
, preg
, state
, fl_search
, mctx
);
2216 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
2221 /* Update the state_log if we need. */
2222 if (mctx
->state_log
!= NULL
)
2224 int cur_idx
= re_string_cur_idx (mctx
->input
);
2225 if (cur_idx
> mctx
->state_log_top
)
2227 mctx
->state_log
[cur_idx
] = next_state
;
2228 mctx
->state_log_top
= cur_idx
;
2230 else if (mctx
->state_log
[cur_idx
] == 0)
2232 mctx
->state_log
[cur_idx
] = next_state
;
2236 re_dfastate_t
*pstate
;
2237 unsigned int context
;
2238 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2239 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2240 the destination of a multibyte char/collating element/
2241 back reference. Then the next state is the union set of
2242 these destinations and the results of the transition table. */
2243 pstate
= mctx
->state_log
[cur_idx
];
2244 log_nodes
= pstate
->entrance_nodes
;
2245 if (next_state
!= NULL
)
2247 table_nodes
= next_state
->entrance_nodes
;
2248 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2250 if (BE (*err
!= REG_NOERROR
, 0))
2254 next_nodes
= *log_nodes
;
2255 /* Note: We already add the nodes of the initial state,
2256 then we don't need to add them here. */
2258 context
= re_string_context_at (mctx
->input
,
2259 re_string_cur_idx (mctx
->input
) - 1,
2260 mctx
->eflags
, preg
->newline_anchor
);
2261 next_state
= mctx
->state_log
[cur_idx
]
2262 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2263 /* We don't need to check errors here, since the return value of
2264 this function is next_state and ERR is already set. */
2266 if (table_nodes
!= NULL
)
2267 re_node_set_free (&next_nodes
);
2269 /* If the next state has back references. */
2270 if (next_state
!= NULL
&& next_state
->has_backref
)
2272 *err
= transit_state_bkref (preg
, next_state
, mctx
);
2273 if (BE (*err
!= REG_NOERROR
, 0))
2275 next_state
= mctx
->state_log
[cur_idx
];
2281 /* Helper functions for transit_state. */
2283 /* Return the next state to which the current state STATE will transit by
2284 accepting the current input byte. */
2286 static re_dfastate_t
*
2287 transit_state_sb (err
, preg
, state
, fl_search
, mctx
)
2289 const regex_t
*preg
;
2290 re_dfastate_t
*state
;
2292 re_match_context_t
*mctx
;
2294 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2295 re_node_set next_nodes
;
2296 re_dfastate_t
*next_state
;
2297 int node_cnt
, cur_str_idx
= re_string_cur_idx (mctx
->input
);
2298 unsigned int context
;
2300 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2301 if (BE (*err
!= REG_NOERROR
, 0))
2303 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2305 int cur_node
= state
->nodes
.elems
[node_cnt
];
2306 if (check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
, cur_str_idx
))
2308 *err
= re_node_set_merge (&next_nodes
,
2309 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2310 if (BE (*err
!= REG_NOERROR
, 0))
2312 re_node_set_free (&next_nodes
);
2319 #ifdef RE_ENABLE_I18N
2320 int not_initial
= 0;
2322 for (node_cnt
= 0; node_cnt
< next_nodes
.nelem
; ++node_cnt
)
2323 if (dfa
->nodes
[next_nodes
.elems
[node_cnt
]].type
== CHARACTER
)
2325 not_initial
= dfa
->nodes
[next_nodes
.elems
[node_cnt
]].mb_partial
;
2331 *err
= re_node_set_merge (&next_nodes
,
2332 dfa
->init_state
->entrance_nodes
);
2333 if (BE (*err
!= REG_NOERROR
, 0))
2335 re_node_set_free (&next_nodes
);
2340 context
= re_string_context_at (mctx
->input
, cur_str_idx
, mctx
->eflags
,
2341 preg
->newline_anchor
);
2342 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2343 /* We don't need to check errors here, since the return value of
2344 this function is next_state and ERR is already set. */
2346 re_node_set_free (&next_nodes
);
2347 re_string_skip_bytes (mctx
->input
, 1);
2351 #ifdef RE_ENABLE_I18N
2352 static reg_errcode_t
2353 transit_state_mb (preg
, pstate
, mctx
)
2354 const regex_t
*preg
;
2355 re_dfastate_t
*pstate
;
2356 re_match_context_t
*mctx
;
2359 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2362 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2364 re_node_set dest_nodes
, *new_nodes
;
2365 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2366 int naccepted
= 0, dest_idx
;
2367 unsigned int context
;
2368 re_dfastate_t
*dest_state
;
2370 if (dfa
->nodes
[cur_node_idx
].constraint
)
2372 context
= re_string_context_at (mctx
->input
,
2373 re_string_cur_idx (mctx
->input
),
2374 mctx
->eflags
, preg
->newline_anchor
);
2375 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2380 /* How many bytes the node can accepts? */
2381 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2382 naccepted
= check_node_accept_bytes (preg
, cur_node_idx
, mctx
->input
,
2383 re_string_cur_idx (mctx
->input
));
2387 /* The node can accepts `naccepted' bytes. */
2388 dest_idx
= re_string_cur_idx (mctx
->input
) + naccepted
;
2389 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2390 : mctx
->max_mb_elem_len
);
2391 err
= clean_state_log_if_need (mctx
, dest_idx
);
2392 if (BE (err
!= REG_NOERROR
, 0))
2395 assert (dfa
->nexts
[cur_node_idx
] != -1);
2397 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2398 then we use pstate->nodes.elems[i] instead. */
2399 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2401 dest_state
= mctx
->state_log
[dest_idx
];
2402 if (dest_state
== NULL
)
2403 dest_nodes
= *new_nodes
;
2406 err
= re_node_set_init_union (&dest_nodes
,
2407 dest_state
->entrance_nodes
, new_nodes
);
2408 if (BE (err
!= REG_NOERROR
, 0))
2411 context
= re_string_context_at (mctx
->input
, dest_idx
- 1, mctx
->eflags
,
2412 preg
->newline_anchor
);
2413 mctx
->state_log
[dest_idx
]
2414 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2415 if (dest_state
!= NULL
)
2416 re_node_set_free (&dest_nodes
);
2417 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2422 #endif /* RE_ENABLE_I18N */
2424 static reg_errcode_t
2425 transit_state_bkref (preg
, pstate
, mctx
)
2426 const regex_t
*preg
;
2427 re_dfastate_t
*pstate
;
2428 re_match_context_t
*mctx
;
2431 re_dfastate_t
**work_state_log
;
2433 work_state_log
= re_malloc (re_dfastate_t
*,
2434 re_string_cur_idx (mctx
->input
) + 1);
2435 if (BE (work_state_log
== NULL
, 0))
2438 err
= transit_state_bkref_loop (preg
, &pstate
->nodes
, work_state_log
, mctx
);
2439 re_free (work_state_log
);
2443 /* Caller must allocate `work_state_log'. */
2445 static reg_errcode_t
2446 transit_state_bkref_loop (preg
, nodes
, work_state_log
, mctx
)
2447 const regex_t
*preg
;
2449 re_dfastate_t
**work_state_log
;
2450 re_match_context_t
*mctx
;
2453 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2455 regmatch_t
*cur_regs
= re_malloc (regmatch_t
, preg
->re_nsub
+ 1);
2456 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
2457 if (BE (cur_regs
== NULL
, 0))
2460 for (i
= 0; i
< nodes
->nelem
; ++i
)
2462 int dest_str_idx
, subexp_idx
, prev_nelem
, bkc_idx
;
2463 int node_idx
= nodes
->elems
[i
];
2464 unsigned int context
;
2465 re_token_t
*node
= dfa
->nodes
+ node_idx
;
2466 re_node_set
*new_dest_nodes
;
2467 re_sift_context_t sctx
;
2469 /* Check whether `node' is a backreference or not. */
2470 if (node
->type
== OP_BACK_REF
)
2471 subexp_idx
= node
->opr
.idx
;
2475 if (node
->constraint
)
2477 context
= re_string_context_at (mctx
->input
, cur_str_idx
,
2478 mctx
->eflags
, preg
->newline_anchor
);
2479 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2483 /* `node' is a backreference.
2484 Check the substring which the substring matched. */
2485 sift_ctx_init (&sctx
, work_state_log
, NULL
, node_idx
, cur_str_idx
,
2487 sctx
.cur_bkref
= node_idx
;
2488 match_ctx_clear_flag (mctx
);
2489 err
= sift_states_backward (preg
, mctx
, &sctx
);
2490 if (BE (err
!= REG_NOERROR
, 0))
2493 /* And add the epsilon closures (which is `new_dest_nodes') of
2494 the backreference to appropriate state_log. */
2496 assert (dfa
->nexts
[node_idx
] != -1);
2498 for (bkc_idx
= 0; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2501 re_dfastate_t
*dest_state
;
2502 struct re_backref_cache_entry
*bkref_ent
;
2503 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2504 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2506 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2507 new_dest_nodes
= (subexp_len
== 0
2508 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2509 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2510 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2511 - bkref_ent
->subexp_from
);
2512 context
= (IS_WORD_CHAR (re_string_byte_at (mctx
->input
,
2514 ? CONTEXT_WORD
: 0);
2515 dest_state
= mctx
->state_log
[dest_str_idx
];
2516 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2517 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2518 /* Add `new_dest_node' to state_log. */
2519 if (dest_state
== NULL
)
2521 mctx
->state_log
[dest_str_idx
]
2522 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2524 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2525 && err
!= REG_NOERROR
, 0))
2530 re_node_set dest_nodes
;
2531 err
= re_node_set_init_union (&dest_nodes
,
2532 dest_state
->entrance_nodes
,
2534 if (BE (err
!= REG_NOERROR
, 0))
2536 re_node_set_free (&dest_nodes
);
2539 mctx
->state_log
[dest_str_idx
]
2540 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2541 re_node_set_free (&dest_nodes
);
2542 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2543 && err
!= REG_NOERROR
, 0))
2546 /* We need to check recursively if the backreference can epsilon
2549 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2551 err
= transit_state_bkref_loop (preg
, new_dest_nodes
,
2552 work_state_log
, mctx
);
2553 if (BE (err
!= REG_NOERROR
, 0))
2564 /* Build transition table for the state.
2565 Return the new table if succeeded, otherwise return NULL. */
2567 static re_dfastate_t
**
2568 build_trtable (preg
, state
, fl_search
)
2569 const regex_t
*preg
;
2570 const re_dfastate_t
*state
;
2574 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2576 int dests_node_malloced
= 0, dest_states_malloced
= 0;
2577 int ndests
; /* Number of the destination states from `state'. */
2578 re_dfastate_t
**trtable
;
2579 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
2580 re_node_set follows
, *dests_node
;
2584 /* We build DFA states which corresponds to the destination nodes
2585 from `state'. `dests_node[i]' represents the nodes which i-th
2586 destination state contains, and `dests_ch[i]' represents the
2587 characters which i-th destination state accepts. */
2589 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
2590 dests_node
= (re_node_set
*)
2591 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
2595 dests_node
= (re_node_set
*)
2596 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
2597 if (BE (dests_node
== NULL
, 0))
2599 dests_node_malloced
= 1;
2601 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
2603 /* Initialize transiton table. */
2604 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
2605 if (BE (trtable
== NULL
, 0))
2607 if (dests_node_malloced
)
2612 /* At first, group all nodes belonging to `state' into several
2614 ndests
= group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
);
2615 if (BE (ndests
<= 0, 0))
2617 if (dests_node_malloced
)
2619 /* Return NULL in case of an error, trtable otherwise. */
2626 err
= re_node_set_alloc (&follows
, ndests
+ 1);
2627 if (BE (err
!= REG_NOERROR
, 0))
2631 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
2632 + ndests
* 3 * sizeof (re_dfastate_t
*)))
2633 dest_states
= (re_dfastate_t
**)
2634 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
2638 dest_states
= (re_dfastate_t
**)
2639 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
2640 if (BE (dest_states
== NULL
, 0))
2643 if (dest_states_malloced
)
2645 re_node_set_free (&follows
);
2646 for (i
= 0; i
< ndests
; ++i
)
2647 re_node_set_free (dests_node
+ i
);
2649 if (dests_node_malloced
)
2653 dest_states_malloced
= 1;
2655 dest_states_word
= dest_states
+ ndests
;
2656 dest_states_nl
= dest_states_word
+ ndests
;
2657 bitset_empty (acceptable
);
2659 /* Then build the states for all destinations. */
2660 for (i
= 0; i
< ndests
; ++i
)
2663 re_node_set_empty (&follows
);
2664 /* Merge the follows of this destination states. */
2665 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
2667 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
2668 if (next_node
!= -1)
2670 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
2671 if (BE (err
!= REG_NOERROR
, 0))
2675 /* If search flag is set, merge the initial state. */
2678 #ifdef RE_ENABLE_I18N
2679 int not_initial
= 0;
2680 for (j
= 0; j
< follows
.nelem
; ++j
)
2681 if (dfa
->nodes
[follows
.elems
[j
]].type
== CHARACTER
)
2683 not_initial
= dfa
->nodes
[follows
.elems
[j
]].mb_partial
;
2689 err
= re_node_set_merge (&follows
,
2690 dfa
->init_state
->entrance_nodes
);
2691 if (BE (err
!= REG_NOERROR
, 0))
2695 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
2696 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2698 /* If the new state has context constraint,
2699 build appropriate states for these contexts. */
2700 if (dest_states
[i
]->has_constraint
)
2702 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
2704 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2706 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
2708 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
2713 dest_states_word
[i
] = dest_states
[i
];
2714 dest_states_nl
[i
] = dest_states
[i
];
2716 bitset_merge (acceptable
, dests_ch
[i
]);
2719 /* Update the transition table. */
2720 /* For all characters ch...: */
2721 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
2722 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
2723 if ((acceptable
[i
] >> j
) & 1)
2725 /* The current state accepts the character ch. */
2726 if (IS_WORD_CHAR (ch
))
2728 for (k
= 0; k
< ndests
; ++k
)
2729 if ((dests_ch
[k
][i
] >> j
) & 1)
2731 /* k-th destination accepts the word character ch. */
2732 trtable
[ch
] = dest_states_word
[k
];
2733 /* There must be only one destination which accepts
2734 character ch. See group_nodes_into_DFAstates. */
2738 else /* not WORD_CHAR */
2740 for (k
= 0; k
< ndests
; ++k
)
2741 if ((dests_ch
[k
][i
] >> j
) & 1)
2743 /* k-th destination accepts the non-word character ch. */
2744 trtable
[ch
] = dest_states
[k
];
2745 /* There must be only one destination which accepts
2746 character ch. See group_nodes_into_DFAstates. */
2752 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
2754 /* The current state accepts newline character. */
2755 for (k
= 0; k
< ndests
; ++k
)
2756 if (bitset_contain (dests_ch
[k
], NEWLINE_CHAR
))
2758 /* k-th destination accepts newline character. */
2759 trtable
[NEWLINE_CHAR
] = dest_states_nl
[k
];
2760 /* There must be only one destination which accepts
2761 newline. See group_nodes_into_DFAstates. */
2766 if (dest_states_malloced
)
2769 re_node_set_free (&follows
);
2770 for (i
= 0; i
< ndests
; ++i
)
2771 re_node_set_free (dests_node
+ i
);
2773 if (dests_node_malloced
)
2779 /* Group all nodes belonging to STATE into several destinations.
2780 Then for all destinations, set the nodes belonging to the destination
2781 to DESTS_NODE[i] and set the characters accepted by the destination
2782 to DEST_CH[i]. This function return the number of destinations. */
2785 group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
)
2786 const regex_t
*preg
;
2787 const re_dfastate_t
*state
;
2788 re_node_set
*dests_node
;
2792 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2794 int ndests
; /* Number of the destinations from `state'. */
2795 bitset accepts
; /* Characters a node can accept. */
2796 const re_node_set
*cur_nodes
= &state
->nodes
;
2797 bitset_empty (accepts
);
2800 /* For all the nodes belonging to `state', */
2801 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
2803 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
2804 re_token_type_t type
= node
->type
;
2805 unsigned int constraint
= node
->constraint
;
2807 /* Enumerate all single byte character this node can accept. */
2808 if (type
== CHARACTER
)
2809 bitset_set (accepts
, node
->opr
.c
);
2810 else if (type
== SIMPLE_BRACKET
)
2812 bitset_merge (accepts
, node
->opr
.sbcset
);
2814 else if (type
== OP_PERIOD
)
2816 bitset_set_all (accepts
);
2817 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
2818 bitset_clear (accepts
, '\n');
2819 if (preg
->syntax
& RE_DOT_NOT_NULL
)
2820 bitset_clear (accepts
, '\0');
2825 /* Check the `accepts' and sift the characters which are not
2826 match it the context. */
2829 if (constraint
& NEXT_WORD_CONSTRAINT
)
2830 for (j
= 0; j
< BITSET_UINTS
; ++j
)
2831 accepts
[j
] &= dfa
->word_char
[j
];
2832 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
2833 for (j
= 0; j
< BITSET_UINTS
; ++j
)
2834 accepts
[j
] &= ~dfa
->word_char
[j
];
2835 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
2837 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
2838 bitset_empty (accepts
);
2839 if (accepts_newline
)
2840 bitset_set (accepts
, NEWLINE_CHAR
);
2846 /* Then divide `accepts' into DFA states, or create a new
2848 for (j
= 0; j
< ndests
; ++j
)
2850 bitset intersec
; /* Intersection sets, see below. */
2852 /* Flags, see below. */
2853 int has_intersec
, not_subset
, not_consumed
;
2855 /* Optimization, skip if this state doesn't accept the character. */
2856 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
2859 /* Enumerate the intersection set of this state and `accepts'. */
2861 for (k
= 0; k
< BITSET_UINTS
; ++k
)
2862 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
2863 /* And skip if the intersection set is empty. */
2867 /* Then check if this state is a subset of `accepts'. */
2868 not_subset
= not_consumed
= 0;
2869 for (k
= 0; k
< BITSET_UINTS
; ++k
)
2871 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
2872 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
2875 /* If this state isn't a subset of `accepts', create a
2876 new group state, which has the `remains'. */
2879 bitset_copy (dests_ch
[ndests
], remains
);
2880 bitset_copy (dests_ch
[j
], intersec
);
2881 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
2882 if (BE (err
!= REG_NOERROR
, 0))
2887 /* Put the position in the current group. */
2888 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
2889 if (BE (err
< 0, 0))
2892 /* If all characters are consumed, go to next node. */
2896 /* Some characters remain, create a new group. */
2899 bitset_copy (dests_ch
[ndests
], accepts
);
2900 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
2901 if (BE (err
!= REG_NOERROR
, 0))
2904 bitset_empty (accepts
);
2909 for (j
= 0; j
< ndests
; ++j
)
2910 re_node_set_free (dests_node
+ j
);
2914 #ifdef RE_ENABLE_I18N
2915 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2916 Return the number of the bytes the node accepts.
2917 STR_IDX is the current index of the input string.
2919 This function handles the nodes which can accept one character, or
2920 one collating element like '.', '[a-z]', opposite to the other nodes
2921 can only accept one byte. */
2924 check_node_accept_bytes (preg
, node_idx
, input
, str_idx
)
2925 const regex_t
*preg
;
2926 int node_idx
, str_idx
;
2927 const re_string_t
*input
;
2929 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2930 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2931 int elem_len
= re_string_elem_size_at (input
, str_idx
);
2932 int char_len
= re_string_char_size_at (input
, str_idx
);
2936 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2938 if (elem_len
<= 1 && char_len
<= 1)
2940 if (node
->type
== OP_PERIOD
)
2942 /* '.' accepts any one character except the following two cases. */
2943 if ((!(preg
->syntax
& RE_DOT_NEWLINE
) &&
2944 re_string_byte_at (input
, str_idx
) == '\n') ||
2945 ((preg
->syntax
& RE_DOT_NOT_NULL
) &&
2946 re_string_byte_at (input
, str_idx
) == '\0'))
2950 else if (node
->type
== COMPLEX_BRACKET
)
2952 const re_charset_t
*cset
= node
->opr
.mbcset
;
2954 const unsigned char *pin
= re_string_get_buffer (input
) + str_idx
;
2957 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
2958 ? re_string_wchar_at (input
, str_idx
) : 0);
2960 /* match with multibyte character? */
2961 for (i
= 0; i
< cset
->nmbchars
; ++i
)
2962 if (wc
== cset
->mbchars
[i
])
2964 match_len
= char_len
;
2965 goto check_node_accept_bytes_match
;
2967 /* match with character_class? */
2968 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
2970 wctype_t wt
= cset
->char_classes
[i
];
2971 if (__iswctype (wc
, wt
))
2973 match_len
= char_len
;
2974 goto check_node_accept_bytes_match
;
2981 unsigned int in_collseq
= 0;
2982 const int32_t *table
, *indirect
;
2983 const unsigned char *weights
, *extra
;
2984 const char *collseqwc
;
2986 /* This #include defines a local function! */
2987 # include <locale/weight.h>
2989 /* match with collating_symbol? */
2990 if (cset
->ncoll_syms
)
2991 extra
= (const unsigned char *)
2992 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
2993 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
2995 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
2996 /* Compare the length of input collating element and
2997 the length of current collating element. */
2998 if (*coll_sym
!= elem_len
)
3000 /* Compare each bytes. */
3001 for (j
= 0; j
< *coll_sym
; j
++)
3002 if (pin
[j
] != coll_sym
[1 + j
])
3006 /* Match if every bytes is equal. */
3008 goto check_node_accept_bytes_match
;
3014 if (elem_len
<= char_len
)
3016 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3017 in_collseq
= collseq_table_lookup (collseqwc
, wc
);
3020 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3022 /* match with range expression? */
3023 for (i
= 0; i
< cset
->nranges
; ++i
)
3024 if (cset
->range_starts
[i
] <= in_collseq
3025 && in_collseq
<= cset
->range_ends
[i
])
3027 match_len
= elem_len
;
3028 goto check_node_accept_bytes_match
;
3031 /* match with equivalence_class? */
3032 if (cset
->nequiv_classes
)
3034 const unsigned char *cp
= pin
;
3035 table
= (const int32_t *)
3036 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3037 weights
= (const unsigned char *)
3038 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3039 extra
= (const unsigned char *)
3040 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3041 indirect
= (const int32_t *)
3042 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3043 idx
= findidx (&cp
);
3045 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3047 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3048 size_t weight_len
= weights
[idx
];
3049 if (weight_len
== weights
[equiv_class_idx
])
3052 while (cnt
<= weight_len
3053 && (weights
[equiv_class_idx
+ 1 + cnt
]
3054 == weights
[idx
+ 1 + cnt
]))
3056 if (cnt
> weight_len
)
3058 match_len
= elem_len
;
3059 goto check_node_accept_bytes_match
;
3068 /* match with range expression? */
3070 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3072 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3075 for (i
= 0; i
< cset
->nranges
; ++i
)
3077 cmp_buf
[0] = cset
->range_starts
[i
];
3078 cmp_buf
[4] = cset
->range_ends
[i
];
3079 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3080 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3082 match_len
= char_len
;
3083 goto check_node_accept_bytes_match
;
3087 check_node_accept_bytes_match
:
3088 if (!cset
->non_match
)
3095 return (elem_len
> char_len
) ? elem_len
: char_len
;
3103 find_collation_sequence_value (mbs
, mbs_len
)
3104 const unsigned char *mbs
;
3107 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3112 /* No valid character. Match it as a single byte character. */
3113 const unsigned char *collseq
= (const unsigned char *)
3114 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3115 return collseq
[mbs
[0]];
3122 const unsigned char *extra
= (const unsigned char *)
3123 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3127 int mbs_cnt
, found
= 0;
3128 int32_t elem_mbs_len
;
3129 /* Skip the name of collating element name. */
3130 idx
= idx
+ extra
[idx
] + 1;
3131 elem_mbs_len
= extra
[idx
++];
3132 if (mbs_len
== elem_mbs_len
)
3134 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3135 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3137 if (mbs_cnt
== elem_mbs_len
)
3138 /* Found the entry. */
3141 /* Skip the byte sequence of the collating element. */
3142 idx
+= elem_mbs_len
;
3143 /* Adjust for the alignment. */
3144 idx
= (idx
+ 3) & ~3;
3145 /* Skip the collation sequence value. */
3146 idx
+= sizeof (uint32_t);
3147 /* Skip the wide char sequence of the collating element. */
3148 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3149 /* If we found the entry, return the sequence value. */
3151 return *(uint32_t *) (extra
+ idx
);
3152 /* Skip the collation sequence value. */
3153 idx
+= sizeof (uint32_t);
3158 #endif /* RE_ENABLE_I18N */
3160 /* Check whether the node accepts the byte which is IDX-th
3161 byte of the INPUT. */
3164 check_node_accept (preg
, node
, mctx
, idx
)
3165 const regex_t
*preg
;
3166 const re_token_t
*node
;
3167 const re_match_context_t
*mctx
;
3171 if (node
->constraint
)
3173 /* The node has constraints. Check whether the current context
3174 satisfies the constraints. */
3175 unsigned int context
= re_string_context_at (mctx
->input
, idx
,
3177 preg
->newline_anchor
);
3178 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3181 ch
= re_string_byte_at (mctx
->input
, idx
);
3182 if (node
->type
== CHARACTER
)
3183 return node
->opr
.c
== ch
;
3184 else if (node
->type
== SIMPLE_BRACKET
)
3185 return bitset_contain (node
->opr
.sbcset
, ch
);
3186 else if (node
->type
== OP_PERIOD
)
3187 return !((ch
== '\n' && !(preg
->syntax
& RE_DOT_NEWLINE
))
3188 || (ch
== '\0' && (preg
->syntax
& RE_DOT_NOT_NULL
)));
3193 /* Extend the buffers, if the buffers have run out. */
3195 static reg_errcode_t
3196 extend_buffers (mctx
)
3197 re_match_context_t
*mctx
;
3200 re_string_t
*pstr
= mctx
->input
;
3202 /* Double the lengthes of the buffers. */
3203 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3204 if (BE (ret
!= REG_NOERROR
, 0))
3207 if (mctx
->state_log
!= NULL
)
3209 /* And double the length of state_log. */
3210 re_dfastate_t
**new_array
;
3211 new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3212 pstr
->bufs_len
* 2);
3213 if (BE (new_array
== NULL
, 0))
3215 mctx
->state_log
= new_array
;
3218 /* Then reconstruct the buffers. */
3221 #ifdef RE_ENABLE_I18N
3223 build_wcs_upper_buffer (pstr
);
3225 #endif /* RE_ENABLE_I18N */
3226 build_upper_buffer (pstr
);
3230 #ifdef RE_ENABLE_I18N
3232 build_wcs_buffer (pstr
);
3234 #endif /* RE_ENABLE_I18N */
3236 if (pstr
->trans
!= NULL
)
3237 re_string_translate_buffer (pstr
);
3239 pstr
->valid_len
= pstr
->bufs_len
;
3246 /* Functions for matching context. */
3248 static reg_errcode_t
3249 match_ctx_init (mctx
, eflags
, input
, n
)
3250 re_match_context_t
*mctx
;
3254 mctx
->eflags
= eflags
;
3255 mctx
->input
= input
;
3256 mctx
->match_last
= -1;
3259 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
3260 if (BE (mctx
->bkref_ents
== NULL
, 0))
3264 mctx
->bkref_ents
= NULL
;
3265 mctx
->nbkref_ents
= 0;
3266 mctx
->abkref_ents
= n
;
3267 mctx
->max_mb_elem_len
= 0;
3272 match_ctx_free (mctx
)
3273 re_match_context_t
*mctx
;
3275 re_free (mctx
->bkref_ents
);
3278 /* Add a new backreference entry to the cache. */
3280 static reg_errcode_t
3281 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
3282 re_match_context_t
*mctx
;
3283 int node
, str_idx
, from
, to
;
3285 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
3287 struct re_backref_cache_entry
* new_entry
;
3288 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
3289 mctx
->abkref_ents
* 2);
3290 if (BE (new_entry
== NULL
, 0))
3292 re_free (mctx
->bkref_ents
);
3295 mctx
->bkref_ents
= new_entry
;
3296 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
3297 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
3298 mctx
->abkref_ents
*= 2;
3300 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
3301 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
3302 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
3303 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
3304 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
3305 if (mctx
->max_mb_elem_len
< to
- from
)
3306 mctx
->max_mb_elem_len
= to
- from
;
3311 match_ctx_clear_flag (mctx
)
3312 re_match_context_t
*mctx
;
3315 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
3317 mctx
->bkref_ents
[i
].flag
= 0;
3322 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
3324 re_sift_context_t
*sctx
;
3325 re_dfastate_t
**sifted_sts
, **limited_sts
;
3326 int last_node
, last_str_idx
, check_subexp
;
3328 sctx
->sifted_states
= sifted_sts
;
3329 sctx
->limited_states
= limited_sts
;
3330 sctx
->last_node
= last_node
;
3331 sctx
->last_str_idx
= last_str_idx
;
3332 sctx
->check_subexp
= check_subexp
;
3333 sctx
->cur_bkref
= -1;
3334 sctx
->cls_subexp_idx
= -1;
3335 re_node_set_init_empty (&sctx
->limits
);