rebase --root: fix amending root commit messages
[git.git] / compat / regex / regexec.c
blob1b5d89fd5ed1a2c143bc6dafdf027b1f5f2cf790
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 int n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25 int str_idx, int from, int to)
26 internal_function;
27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
28 internal_function;
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30 int str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 int node, int str_idx)
33 internal_function;
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
36 int last_str_idx)
37 internal_function;
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, int length,
40 int start, int range, int stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags);
43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, int length1,
45 const char *string2, int length2,
46 int start, int range, struct re_registers *regs,
47 int stop, int ret_len);
48 static int re_search_stub (struct re_pattern_buffer *bufp,
49 const char *string, int length, int start,
50 int range, int stop, struct re_registers *regs,
51 int ret_len);
52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53 int nregs, int regs_allocated);
54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
55 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
56 int *p_match_first) internal_function;
57 static int check_halt_state_context (const re_match_context_t *mctx,
58 const re_dfastate_t *state, int idx)
59 internal_function;
60 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
61 regmatch_t *prev_idx_match, int cur_node,
62 int cur_idx, int nmatch) internal_function;
63 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
64 int str_idx, int dest_node, int nregs,
65 regmatch_t *regs,
66 re_node_set *eps_via_nodes)
67 internal_function;
68 static reg_errcode_t set_regs (const regex_t *preg,
69 const re_match_context_t *mctx,
70 size_t nmatch, regmatch_t *pmatch,
71 int fl_backtrack) internal_function;
72 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
73 internal_function;
75 #ifdef RE_ENABLE_I18N
76 static int sift_states_iter_mb (const re_match_context_t *mctx,
77 re_sift_context_t *sctx,
78 int node_idx, int str_idx, int max_str_idx)
79 internal_function;
80 #endif /* RE_ENABLE_I18N */
81 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
82 re_sift_context_t *sctx)
83 internal_function;
84 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
85 re_sift_context_t *sctx, int str_idx,
86 re_node_set *cur_dest)
87 internal_function;
88 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int str_idx,
91 re_node_set *dest_nodes)
92 internal_function;
93 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
94 re_node_set *dest_nodes,
95 const re_node_set *candidates)
96 internal_function;
97 static int check_dst_limits (const re_match_context_t *mctx,
98 re_node_set *limits,
99 int dst_node, int dst_idx, int src_node,
100 int src_idx) internal_function;
101 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
102 int boundaries, int subexp_idx,
103 int from_node, int bkref_idx)
104 internal_function;
105 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
106 int limit, int subexp_idx,
107 int node, int str_idx,
108 int bkref_idx) internal_function;
109 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
110 re_node_set *dest_nodes,
111 const re_node_set *candidates,
112 re_node_set *limits,
113 struct re_backref_cache_entry *bkref_ents,
114 int str_idx) internal_function;
115 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
116 re_sift_context_t *sctx,
117 int str_idx, const re_node_set *candidates)
118 internal_function;
119 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
120 re_dfastate_t **dst,
121 re_dfastate_t **src, int num)
122 internal_function;
123 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
124 re_match_context_t *mctx) internal_function;
125 static re_dfastate_t *transit_state (reg_errcode_t *err,
126 re_match_context_t *mctx,
127 re_dfastate_t *state) internal_function;
128 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *next_state)
131 internal_function;
132 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
133 re_node_set *cur_nodes,
134 int str_idx) internal_function;
135 #if 0
136 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
137 re_match_context_t *mctx,
138 re_dfastate_t *pstate)
139 internal_function;
140 #endif
141 #ifdef RE_ENABLE_I18N
142 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
143 re_dfastate_t *pstate)
144 internal_function;
145 #endif /* RE_ENABLE_I18N */
146 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 const re_node_set *nodes)
148 internal_function;
149 static reg_errcode_t get_subexp (re_match_context_t *mctx,
150 int bkref_node, int bkref_str_idx)
151 internal_function;
152 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
153 const re_sub_match_top_t *sub_top,
154 re_sub_match_last_t *sub_last,
155 int bkref_node, int bkref_str)
156 internal_function;
157 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
158 int subexp_idx, int type) internal_function;
159 static reg_errcode_t check_arrival (re_match_context_t *mctx,
160 state_array_t *path, int top_node,
161 int top_str, int last_node, int last_str,
162 int type) internal_function;
163 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
164 int str_idx,
165 re_node_set *cur_nodes,
166 re_node_set *next_nodes)
167 internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type)
171 internal_function;
172 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
173 re_node_set *dst_nodes,
174 int target, int ex_subexp,
175 int type) internal_function;
176 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
177 re_node_set *cur_nodes, int cur_str,
178 int subexp_num, int type)
179 internal_function;
180 static int build_trtable (const re_dfa_t *dfa,
181 re_dfastate_t *state) internal_function;
182 #ifdef RE_ENABLE_I18N
183 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
184 const re_string_t *input, int idx)
185 internal_function;
186 # ifdef _LIBC
187 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
188 size_t name_len)
189 internal_function;
190 # endif /* _LIBC */
191 #endif /* RE_ENABLE_I18N */
192 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
193 const re_dfastate_t *state,
194 re_node_set *states_node,
195 bitset_t *states_ch) internal_function;
196 static int check_node_accept (const re_match_context_t *mctx,
197 const re_token_t *node, int idx)
198 internal_function;
199 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
200 internal_function;
202 /* Entry point for POSIX code. */
204 /* regexec searches for a given pattern, specified by PREG, in the
205 string STRING.
207 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
208 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
209 least NMATCH elements, and we set them to the offsets of the
210 corresponding matched substrings.
212 EFLAGS specifies `execution flags' which affect matching: if
213 REG_NOTBOL is set, then ^ does not match at the beginning of the
214 string; if REG_NOTEOL is set, then $ does not match at the end.
216 We return 0 if we find a match and REG_NOMATCH if not. */
219 regexec (
220 const regex_t *__restrict preg,
221 const char *__restrict string,
222 size_t nmatch,
223 regmatch_t pmatch[],
224 int eflags)
226 reg_errcode_t err;
227 int start, length;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
237 else
239 start = 0;
240 length = strlen (string);
243 __libc_lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
254 #ifdef _LIBC
255 # include <shlib-compat.h>
256 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec) __compat_regexec;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t *__restrict preg,
264 const char *__restrict string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
270 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271 # endif
272 #endif
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
304 re_match (struct re_pattern_buffer *bufp,
305 const char *string,
306 int length,
307 int start,
308 struct re_registers *regs)
310 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
312 #ifdef _LIBC
313 weak_alias (__re_match, re_match)
314 #endif
317 re_search (struct re_pattern_buffer *bufp,
318 const char *string,
319 int length, int start, int range,
320 struct re_registers *regs)
322 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
324 #ifdef _LIBC
325 weak_alias (__re_search, re_search)
326 #endif
329 re_match_2 (struct re_pattern_buffer *bufp,
330 const char *string1, int length1,
331 const char *string2, int length2, int start,
332 struct re_registers *regs, int stop)
334 return re_search_2_stub (bufp, string1, length1, string2, length2,
335 start, 0, regs, stop, 1);
337 #ifdef _LIBC
338 weak_alias (__re_match_2, re_match_2)
339 #endif
342 re_search_2 (struct re_pattern_buffer *bufp,
343 const char *string1, int length1,
344 const char *string2, int length2, int start,
345 int range, struct re_registers *regs, int stop)
347 return re_search_2_stub (bufp, string1, length1, string2, length2,
348 start, range, regs, stop, 0);
350 #ifdef _LIBC
351 weak_alias (__re_search_2, re_search_2)
352 #endif
354 static int
355 re_search_2_stub (struct re_pattern_buffer *bufp,
356 const char *string1, int length1,
357 const char *string2, int length2, int start,
358 int range, struct re_registers *regs,
359 int stop, int ret_len)
361 const char *str;
362 int rval;
363 int len = length1 + length2;
364 int free_str = 0;
366 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
367 return -2;
369 /* Concatenate the strings. */
370 if (length2 > 0)
371 if (length1 > 0)
373 char *s = re_malloc (char, len);
375 if (BE (s == NULL, 0))
376 return -2;
377 memcpy (s, string1, length1);
378 memcpy (s + length1, string2, length2);
379 str = s;
380 free_str = 1;
382 else
383 str = string2;
384 else
385 str = string1;
387 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
388 if (free_str)
389 re_free ((char *) str);
390 return rval;
393 /* The parameters have the same meaning as those of re_search.
394 Additional parameters:
395 If RET_LEN is nonzero the length of the match is returned (re_match style);
396 otherwise the position of the match is returned. */
398 static int
399 re_search_stub (struct re_pattern_buffer *bufp,
400 const char *string, int length, int start,
401 int range, int stop,
402 struct re_registers *regs, int ret_len)
404 reg_errcode_t result;
405 regmatch_t *pmatch;
406 int nregs, rval;
407 int eflags = 0;
409 /* Check for out-of-range. */
410 if (BE (start < 0 || start > length, 0))
411 return -1;
412 if (BE (start + range > length, 0))
413 range = length - start;
414 else if (BE (start + range < 0, 0))
415 range = -start;
417 __libc_lock_lock (dfa->lock);
419 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
420 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
422 /* Compile fastmap if we haven't yet. */
423 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
424 re_compile_fastmap (bufp);
426 if (BE (bufp->no_sub, 0))
427 regs = NULL;
429 /* We need at least 1 register. */
430 if (regs == NULL)
431 nregs = 1;
432 else if (BE (bufp->regs_allocated == REGS_FIXED &&
433 regs->num_regs < bufp->re_nsub + 1, 0))
435 nregs = regs->num_regs;
436 if (BE (nregs < 1, 0))
438 /* Nothing can be copied to regs. */
439 regs = NULL;
440 nregs = 1;
443 else
444 nregs = bufp->re_nsub + 1;
445 pmatch = re_malloc (regmatch_t, nregs);
446 if (BE (pmatch == NULL, 0))
448 rval = -2;
449 goto out;
452 result = re_search_internal (bufp, string, length, start, range, stop,
453 nregs, pmatch, eflags);
455 rval = 0;
457 /* I hope we needn't fill their regs with -1's when no match was found. */
458 if (result != REG_NOERROR)
459 rval = -1;
460 else if (regs != NULL)
462 /* If caller wants register contents data back, copy them. */
463 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
464 bufp->regs_allocated);
465 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
466 rval = -2;
469 if (BE (rval == 0, 1))
471 if (ret_len)
473 assert (pmatch[0].rm_so == start);
474 rval = pmatch[0].rm_eo - start;
476 else
477 rval = pmatch[0].rm_so;
479 re_free (pmatch);
480 out:
481 __libc_lock_unlock (dfa->lock);
482 return rval;
485 static unsigned
486 re_copy_regs (struct re_registers *regs,
487 regmatch_t *pmatch,
488 int nregs, int regs_allocated)
490 int rval = REGS_REALLOCATE;
491 int i;
492 int need_regs = nregs + 1;
493 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
494 uses. */
496 /* Have the register data arrays been allocated? */
497 if (regs_allocated == REGS_UNALLOCATED)
498 { /* No. So allocate them with malloc. */
499 regs->start = re_malloc (regoff_t, need_regs);
500 if (BE (regs->start == NULL, 0))
501 return REGS_UNALLOCATED;
502 regs->end = re_malloc (regoff_t, need_regs);
503 if (BE (regs->end == NULL, 0))
505 re_free (regs->start);
506 return REGS_UNALLOCATED;
508 regs->num_regs = need_regs;
510 else if (regs_allocated == REGS_REALLOCATE)
511 { /* Yes. If we need more elements than were already
512 allocated, reallocate them. If we need fewer, just
513 leave it alone. */
514 if (BE (need_regs > regs->num_regs, 0))
516 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
517 regoff_t *new_end;
518 if (BE (new_start == NULL, 0))
519 return REGS_UNALLOCATED;
520 new_end = re_realloc (regs->end, regoff_t, need_regs);
521 if (BE (new_end == NULL, 0))
523 re_free (new_start);
524 return REGS_UNALLOCATED;
526 regs->start = new_start;
527 regs->end = new_end;
528 regs->num_regs = need_regs;
531 else
533 assert (regs_allocated == REGS_FIXED);
534 /* This function may not be called with REGS_FIXED and nregs too big. */
535 assert (regs->num_regs >= nregs);
536 rval = REGS_FIXED;
539 /* Copy the regs. */
540 for (i = 0; i < nregs; ++i)
542 regs->start[i] = pmatch[i].rm_so;
543 regs->end[i] = pmatch[i].rm_eo;
545 for ( ; i < regs->num_regs; ++i)
546 regs->start[i] = regs->end[i] = -1;
548 return rval;
551 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
552 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
553 this memory for recording register information. STARTS and ENDS
554 must be allocated using the malloc library routine, and must each
555 be at least NUM_REGS * sizeof (regoff_t) bytes long.
557 If NUM_REGS == 0, then subsequent matches should allocate their own
558 register data.
560 Unless this function is called, the first search or match using
561 PATTERN_BUFFER will allocate its own register data, without
562 freeing the old data. */
564 void
565 re_set_registers (struct re_pattern_buffer *bufp,
566 struct re_registers *regs,
567 unsigned num_regs,
568 regoff_t *starts,
569 regoff_t *ends)
571 if (num_regs)
573 bufp->regs_allocated = REGS_REALLOCATE;
574 regs->num_regs = num_regs;
575 regs->start = starts;
576 regs->end = ends;
578 else
580 bufp->regs_allocated = REGS_UNALLOCATED;
581 regs->num_regs = 0;
582 regs->start = regs->end = (regoff_t *) 0;
585 #ifdef _LIBC
586 weak_alias (__re_set_registers, re_set_registers)
587 #endif
589 /* Entry points compatible with 4.2 BSD regex library. We don't define
590 them unless specifically requested. */
592 #if defined _REGEX_RE_COMP || defined _LIBC
594 # ifdef _LIBC
595 weak_function
596 # endif
597 re_exec (s)
598 const char *s;
600 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
602 #endif /* _REGEX_RE_COMP */
604 /* Internal entry point. */
606 /* Searches for a compiled pattern PREG in the string STRING, whose
607 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
608 mingings with regexec. START, and RANGE have the same meanings
609 with re_search.
610 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
611 otherwise return the error code.
612 Note: We assume front end functions already check ranges.
613 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
615 static reg_errcode_t
616 re_search_internal (const regex_t *preg,
617 const char *string,
618 int length, int start, int range, int stop,
619 size_t nmatch, regmatch_t pmatch[],
620 int eflags)
622 reg_errcode_t err;
623 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
624 int left_lim, right_lim, incr;
625 int fl_longest_match, match_first, match_kind, match_last = -1;
626 int extra_nmatch;
627 int sb, ch;
628 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
629 re_match_context_t mctx = { .dfa = dfa };
630 #else
631 re_match_context_t mctx;
632 #endif
633 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
634 && range && !preg->can_be_null) ? preg->fastmap : NULL;
635 RE_TRANSLATE_TYPE t = preg->translate;
637 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638 memset (&mctx, '\0', sizeof (re_match_context_t));
639 mctx.dfa = dfa;
640 #endif
642 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
643 nmatch -= extra_nmatch;
645 /* Check if the DFA haven't been compiled. */
646 if (BE (preg->used == 0 || dfa->init_state == NULL
647 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
648 || dfa->init_state_begbuf == NULL, 0))
649 return REG_NOMATCH;
651 #ifdef DEBUG
652 /* We assume front-end functions already check them. */
653 assert (start + range >= 0 && start + range <= length);
654 #endif
656 /* If initial states with non-begbuf contexts have no elements,
657 the regex must be anchored. If preg->newline_anchor is set,
658 we'll never use init_state_nl, so do not check it. */
659 if (dfa->init_state->nodes.nelem == 0
660 && dfa->init_state_word->nodes.nelem == 0
661 && (dfa->init_state_nl->nodes.nelem == 0
662 || !preg->newline_anchor))
664 if (start != 0 && start + range != 0)
665 return REG_NOMATCH;
666 start = range = 0;
669 /* We must check the longest matching, if nmatch > 0. */
670 fl_longest_match = (nmatch != 0 || dfa->nbackref);
672 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
673 preg->translate, preg->syntax & RE_ICASE, dfa);
674 if (BE (err != REG_NOERROR, 0))
675 goto free_return;
676 mctx.input.stop = stop;
677 mctx.input.raw_stop = stop;
678 mctx.input.newline_anchor = preg->newline_anchor;
680 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
681 if (BE (err != REG_NOERROR, 0))
682 goto free_return;
684 /* We will log all the DFA states through which the dfa pass,
685 if nmatch > 1, or this dfa has "multibyte node", which is a
686 back-reference or a node which can accept multibyte character or
687 multi character collating element. */
688 if (nmatch > 1 || dfa->has_mb_node)
690 /* Avoid overflow. */
691 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
693 err = REG_ESPACE;
694 goto free_return;
697 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
698 if (BE (mctx.state_log == NULL, 0))
700 err = REG_ESPACE;
701 goto free_return;
704 else
705 mctx.state_log = NULL;
707 match_first = start;
708 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
709 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
711 /* Check incrementally whether of not the input string match. */
712 incr = (range < 0) ? -1 : 1;
713 left_lim = (range < 0) ? start + range : start;
714 right_lim = (range < 0) ? start : start + range;
715 sb = dfa->mb_cur_max == 1;
716 match_kind =
717 (fastmap
718 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
719 | (range >= 0 ? 2 : 0)
720 | (t != NULL ? 1 : 0))
721 : 8);
723 for (;; match_first += incr)
725 err = REG_NOMATCH;
726 if (match_first < left_lim || right_lim < match_first)
727 goto free_return;
729 /* Advance as rapidly as possible through the string, until we
730 find a plausible place to start matching. This may be done
731 with varying efficiency, so there are various possibilities:
732 only the most common of them are specialized, in order to
733 save on code size. We use a switch statement for speed. */
734 switch (match_kind)
736 case 8:
737 /* No fastmap. */
738 break;
740 case 7:
741 /* Fastmap with single-byte translation, match forward. */
742 while (BE (match_first < right_lim, 1)
743 && !fastmap[t[(unsigned char) string[match_first]]])
744 ++match_first;
745 goto forward_match_found_start_or_reached_end;
747 case 6:
748 /* Fastmap without translation, match forward. */
749 while (BE (match_first < right_lim, 1)
750 && !fastmap[(unsigned char) string[match_first]])
751 ++match_first;
753 forward_match_found_start_or_reached_end:
754 if (BE (match_first == right_lim, 0))
756 ch = match_first >= length
757 ? 0 : (unsigned char) string[match_first];
758 if (!fastmap[t ? t[ch] : ch])
759 goto free_return;
761 break;
763 case 4:
764 case 5:
765 /* Fastmap without multi-byte translation, match backwards. */
766 while (match_first >= left_lim)
768 ch = match_first >= length
769 ? 0 : (unsigned char) string[match_first];
770 if (fastmap[t ? t[ch] : ch])
771 break;
772 --match_first;
774 if (match_first < left_lim)
775 goto free_return;
776 break;
778 default:
779 /* In this case, we can't determine easily the current byte,
780 since it might be a component byte of a multibyte
781 character. Then we use the constructed buffer instead. */
782 for (;;)
784 /* If MATCH_FIRST is out of the valid range, reconstruct the
785 buffers. */
786 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
787 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
789 err = re_string_reconstruct (&mctx.input, match_first,
790 eflags);
791 if (BE (err != REG_NOERROR, 0))
792 goto free_return;
794 offset = match_first - mctx.input.raw_mbs_idx;
796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 Note that MATCH_FIRST must not be smaller than 0. */
798 ch = (match_first >= length
799 ? 0 : re_string_byte_at (&mctx.input, offset));
800 if (fastmap[ch])
801 break;
802 match_first += incr;
803 if (match_first < left_lim || match_first > right_lim)
805 err = REG_NOMATCH;
806 goto free_return;
809 break;
812 /* Reconstruct the buffers so that the matcher can assume that
813 the matching starts from the beginning of the buffer. */
814 err = re_string_reconstruct (&mctx.input, match_first, eflags);
815 if (BE (err != REG_NOERROR, 0))
816 goto free_return;
818 #ifdef RE_ENABLE_I18N
819 /* Don't consider this char as a possible match start if it part,
820 yet isn't the head, of a multibyte character. */
821 if (!sb && !re_string_first_byte (&mctx.input, 0))
822 continue;
823 #endif
825 /* It seems to be appropriate one, then use the matcher. */
826 /* We assume that the matching starts from 0. */
827 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
828 match_last = check_matching (&mctx, fl_longest_match,
829 range >= 0 ? &match_first : NULL);
830 if (match_last != -1)
832 if (BE (match_last == -2, 0))
834 err = REG_ESPACE;
835 goto free_return;
837 else
839 mctx.match_last = match_last;
840 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
842 re_dfastate_t *pstate = mctx.state_log[match_last];
843 mctx.last_node = check_halt_state_context (&mctx, pstate,
844 match_last);
846 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
847 || dfa->nbackref)
849 err = prune_impossible_nodes (&mctx);
850 if (err == REG_NOERROR)
851 break;
852 if (BE (err != REG_NOMATCH, 0))
853 goto free_return;
854 match_last = -1;
856 else
857 break; /* We found a match. */
861 match_ctx_clean (&mctx);
864 #ifdef DEBUG
865 assert (match_last != -1);
866 assert (err == REG_NOERROR);
867 #endif
869 /* Set pmatch[] if we need. */
870 if (nmatch > 0)
872 int reg_idx;
874 /* Initialize registers. */
875 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
876 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
878 /* Set the points where matching start/end. */
879 pmatch[0].rm_so = 0;
880 pmatch[0].rm_eo = mctx.match_last;
882 if (!preg->no_sub && nmatch > 1)
884 err = set_regs (preg, &mctx, nmatch, pmatch,
885 dfa->has_plural_match && dfa->nbackref > 0);
886 if (BE (err != REG_NOERROR, 0))
887 goto free_return;
890 /* At last, add the offset to the each registers, since we slided
891 the buffers so that we could assume that the matching starts
892 from 0. */
893 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
894 if (pmatch[reg_idx].rm_so != -1)
896 #ifdef RE_ENABLE_I18N
897 if (BE (mctx.input.offsets_needed != 0, 0))
899 pmatch[reg_idx].rm_so =
900 (pmatch[reg_idx].rm_so == mctx.input.valid_len
901 ? mctx.input.valid_raw_len
902 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
903 pmatch[reg_idx].rm_eo =
904 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
905 ? mctx.input.valid_raw_len
906 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
908 #else
909 assert (mctx.input.offsets_needed == 0);
910 #endif
911 pmatch[reg_idx].rm_so += match_first;
912 pmatch[reg_idx].rm_eo += match_first;
914 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
916 pmatch[nmatch + reg_idx].rm_so = -1;
917 pmatch[nmatch + reg_idx].rm_eo = -1;
920 if (dfa->subexp_map)
921 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
922 if (dfa->subexp_map[reg_idx] != reg_idx)
924 pmatch[reg_idx + 1].rm_so
925 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
926 pmatch[reg_idx + 1].rm_eo
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
931 free_return:
932 re_free (mctx.state_log);
933 if (dfa->nbackref)
934 match_ctx_free (&mctx);
935 re_string_destruct (&mctx.input);
936 return err;
939 static reg_errcode_t
940 prune_impossible_nodes (re_match_context_t *mctx)
942 const re_dfa_t *const dfa = mctx->dfa;
943 int halt_node, match_last;
944 reg_errcode_t ret;
945 re_dfastate_t **sifted_states;
946 re_dfastate_t **lim_states = NULL;
947 re_sift_context_t sctx;
948 #ifdef DEBUG
949 assert (mctx->state_log != NULL);
950 #endif
951 match_last = mctx->match_last;
952 halt_node = mctx->last_node;
954 /* Avoid overflow. */
955 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
956 return REG_ESPACE;
958 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
959 if (BE (sifted_states == NULL, 0))
961 ret = REG_ESPACE;
962 goto free_return;
964 if (dfa->nbackref)
966 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
967 if (BE (lim_states == NULL, 0))
969 ret = REG_ESPACE;
970 goto free_return;
972 while (1)
974 memset (lim_states, '\0',
975 sizeof (re_dfastate_t *) * (match_last + 1));
976 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
977 match_last);
978 ret = sift_states_backward (mctx, &sctx);
979 re_node_set_free (&sctx.limits);
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 if (sifted_states[0] != NULL || lim_states[0] != NULL)
983 break;
986 --match_last;
987 if (match_last < 0)
989 ret = REG_NOMATCH;
990 goto free_return;
992 } while (mctx->state_log[match_last] == NULL
993 || !mctx->state_log[match_last]->halt);
994 halt_node = check_halt_state_context (mctx,
995 mctx->state_log[match_last],
996 match_last);
998 ret = merge_state_array (dfa, sifted_states, lim_states,
999 match_last + 1);
1000 re_free (lim_states);
1001 lim_states = NULL;
1002 if (BE (ret != REG_NOERROR, 0))
1003 goto free_return;
1005 else
1007 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1008 ret = sift_states_backward (mctx, &sctx);
1009 re_node_set_free (&sctx.limits);
1010 if (BE (ret != REG_NOERROR, 0))
1011 goto free_return;
1012 if (sifted_states[0] == NULL)
1014 ret = REG_NOMATCH;
1015 goto free_return;
1018 re_free (mctx->state_log);
1019 mctx->state_log = sifted_states;
1020 sifted_states = NULL;
1021 mctx->last_node = halt_node;
1022 mctx->match_last = match_last;
1023 ret = REG_NOERROR;
1024 free_return:
1025 re_free (sifted_states);
1026 re_free (lim_states);
1027 return ret;
1030 /* Acquire an initial state and return it.
1031 We must select appropriate initial state depending on the context,
1032 since initial states may have constraints like "\<", "^", etc.. */
1034 static inline re_dfastate_t *
1035 __attribute ((always_inline)) internal_function
1036 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1037 int idx)
1039 const re_dfa_t *const dfa = mctx->dfa;
1040 if (dfa->init_state->has_constraint)
1042 unsigned int context;
1043 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1044 if (IS_WORD_CONTEXT (context))
1045 return dfa->init_state_word;
1046 else if (IS_ORDINARY_CONTEXT (context))
1047 return dfa->init_state;
1048 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1049 return dfa->init_state_begbuf;
1050 else if (IS_NEWLINE_CONTEXT (context))
1051 return dfa->init_state_nl;
1052 else if (IS_BEGBUF_CONTEXT (context))
1054 /* It is relatively rare case, then calculate on demand. */
1055 return re_acquire_state_context (err, dfa,
1056 dfa->init_state->entrance_nodes,
1057 context);
1059 else
1060 /* Must not happen? */
1061 return dfa->init_state;
1063 else
1064 return dfa->init_state;
1067 /* Check whether the regular expression match input string INPUT or not,
1068 and return the index where the matching end, return -1 if not match,
1069 or return -2 in case of an error.
1070 FL_LONGEST_MATCH means we want the POSIX longest matching.
1071 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1072 next place where we may want to try matching.
1073 Note that the matcher assume that the matching starts from the current
1074 index of the buffer. */
1076 static int
1077 internal_function
1078 check_matching (re_match_context_t *mctx, int fl_longest_match,
1079 int *p_match_first)
1081 const re_dfa_t *const dfa = mctx->dfa;
1082 reg_errcode_t err;
1083 int match = 0;
1084 int match_last = -1;
1085 int cur_str_idx = re_string_cur_idx (&mctx->input);
1086 re_dfastate_t *cur_state;
1087 int at_init_state = p_match_first != NULL;
1088 int next_start_idx = cur_str_idx;
1090 err = REG_NOERROR;
1091 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1092 /* An initial state must not be NULL (invalid). */
1093 if (BE (cur_state == NULL, 0))
1095 assert (err == REG_ESPACE);
1096 return -2;
1099 if (mctx->state_log != NULL)
1101 mctx->state_log[cur_str_idx] = cur_state;
1103 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1104 later. E.g. Processing back references. */
1105 if (BE (dfa->nbackref, 0))
1107 at_init_state = 0;
1108 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1109 if (BE (err != REG_NOERROR, 0))
1110 return err;
1112 if (cur_state->has_backref)
1114 err = transit_state_bkref (mctx, &cur_state->nodes);
1115 if (BE (err != REG_NOERROR, 0))
1116 return err;
1121 /* If the RE accepts NULL string. */
1122 if (BE (cur_state->halt, 0))
1124 if (!cur_state->has_constraint
1125 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1127 if (!fl_longest_match)
1128 return cur_str_idx;
1129 else
1131 match_last = cur_str_idx;
1132 match = 1;
1137 while (!re_string_eoi (&mctx->input))
1139 re_dfastate_t *old_state = cur_state;
1140 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1142 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1143 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1144 && mctx->input.valid_len < mctx->input.len))
1146 err = extend_buffers (mctx);
1147 if (BE (err != REG_NOERROR, 0))
1149 assert (err == REG_ESPACE);
1150 return -2;
1154 cur_state = transit_state (&err, mctx, cur_state);
1155 if (mctx->state_log != NULL)
1156 cur_state = merge_state_with_log (&err, mctx, cur_state);
1158 if (cur_state == NULL)
1160 /* Reached the invalid state or an error. Try to recover a valid
1161 state using the state log, if available and if we have not
1162 already found a valid (even if not the longest) match. */
1163 if (BE (err != REG_NOERROR, 0))
1164 return -2;
1166 if (mctx->state_log == NULL
1167 || (match && !fl_longest_match)
1168 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1169 break;
1172 if (BE (at_init_state, 0))
1174 if (old_state == cur_state)
1175 next_start_idx = next_char_idx;
1176 else
1177 at_init_state = 0;
1180 if (cur_state->halt)
1182 /* Reached a halt state.
1183 Check the halt state can satisfy the current context. */
1184 if (!cur_state->has_constraint
1185 || check_halt_state_context (mctx, cur_state,
1186 re_string_cur_idx (&mctx->input)))
1188 /* We found an appropriate halt state. */
1189 match_last = re_string_cur_idx (&mctx->input);
1190 match = 1;
1192 /* We found a match, do not modify match_first below. */
1193 p_match_first = NULL;
1194 if (!fl_longest_match)
1195 break;
1200 if (p_match_first)
1201 *p_match_first += next_start_idx;
1203 return match_last;
1206 /* Check NODE match the current context. */
1208 static int
1209 internal_function
1210 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1212 re_token_type_t type = dfa->nodes[node].type;
1213 unsigned int constraint = dfa->nodes[node].constraint;
1214 if (type != END_OF_RE)
1215 return 0;
1216 if (!constraint)
1217 return 1;
1218 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1219 return 0;
1220 return 1;
1223 /* Check the halt state STATE match the current context.
1224 Return 0 if not match, if the node, STATE has, is a halt node and
1225 match the context, return the node. */
1227 static int
1228 internal_function
1229 check_halt_state_context (const re_match_context_t *mctx,
1230 const re_dfastate_t *state, int idx)
1232 int i;
1233 unsigned int context;
1234 #ifdef DEBUG
1235 assert (state->halt);
1236 #endif
1237 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1238 for (i = 0; i < state->nodes.nelem; ++i)
1239 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1240 return state->nodes.elems[i];
1241 return 0;
1244 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1245 corresponding to the DFA).
1246 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1247 of errors. */
1249 static int
1250 internal_function
1251 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1252 int *pidx, int node, re_node_set *eps_via_nodes,
1253 struct re_fail_stack_t *fs)
1255 const re_dfa_t *const dfa = mctx->dfa;
1256 int i, err;
1257 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1259 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1260 re_node_set *edests = &dfa->edests[node];
1261 int dest_node;
1262 err = re_node_set_insert (eps_via_nodes, node);
1263 if (BE (err < 0, 0))
1264 return -2;
1265 /* Pick up a valid destination, or return -1 if none is found. */
1266 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1268 int candidate = edests->elems[i];
1269 if (!re_node_set_contains (cur_nodes, candidate))
1270 continue;
1271 if (dest_node == -1)
1272 dest_node = candidate;
1274 else
1276 /* In order to avoid infinite loop like "(a*)*", return the second
1277 epsilon-transition if the first was already considered. */
1278 if (re_node_set_contains (eps_via_nodes, dest_node))
1279 return candidate;
1281 /* Otherwise, push the second epsilon-transition on the fail stack. */
1282 else if (fs != NULL
1283 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1284 eps_via_nodes))
1285 return -2;
1287 /* We know we are going to exit. */
1288 break;
1291 return dest_node;
1293 else
1295 int naccepted = 0;
1296 re_token_type_t type = dfa->nodes[node].type;
1298 #ifdef RE_ENABLE_I18N
1299 if (dfa->nodes[node].accept_mb)
1300 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1301 else
1302 #endif /* RE_ENABLE_I18N */
1303 if (type == OP_BACK_REF)
1305 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1306 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1307 if (fs != NULL)
1309 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1310 return -1;
1311 else if (naccepted)
1313 char *buf = (char *) re_string_get_buffer (&mctx->input);
1314 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1315 naccepted) != 0)
1316 return -1;
1320 if (naccepted == 0)
1322 int dest_node;
1323 err = re_node_set_insert (eps_via_nodes, node);
1324 if (BE (err < 0, 0))
1325 return -2;
1326 dest_node = dfa->edests[node].elems[0];
1327 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1328 dest_node))
1329 return dest_node;
1333 if (naccepted != 0
1334 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1336 int dest_node = dfa->nexts[node];
1337 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1338 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1339 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1340 dest_node)))
1341 return -1;
1342 re_node_set_empty (eps_via_nodes);
1343 return dest_node;
1346 return -1;
1349 static reg_errcode_t
1350 internal_function
1351 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1352 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1354 reg_errcode_t err;
1355 int num = fs->num++;
1356 if (fs->num == fs->alloc)
1358 struct re_fail_stack_ent_t *new_array;
1359 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1360 * fs->alloc * 2));
1361 if (new_array == NULL)
1362 return REG_ESPACE;
1363 fs->alloc *= 2;
1364 fs->stack = new_array;
1366 fs->stack[num].idx = str_idx;
1367 fs->stack[num].node = dest_node;
1368 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1369 if (fs->stack[num].regs == NULL)
1370 return REG_ESPACE;
1371 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1372 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1373 return err;
1376 static int
1377 internal_function
1378 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1379 regmatch_t *regs, re_node_set *eps_via_nodes)
1381 int num = --fs->num;
1382 assert (num >= 0);
1383 *pidx = fs->stack[num].idx;
1384 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1385 re_node_set_free (eps_via_nodes);
1386 re_free (fs->stack[num].regs);
1387 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1388 return fs->stack[num].node;
1391 /* Set the positions where the subexpressions are starts/ends to registers
1392 PMATCH.
1393 Note: We assume that pmatch[0] is already set, and
1394 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1396 static reg_errcode_t
1397 internal_function
1398 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1399 regmatch_t *pmatch, int fl_backtrack)
1401 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1402 int idx, cur_node;
1403 re_node_set eps_via_nodes;
1404 struct re_fail_stack_t *fs;
1405 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1406 regmatch_t *prev_idx_match;
1407 int prev_idx_match_malloced = 0;
1409 #ifdef DEBUG
1410 assert (nmatch > 1);
1411 assert (mctx->state_log != NULL);
1412 #endif
1413 if (fl_backtrack)
1415 fs = &fs_body;
1416 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1417 if (fs->stack == NULL)
1418 return REG_ESPACE;
1420 else
1421 fs = NULL;
1423 cur_node = dfa->init_node;
1424 re_node_set_init_empty (&eps_via_nodes);
1426 #ifdef HAVE_ALLOCA
1427 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1428 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1429 else
1430 #endif
1432 prev_idx_match = re_malloc (regmatch_t, nmatch);
1433 if (prev_idx_match == NULL)
1435 free_fail_stack_return (fs);
1436 return REG_ESPACE;
1438 prev_idx_match_malloced = 1;
1440 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1442 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1444 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1446 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1448 int reg_idx;
1449 if (fs)
1451 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1452 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1453 break;
1454 if (reg_idx == nmatch)
1456 re_node_set_free (&eps_via_nodes);
1457 if (prev_idx_match_malloced)
1458 re_free (prev_idx_match);
1459 return free_fail_stack_return (fs);
1461 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1462 &eps_via_nodes);
1464 else
1466 re_node_set_free (&eps_via_nodes);
1467 if (prev_idx_match_malloced)
1468 re_free (prev_idx_match);
1469 return REG_NOERROR;
1473 /* Proceed to next node. */
1474 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1475 &eps_via_nodes, fs);
1477 if (BE (cur_node < 0, 0))
1479 if (BE (cur_node == -2, 0))
1481 re_node_set_free (&eps_via_nodes);
1482 if (prev_idx_match_malloced)
1483 re_free (prev_idx_match);
1484 free_fail_stack_return (fs);
1485 return REG_ESPACE;
1487 if (fs)
1488 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1489 &eps_via_nodes);
1490 else
1492 re_node_set_free (&eps_via_nodes);
1493 if (prev_idx_match_malloced)
1494 re_free (prev_idx_match);
1495 return REG_NOMATCH;
1499 re_node_set_free (&eps_via_nodes);
1500 if (prev_idx_match_malloced)
1501 re_free (prev_idx_match);
1502 return free_fail_stack_return (fs);
1505 static reg_errcode_t
1506 internal_function
1507 free_fail_stack_return (struct re_fail_stack_t *fs)
1509 if (fs)
1511 int fs_idx;
1512 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1514 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1515 re_free (fs->stack[fs_idx].regs);
1517 re_free (fs->stack);
1519 return REG_NOERROR;
1522 static void
1523 internal_function
1524 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1525 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1527 int type = dfa->nodes[cur_node].type;
1528 if (type == OP_OPEN_SUBEXP)
1530 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1532 /* We are at the first node of this sub expression. */
1533 if (reg_num < nmatch)
1535 pmatch[reg_num].rm_so = cur_idx;
1536 pmatch[reg_num].rm_eo = -1;
1539 else if (type == OP_CLOSE_SUBEXP)
1541 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542 if (reg_num < nmatch)
1544 /* We are at the last node of this sub expression. */
1545 if (pmatch[reg_num].rm_so < cur_idx)
1547 pmatch[reg_num].rm_eo = cur_idx;
1548 /* This is a non-empty match or we are not inside an optional
1549 subexpression. Accept this right away. */
1550 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1552 else
1554 if (dfa->nodes[cur_node].opt_subexp
1555 && prev_idx_match[reg_num].rm_so != -1)
1556 /* We transited through an empty match for an optional
1557 subexpression, like (a?)*, and this is not the subexp's
1558 first match. Copy back the old content of the registers
1559 so that matches of an inner subexpression are undone as
1560 well, like in ((a?))*. */
1561 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1562 else
1563 /* We completed a subexpression, but it may be part of
1564 an optional one, so do not update PREV_IDX_MATCH. */
1565 pmatch[reg_num].rm_eo = cur_idx;
1571 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1572 and sift the nodes in each states according to the following rules.
1573 Updated state_log will be wrote to STATE_LOG.
1575 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1576 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1577 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1578 the LAST_NODE, we throw away the node `a'.
1579 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1580 string `s' and transit to `b':
1581 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1582 away the node `a'.
1583 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1584 thrown away, we throw away the node `a'.
1585 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1586 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1587 node `a'.
1588 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1589 we throw away the node `a'. */
1591 #define STATE_NODE_CONTAINS(state,node) \
1592 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1594 static reg_errcode_t
1595 internal_function
1596 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1598 reg_errcode_t err;
1599 int null_cnt = 0;
1600 int str_idx = sctx->last_str_idx;
1601 re_node_set cur_dest;
1603 #ifdef DEBUG
1604 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1605 #endif
1607 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1608 transit to the last_node and the last_node itself. */
1609 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1610 if (BE (err != REG_NOERROR, 0))
1611 return err;
1612 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613 if (BE (err != REG_NOERROR, 0))
1614 goto free_return;
1616 /* Then check each states in the state_log. */
1617 while (str_idx > 0)
1619 /* Update counters. */
1620 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1621 if (null_cnt > mctx->max_mb_elem_len)
1623 memset (sctx->sifted_states, '\0',
1624 sizeof (re_dfastate_t *) * str_idx);
1625 re_node_set_free (&cur_dest);
1626 return REG_NOERROR;
1628 re_node_set_empty (&cur_dest);
1629 --str_idx;
1631 if (mctx->state_log[str_idx])
1633 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1638 /* Add all the nodes which satisfy the following conditions:
1639 - It can epsilon transit to a node in CUR_DEST.
1640 - It is in CUR_SRC.
1641 And update state_log. */
1642 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1643 if (BE (err != REG_NOERROR, 0))
1644 goto free_return;
1646 err = REG_NOERROR;
1647 free_return:
1648 re_node_set_free (&cur_dest);
1649 return err;
1652 static reg_errcode_t
1653 internal_function
1654 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1655 int str_idx, re_node_set *cur_dest)
1657 const re_dfa_t *const dfa = mctx->dfa;
1658 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1659 int i;
1661 /* Then build the next sifted state.
1662 We build the next sifted state on `cur_dest', and update
1663 `sifted_states[str_idx]' with `cur_dest'.
1664 Note:
1665 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1666 `cur_src' points the node_set of the old `state_log[str_idx]'
1667 (with the epsilon nodes pre-filtered out). */
1668 for (i = 0; i < cur_src->nelem; i++)
1670 int prev_node = cur_src->elems[i];
1671 int naccepted = 0;
1672 int ret;
1674 #ifdef DEBUG
1675 re_token_type_t type = dfa->nodes[prev_node].type;
1676 assert (!IS_EPSILON_NODE (type));
1677 #endif
1678 #ifdef RE_ENABLE_I18N
1679 /* If the node may accept `multi byte'. */
1680 if (dfa->nodes[prev_node].accept_mb)
1681 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1682 str_idx, sctx->last_str_idx);
1683 #endif /* RE_ENABLE_I18N */
1685 /* We don't check backreferences here.
1686 See update_cur_sifted_state(). */
1687 if (!naccepted
1688 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1689 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1690 dfa->nexts[prev_node]))
1691 naccepted = 1;
1693 if (naccepted == 0)
1694 continue;
1696 if (sctx->limits.nelem)
1698 int to_idx = str_idx + naccepted;
1699 if (check_dst_limits (mctx, &sctx->limits,
1700 dfa->nexts[prev_node], to_idx,
1701 prev_node, str_idx))
1702 continue;
1704 ret = re_node_set_insert (cur_dest, prev_node);
1705 if (BE (ret == -1, 0))
1706 return REG_ESPACE;
1709 return REG_NOERROR;
1712 /* Helper functions. */
1714 static reg_errcode_t
1715 internal_function
1716 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1718 int top = mctx->state_log_top;
1720 if (next_state_log_idx >= mctx->input.bufs_len
1721 || (next_state_log_idx >= mctx->input.valid_len
1722 && mctx->input.valid_len < mctx->input.len))
1724 reg_errcode_t err;
1725 err = extend_buffers (mctx);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1730 if (top < next_state_log_idx)
1732 memset (mctx->state_log + top + 1, '\0',
1733 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1734 mctx->state_log_top = next_state_log_idx;
1736 return REG_NOERROR;
1739 static reg_errcode_t
1740 internal_function
1741 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1742 re_dfastate_t **src, int num)
1744 int st_idx;
1745 reg_errcode_t err;
1746 for (st_idx = 0; st_idx < num; ++st_idx)
1748 if (dst[st_idx] == NULL)
1749 dst[st_idx] = src[st_idx];
1750 else if (src[st_idx] != NULL)
1752 re_node_set merged_set;
1753 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1754 &src[st_idx]->nodes);
1755 if (BE (err != REG_NOERROR, 0))
1756 return err;
1757 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1758 re_node_set_free (&merged_set);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1763 return REG_NOERROR;
1766 static reg_errcode_t
1767 internal_function
1768 update_cur_sifted_state (const re_match_context_t *mctx,
1769 re_sift_context_t *sctx, int str_idx,
1770 re_node_set *dest_nodes)
1772 const re_dfa_t *const dfa = mctx->dfa;
1773 reg_errcode_t err = REG_NOERROR;
1774 const re_node_set *candidates;
1775 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1776 : &mctx->state_log[str_idx]->nodes);
1778 if (dest_nodes->nelem == 0)
1779 sctx->sifted_states[str_idx] = NULL;
1780 else
1782 if (candidates)
1784 /* At first, add the nodes which can epsilon transit to a node in
1785 DEST_NODE. */
1786 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1790 /* Then, check the limitations in the current sift_context. */
1791 if (sctx->limits.nelem)
1793 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1794 mctx->bkref_ents, str_idx);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1800 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1801 if (BE (err != REG_NOERROR, 0))
1802 return err;
1805 if (candidates && mctx->state_log[str_idx]->has_backref)
1807 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1808 if (BE (err != REG_NOERROR, 0))
1809 return err;
1811 return REG_NOERROR;
1814 static reg_errcode_t
1815 internal_function
1816 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1817 const re_node_set *candidates)
1819 reg_errcode_t err = REG_NOERROR;
1820 int i;
1822 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1823 if (BE (err != REG_NOERROR, 0))
1824 return err;
1826 if (!state->inveclosure.alloc)
1828 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1829 if (BE (err != REG_NOERROR, 0))
1830 return REG_ESPACE;
1831 for (i = 0; i < dest_nodes->nelem; i++)
1833 err = re_node_set_merge (&state->inveclosure,
1834 dfa->inveclosures + dest_nodes->elems[i]);
1835 if (BE (err != REG_NOERROR, 0))
1836 return REG_ESPACE;
1839 return re_node_set_add_intersect (dest_nodes, candidates,
1840 &state->inveclosure);
1843 static reg_errcode_t
1844 internal_function
1845 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846 const re_node_set *candidates)
1848 int ecl_idx;
1849 reg_errcode_t err;
1850 re_node_set *inv_eclosure = dfa->inveclosures + node;
1851 re_node_set except_nodes;
1852 re_node_set_init_empty (&except_nodes);
1853 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1855 int cur_node = inv_eclosure->elems[ecl_idx];
1856 if (cur_node == node)
1857 continue;
1858 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1860 int edst1 = dfa->edests[cur_node].elems[0];
1861 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862 ? dfa->edests[cur_node].elems[1] : -1);
1863 if ((!re_node_set_contains (inv_eclosure, edst1)
1864 && re_node_set_contains (dest_nodes, edst1))
1865 || (edst2 > 0
1866 && !re_node_set_contains (inv_eclosure, edst2)
1867 && re_node_set_contains (dest_nodes, edst2)))
1869 err = re_node_set_add_intersect (&except_nodes, candidates,
1870 dfa->inveclosures + cur_node);
1871 if (BE (err != REG_NOERROR, 0))
1873 re_node_set_free (&except_nodes);
1874 return err;
1879 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881 int cur_node = inv_eclosure->elems[ecl_idx];
1882 if (!re_node_set_contains (&except_nodes, cur_node))
1884 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885 re_node_set_remove_at (dest_nodes, idx);
1888 re_node_set_free (&except_nodes);
1889 return REG_NOERROR;
1892 static int
1893 internal_function
1894 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895 int dst_node, int dst_idx, int src_node, int src_idx)
1897 const re_dfa_t *const dfa = mctx->dfa;
1898 int lim_idx, src_pos, dst_pos;
1900 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1904 int subexp_idx;
1905 struct re_backref_cache_entry *ent;
1906 ent = mctx->bkref_ents + limits->elems[lim_idx];
1907 subexp_idx = dfa->nodes[ent->node].opr.idx;
1909 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910 subexp_idx, dst_node, dst_idx,
1911 dst_bkref_idx);
1912 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913 subexp_idx, src_node, src_idx,
1914 src_bkref_idx);
1916 /* In case of:
1917 <src> <dst> ( <subexp> )
1918 ( <subexp> ) <src> <dst>
1919 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1920 if (src_pos == dst_pos)
1921 continue; /* This is unrelated limitation. */
1922 else
1923 return 1;
1925 return 0;
1928 static int
1929 internal_function
1930 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931 int subexp_idx, int from_node, int bkref_idx)
1933 const re_dfa_t *const dfa = mctx->dfa;
1934 const re_node_set *eclosures = dfa->eclosures + from_node;
1935 int node_idx;
1937 /* Else, we are on the boundary: examine the nodes on the epsilon
1938 closure. */
1939 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1941 int node = eclosures->elems[node_idx];
1942 switch (dfa->nodes[node].type)
1944 case OP_BACK_REF:
1945 if (bkref_idx != -1)
1947 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1950 int dst, cpos;
1952 if (ent->node != node)
1953 continue;
1955 if (subexp_idx < BITSET_WORD_BITS
1956 && !(ent->eps_reachable_subexps_map
1957 & ((bitset_word_t) 1 << subexp_idx)))
1958 continue;
1960 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 OP_CLOSE_SUBEXP cases below. But, if the
1962 destination node is the same node as the source
1963 node, don't recurse because it would cause an
1964 infinite loop: a regex that exhibits this behavior
1965 is ()\1*\1* */
1966 dst = dfa->edests[node].elems[0];
1967 if (dst == from_node)
1969 if (boundaries & 1)
1970 return -1;
1971 else /* if (boundaries & 2) */
1972 return 0;
1975 cpos =
1976 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977 dst, bkref_idx);
1978 if (cpos == -1 /* && (boundaries & 1) */)
1979 return -1;
1980 if (cpos == 0 && (boundaries & 2))
1981 return 0;
1983 if (subexp_idx < BITSET_WORD_BITS)
1984 ent->eps_reachable_subexps_map
1985 &= ~((bitset_word_t) 1 << subexp_idx);
1987 while (ent++->more);
1989 break;
1991 case OP_OPEN_SUBEXP:
1992 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993 return -1;
1994 break;
1996 case OP_CLOSE_SUBEXP:
1997 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998 return 0;
1999 break;
2001 default:
2002 break;
2006 return (boundaries & 2) ? 1 : 0;
2009 static int
2010 internal_function
2011 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012 int subexp_idx, int from_node, int str_idx,
2013 int bkref_idx)
2015 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016 int boundaries;
2018 /* If we are outside the range of the subexpression, return -1 or 1. */
2019 if (str_idx < lim->subexp_from)
2020 return -1;
2022 if (lim->subexp_to < str_idx)
2023 return 1;
2025 /* If we are within the subexpression, return 0. */
2026 boundaries = (str_idx == lim->subexp_from);
2027 boundaries |= (str_idx == lim->subexp_to) << 1;
2028 if (boundaries == 0)
2029 return 0;
2031 /* Else, examine epsilon closure. */
2032 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033 from_node, bkref_idx);
2036 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2037 which are against limitations from DEST_NODES. */
2039 static reg_errcode_t
2040 internal_function
2041 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042 const re_node_set *candidates, re_node_set *limits,
2043 struct re_backref_cache_entry *bkref_ents, int str_idx)
2045 reg_errcode_t err;
2046 int node_idx, lim_idx;
2048 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2050 int subexp_idx;
2051 struct re_backref_cache_entry *ent;
2052 ent = bkref_ents + limits->elems[lim_idx];
2054 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055 continue; /* This is unrelated limitation. */
2057 subexp_idx = dfa->nodes[ent->node].opr.idx;
2058 if (ent->subexp_to == str_idx)
2060 int ops_node = -1;
2061 int cls_node = -1;
2062 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064 int node = dest_nodes->elems[node_idx];
2065 re_token_type_t type = dfa->nodes[node].type;
2066 if (type == OP_OPEN_SUBEXP
2067 && subexp_idx == dfa->nodes[node].opr.idx)
2068 ops_node = node;
2069 else if (type == OP_CLOSE_SUBEXP
2070 && subexp_idx == dfa->nodes[node].opr.idx)
2071 cls_node = node;
2074 /* Check the limitation of the open subexpression. */
2075 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2076 if (ops_node >= 0)
2078 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079 candidates);
2080 if (BE (err != REG_NOERROR, 0))
2081 return err;
2084 /* Check the limitation of the close subexpression. */
2085 if (cls_node >= 0)
2086 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2088 int node = dest_nodes->elems[node_idx];
2089 if (!re_node_set_contains (dfa->inveclosures + node,
2090 cls_node)
2091 && !re_node_set_contains (dfa->eclosures + node,
2092 cls_node))
2094 /* It is against this limitation.
2095 Remove it form the current sifted state. */
2096 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097 candidates);
2098 if (BE (err != REG_NOERROR, 0))
2099 return err;
2100 --node_idx;
2104 else /* (ent->subexp_to != str_idx) */
2106 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2108 int node = dest_nodes->elems[node_idx];
2109 re_token_type_t type = dfa->nodes[node].type;
2110 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2112 if (subexp_idx != dfa->nodes[node].opr.idx)
2113 continue;
2114 /* It is against this limitation.
2115 Remove it form the current sifted state. */
2116 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117 candidates);
2118 if (BE (err != REG_NOERROR, 0))
2119 return err;
2124 return REG_NOERROR;
2127 static reg_errcode_t
2128 internal_function
2129 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130 int str_idx, const re_node_set *candidates)
2132 const re_dfa_t *const dfa = mctx->dfa;
2133 reg_errcode_t err;
2134 int node_idx, node;
2135 re_sift_context_t local_sctx;
2136 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2138 if (first_idx == -1)
2139 return REG_NOERROR;
2141 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2143 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2145 int enabled_idx;
2146 re_token_type_t type;
2147 struct re_backref_cache_entry *entry;
2148 node = candidates->elems[node_idx];
2149 type = dfa->nodes[node].type;
2150 /* Avoid infinite loop for the REs like "()\1+". */
2151 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152 continue;
2153 if (type != OP_BACK_REF)
2154 continue;
2156 entry = mctx->bkref_ents + first_idx;
2157 enabled_idx = first_idx;
2160 int subexp_len;
2161 int to_idx;
2162 int dst_node;
2163 int ret;
2164 re_dfastate_t *cur_state;
2166 if (entry->node != node)
2167 continue;
2168 subexp_len = entry->subexp_to - entry->subexp_from;
2169 to_idx = str_idx + subexp_len;
2170 dst_node = (subexp_len ? dfa->nexts[node]
2171 : dfa->edests[node].elems[0]);
2173 if (to_idx > sctx->last_str_idx
2174 || sctx->sifted_states[to_idx] == NULL
2175 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176 || check_dst_limits (mctx, &sctx->limits, node,
2177 str_idx, dst_node, to_idx))
2178 continue;
2180 if (local_sctx.sifted_states == NULL)
2182 local_sctx = *sctx;
2183 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2187 local_sctx.last_node = node;
2188 local_sctx.last_str_idx = str_idx;
2189 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190 if (BE (ret < 0, 0))
2192 err = REG_ESPACE;
2193 goto free_return;
2195 cur_state = local_sctx.sifted_states[str_idx];
2196 err = sift_states_backward (mctx, &local_sctx);
2197 if (BE (err != REG_NOERROR, 0))
2198 goto free_return;
2199 if (sctx->limited_states != NULL)
2201 err = merge_state_array (dfa, sctx->limited_states,
2202 local_sctx.sifted_states,
2203 str_idx + 1);
2204 if (BE (err != REG_NOERROR, 0))
2205 goto free_return;
2207 local_sctx.sifted_states[str_idx] = cur_state;
2208 re_node_set_remove (&local_sctx.limits, enabled_idx);
2210 /* mctx->bkref_ents may have changed, reload the pointer. */
2211 entry = mctx->bkref_ents + enabled_idx;
2213 while (enabled_idx++, entry++->more);
2215 err = REG_NOERROR;
2216 free_return:
2217 if (local_sctx.sifted_states != NULL)
2219 re_node_set_free (&local_sctx.limits);
2222 return err;
2226 #ifdef RE_ENABLE_I18N
2227 static int
2228 internal_function
2229 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230 int node_idx, int str_idx, int max_str_idx)
2232 const re_dfa_t *const dfa = mctx->dfa;
2233 int naccepted;
2234 /* Check the node can accept `multi byte'. */
2235 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238 dfa->nexts[node_idx]))
2239 /* The node can't accept the `multi byte', or the
2240 destination was already thrown away, then the node
2241 couldn't accept the current input `multi byte'. */
2242 naccepted = 0;
2243 /* Otherwise, it is sure that the node could accept
2244 `naccepted' bytes input. */
2245 return naccepted;
2247 #endif /* RE_ENABLE_I18N */
2250 /* Functions for state transition. */
2252 /* Return the next state to which the current state STATE will transit by
2253 accepting the current input byte, and update STATE_LOG if necessary.
2254 If STATE can accept a multibyte char/collating element/back reference
2255 update the destination of STATE_LOG. */
2257 static re_dfastate_t *
2258 internal_function
2259 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260 re_dfastate_t *state)
2262 re_dfastate_t **trtable;
2263 unsigned char ch;
2265 #ifdef RE_ENABLE_I18N
2266 /* If the current state can accept multibyte. */
2267 if (BE (state->accept_mb, 0))
2269 *err = transit_state_mb (mctx, state);
2270 if (BE (*err != REG_NOERROR, 0))
2271 return NULL;
2273 #endif /* RE_ENABLE_I18N */
2275 /* Then decide the next state with the single byte. */
2276 #if 0
2277 if (0)
2278 /* don't use transition table */
2279 return transit_state_sb (err, mctx, state);
2280 #endif
2282 /* Use transition table */
2283 ch = re_string_fetch_byte (&mctx->input);
2284 for (;;)
2286 trtable = state->trtable;
2287 if (BE (trtable != NULL, 1))
2288 return trtable[ch];
2290 trtable = state->word_trtable;
2291 if (BE (trtable != NULL, 1))
2293 unsigned int context;
2294 context
2295 = re_string_context_at (&mctx->input,
2296 re_string_cur_idx (&mctx->input) - 1,
2297 mctx->eflags);
2298 if (IS_WORD_CONTEXT (context))
2299 return trtable[ch + SBC_MAX];
2300 else
2301 return trtable[ch];
2304 if (!build_trtable (mctx->dfa, state))
2306 *err = REG_ESPACE;
2307 return NULL;
2310 /* Retry, we now have a transition table. */
2314 /* Update the state_log if we need */
2315 static re_dfastate_t *
2316 internal_function
2317 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318 re_dfastate_t *next_state)
2320 const re_dfa_t *const dfa = mctx->dfa;
2321 int cur_idx = re_string_cur_idx (&mctx->input);
2323 if (cur_idx > mctx->state_log_top)
2325 mctx->state_log[cur_idx] = next_state;
2326 mctx->state_log_top = cur_idx;
2328 else if (mctx->state_log[cur_idx] == NULL)
2330 mctx->state_log[cur_idx] = next_state;
2332 else
2334 re_dfastate_t *pstate;
2335 unsigned int context;
2336 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338 the destination of a multibyte char/collating element/
2339 back reference. Then the next state is the union set of
2340 these destinations and the results of the transition table. */
2341 pstate = mctx->state_log[cur_idx];
2342 log_nodes = pstate->entrance_nodes;
2343 if (next_state != NULL)
2345 table_nodes = next_state->entrance_nodes;
2346 *err = re_node_set_init_union (&next_nodes, table_nodes,
2347 log_nodes);
2348 if (BE (*err != REG_NOERROR, 0))
2349 return NULL;
2351 else
2352 next_nodes = *log_nodes;
2353 /* Note: We already add the nodes of the initial state,
2354 then we don't need to add them here. */
2356 context = re_string_context_at (&mctx->input,
2357 re_string_cur_idx (&mctx->input) - 1,
2358 mctx->eflags);
2359 next_state = mctx->state_log[cur_idx]
2360 = re_acquire_state_context (err, dfa, &next_nodes, context);
2361 /* We don't need to check errors here, since the return value of
2362 this function is next_state and ERR is already set. */
2364 if (table_nodes != NULL)
2365 re_node_set_free (&next_nodes);
2368 if (BE (dfa->nbackref, 0) && next_state != NULL)
2370 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 later. We must check them here, since the back references in the
2372 next state might use them. */
2373 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374 cur_idx);
2375 if (BE (*err != REG_NOERROR, 0))
2376 return NULL;
2378 /* If the next state has back references. */
2379 if (next_state->has_backref)
2381 *err = transit_state_bkref (mctx, &next_state->nodes);
2382 if (BE (*err != REG_NOERROR, 0))
2383 return NULL;
2384 next_state = mctx->state_log[cur_idx];
2388 return next_state;
2391 /* Skip bytes in the input that correspond to part of a
2392 multi-byte match, then look in the log for a state
2393 from which to restart matching. */
2394 static re_dfastate_t *
2395 internal_function
2396 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2398 re_dfastate_t *cur_state;
2401 int max = mctx->state_log_top;
2402 int cur_str_idx = re_string_cur_idx (&mctx->input);
2406 if (++cur_str_idx > max)
2407 return NULL;
2408 re_string_skip_bytes (&mctx->input, 1);
2410 while (mctx->state_log[cur_str_idx] == NULL);
2412 cur_state = merge_state_with_log (err, mctx, NULL);
2414 while (*err == REG_NOERROR && cur_state == NULL);
2415 return cur_state;
2418 /* Helper functions for transit_state. */
2420 /* From the node set CUR_NODES, pick up the nodes whose types are
2421 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422 expression. And register them to use them later for evaluating the
2423 correspoding back references. */
2425 static reg_errcode_t
2426 internal_function
2427 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428 int str_idx)
2430 const re_dfa_t *const dfa = mctx->dfa;
2431 int node_idx;
2432 reg_errcode_t err;
2434 /* TODO: This isn't efficient.
2435 Because there might be more than one nodes whose types are
2436 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437 nodes.
2438 E.g. RE: (a){2} */
2439 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2441 int node = cur_nodes->elems[node_idx];
2442 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444 && (dfa->used_bkref_map
2445 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2447 err = match_ctx_add_subtop (mctx, node, str_idx);
2448 if (BE (err != REG_NOERROR, 0))
2449 return err;
2452 return REG_NOERROR;
2455 #if 0
2456 /* Return the next state to which the current state STATE will transit by
2457 accepting the current input byte. */
2459 static re_dfastate_t *
2460 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461 re_dfastate_t *state)
2463 const re_dfa_t *const dfa = mctx->dfa;
2464 re_node_set next_nodes;
2465 re_dfastate_t *next_state;
2466 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467 unsigned int context;
2469 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470 if (BE (*err != REG_NOERROR, 0))
2471 return NULL;
2472 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2474 int cur_node = state->nodes.elems[node_cnt];
2475 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2477 *err = re_node_set_merge (&next_nodes,
2478 dfa->eclosures + dfa->nexts[cur_node]);
2479 if (BE (*err != REG_NOERROR, 0))
2481 re_node_set_free (&next_nodes);
2482 return NULL;
2486 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488 /* We don't need to check errors here, since the return value of
2489 this function is next_state and ERR is already set. */
2491 re_node_set_free (&next_nodes);
2492 re_string_skip_bytes (&mctx->input, 1);
2493 return next_state;
2495 #endif
2497 #ifdef RE_ENABLE_I18N
2498 static reg_errcode_t
2499 internal_function
2500 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2502 const re_dfa_t *const dfa = mctx->dfa;
2503 reg_errcode_t err;
2504 int i;
2506 for (i = 0; i < pstate->nodes.nelem; ++i)
2508 re_node_set dest_nodes, *new_nodes;
2509 int cur_node_idx = pstate->nodes.elems[i];
2510 int naccepted, dest_idx;
2511 unsigned int context;
2512 re_dfastate_t *dest_state;
2514 if (!dfa->nodes[cur_node_idx].accept_mb)
2515 continue;
2517 if (dfa->nodes[cur_node_idx].constraint)
2519 context = re_string_context_at (&mctx->input,
2520 re_string_cur_idx (&mctx->input),
2521 mctx->eflags);
2522 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523 context))
2524 continue;
2527 /* How many bytes the node can accept? */
2528 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529 re_string_cur_idx (&mctx->input));
2530 if (naccepted == 0)
2531 continue;
2533 /* The node can accepts `naccepted' bytes. */
2534 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536 : mctx->max_mb_elem_len);
2537 err = clean_state_log_if_needed (mctx, dest_idx);
2538 if (BE (err != REG_NOERROR, 0))
2539 return err;
2540 #ifdef DEBUG
2541 assert (dfa->nexts[cur_node_idx] != -1);
2542 #endif
2543 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2545 dest_state = mctx->state_log[dest_idx];
2546 if (dest_state == NULL)
2547 dest_nodes = *new_nodes;
2548 else
2550 err = re_node_set_init_union (&dest_nodes,
2551 dest_state->entrance_nodes, new_nodes);
2552 if (BE (err != REG_NOERROR, 0))
2553 return err;
2555 context = re_string_context_at (&mctx->input, dest_idx - 1,
2556 mctx->eflags);
2557 mctx->state_log[dest_idx]
2558 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559 if (dest_state != NULL)
2560 re_node_set_free (&dest_nodes);
2561 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562 return err;
2564 return REG_NOERROR;
2566 #endif /* RE_ENABLE_I18N */
2568 static reg_errcode_t
2569 internal_function
2570 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2572 const re_dfa_t *const dfa = mctx->dfa;
2573 reg_errcode_t err;
2574 int i;
2575 int cur_str_idx = re_string_cur_idx (&mctx->input);
2577 for (i = 0; i < nodes->nelem; ++i)
2579 int dest_str_idx, prev_nelem, bkc_idx;
2580 int node_idx = nodes->elems[i];
2581 unsigned int context;
2582 const re_token_t *node = dfa->nodes + node_idx;
2583 re_node_set *new_dest_nodes;
2585 /* Check whether `node' is a backreference or not. */
2586 if (node->type != OP_BACK_REF)
2587 continue;
2589 if (node->constraint)
2591 context = re_string_context_at (&mctx->input, cur_str_idx,
2592 mctx->eflags);
2593 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594 continue;
2597 /* `node' is a backreference.
2598 Check the substring which the substring matched. */
2599 bkc_idx = mctx->nbkref_ents;
2600 err = get_subexp (mctx, node_idx, cur_str_idx);
2601 if (BE (err != REG_NOERROR, 0))
2602 goto free_return;
2604 /* And add the epsilon closures (which is `new_dest_nodes') of
2605 the backreference to appropriate state_log. */
2606 #ifdef DEBUG
2607 assert (dfa->nexts[node_idx] != -1);
2608 #endif
2609 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2611 int subexp_len;
2612 re_dfastate_t *dest_state;
2613 struct re_backref_cache_entry *bkref_ent;
2614 bkref_ent = mctx->bkref_ents + bkc_idx;
2615 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616 continue;
2617 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618 new_dest_nodes = (subexp_len == 0
2619 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620 : dfa->eclosures + dfa->nexts[node_idx]);
2621 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622 - bkref_ent->subexp_from);
2623 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624 mctx->eflags);
2625 dest_state = mctx->state_log[dest_str_idx];
2626 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627 : mctx->state_log[cur_str_idx]->nodes.nelem);
2628 /* Add `new_dest_node' to state_log. */
2629 if (dest_state == NULL)
2631 mctx->state_log[dest_str_idx]
2632 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2633 context);
2634 if (BE (mctx->state_log[dest_str_idx] == NULL
2635 && err != REG_NOERROR, 0))
2636 goto free_return;
2638 else
2640 re_node_set dest_nodes;
2641 err = re_node_set_init_union (&dest_nodes,
2642 dest_state->entrance_nodes,
2643 new_dest_nodes);
2644 if (BE (err != REG_NOERROR, 0))
2646 re_node_set_free (&dest_nodes);
2647 goto free_return;
2649 mctx->state_log[dest_str_idx]
2650 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651 re_node_set_free (&dest_nodes);
2652 if (BE (mctx->state_log[dest_str_idx] == NULL
2653 && err != REG_NOERROR, 0))
2654 goto free_return;
2656 /* We need to check recursively if the backreference can epsilon
2657 transit. */
2658 if (subexp_len == 0
2659 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2661 err = check_subexp_matching_top (mctx, new_dest_nodes,
2662 cur_str_idx);
2663 if (BE (err != REG_NOERROR, 0))
2664 goto free_return;
2665 err = transit_state_bkref (mctx, new_dest_nodes);
2666 if (BE (err != REG_NOERROR, 0))
2667 goto free_return;
2671 err = REG_NOERROR;
2672 free_return:
2673 return err;
2676 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2677 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678 Note that we might collect inappropriate candidates here.
2679 However, the cost of checking them strictly here is too high, then we
2680 delay these checking for prune_impossible_nodes(). */
2682 static reg_errcode_t
2683 internal_function
2684 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2686 const re_dfa_t *const dfa = mctx->dfa;
2687 int subexp_num, sub_top_idx;
2688 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2690 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691 if (cache_idx != -1)
2693 const struct re_backref_cache_entry *entry
2694 = mctx->bkref_ents + cache_idx;
2696 if (entry->node == bkref_node)
2697 return REG_NOERROR; /* We already checked it. */
2698 while (entry++->more);
2701 subexp_num = dfa->nodes[bkref_node].opr.idx;
2703 /* For each sub expression */
2704 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2706 reg_errcode_t err;
2707 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708 re_sub_match_last_t *sub_last;
2709 int sub_last_idx, sl_str, bkref_str_off;
2711 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712 continue; /* It isn't related. */
2714 sl_str = sub_top->str_idx;
2715 bkref_str_off = bkref_str_idx;
2716 /* At first, check the last node of sub expressions we already
2717 evaluated. */
2718 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2720 int sl_str_diff;
2721 sub_last = sub_top->lasts[sub_last_idx];
2722 sl_str_diff = sub_last->str_idx - sl_str;
2723 /* The matched string by the sub expression match with the substring
2724 at the back reference? */
2725 if (sl_str_diff > 0)
2727 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2729 /* Not enough chars for a successful match. */
2730 if (bkref_str_off + sl_str_diff > mctx->input.len)
2731 break;
2733 err = clean_state_log_if_needed (mctx,
2734 bkref_str_off
2735 + sl_str_diff);
2736 if (BE (err != REG_NOERROR, 0))
2737 return err;
2738 buf = (const char *) re_string_get_buffer (&mctx->input);
2740 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741 /* We don't need to search this sub expression any more. */
2742 break;
2744 bkref_str_off += sl_str_diff;
2745 sl_str += sl_str_diff;
2746 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747 bkref_str_idx);
2749 /* Reload buf, since the preceding call might have reallocated
2750 the buffer. */
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2753 if (err == REG_NOMATCH)
2754 continue;
2755 if (BE (err != REG_NOERROR, 0))
2756 return err;
2759 if (sub_last_idx < sub_top->nlasts)
2760 continue;
2761 if (sub_last_idx > 0)
2762 ++sl_str;
2763 /* Then, search for the other last nodes of the sub expression. */
2764 for (; sl_str <= bkref_str_idx; ++sl_str)
2766 int cls_node, sl_str_off;
2767 const re_node_set *nodes;
2768 sl_str_off = sl_str - sub_top->str_idx;
2769 /* The matched string by the sub expression match with the substring
2770 at the back reference? */
2771 if (sl_str_off > 0)
2773 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2775 /* If we are at the end of the input, we cannot match. */
2776 if (bkref_str_off >= mctx->input.len)
2777 break;
2779 err = extend_buffers (mctx);
2780 if (BE (err != REG_NOERROR, 0))
2781 return err;
2783 buf = (const char *) re_string_get_buffer (&mctx->input);
2785 if (buf [bkref_str_off++] != buf[sl_str - 1])
2786 break; /* We don't need to search this sub expression
2787 any more. */
2789 if (mctx->state_log[sl_str] == NULL)
2790 continue;
2791 /* Does this state have a ')' of the sub expression? */
2792 nodes = &mctx->state_log[sl_str]->nodes;
2793 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794 OP_CLOSE_SUBEXP);
2795 if (cls_node == -1)
2796 continue; /* No. */
2797 if (sub_top->path == NULL)
2799 sub_top->path = calloc (sizeof (state_array_t),
2800 sl_str - sub_top->str_idx + 1);
2801 if (sub_top->path == NULL)
2802 return REG_ESPACE;
2804 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 in the current context? */
2806 err = check_arrival (mctx, sub_top->path, sub_top->node,
2807 sub_top->str_idx, cls_node, sl_str,
2808 OP_CLOSE_SUBEXP);
2809 if (err == REG_NOMATCH)
2810 continue;
2811 if (BE (err != REG_NOERROR, 0))
2812 return err;
2813 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814 if (BE (sub_last == NULL, 0))
2815 return REG_ESPACE;
2816 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817 bkref_str_idx);
2818 if (err == REG_NOMATCH)
2819 continue;
2822 return REG_NOERROR;
2825 /* Helper functions for get_subexp(). */
2827 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828 If it can arrive, register the sub expression expressed with SUB_TOP
2829 and SUB_LAST. */
2831 static reg_errcode_t
2832 internal_function
2833 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2836 reg_errcode_t err;
2837 int to_idx;
2838 /* Can the subexpression arrive the back reference? */
2839 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840 sub_last->str_idx, bkref_node, bkref_str,
2841 OP_OPEN_SUBEXP);
2842 if (err != REG_NOERROR)
2843 return err;
2844 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845 sub_last->str_idx);
2846 if (BE (err != REG_NOERROR, 0))
2847 return err;
2848 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849 return clean_state_log_if_needed (mctx, to_idx);
2852 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853 Search '(' if FL_OPEN, or search ')' otherwise.
2854 TODO: This function isn't efficient...
2855 Because there might be more than one nodes whose types are
2856 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 nodes.
2858 E.g. RE: (a){2} */
2860 static int
2861 internal_function
2862 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863 int subexp_idx, int type)
2865 int cls_idx;
2866 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2868 int cls_node = nodes->elems[cls_idx];
2869 const re_token_t *node = dfa->nodes + cls_node;
2870 if (node->type == type
2871 && node->opr.idx == subexp_idx)
2872 return cls_node;
2874 return -1;
2877 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2879 heavily reused.
2880 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2882 static reg_errcode_t
2883 internal_function
2884 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885 int top_str, int last_node, int last_str, int type)
2887 const re_dfa_t *const dfa = mctx->dfa;
2888 reg_errcode_t err = REG_NOERROR;
2889 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890 re_dfastate_t *cur_state = NULL;
2891 re_node_set *cur_nodes, next_nodes;
2892 re_dfastate_t **backup_state_log;
2893 unsigned int context;
2895 subexp_num = dfa->nodes[top_node].opr.idx;
2896 /* Extend the buffer if we need. */
2897 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2899 re_dfastate_t **new_array;
2900 int old_alloc = path->alloc;
2901 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903 if (BE (new_array == NULL, 0))
2905 path->alloc = old_alloc;
2906 return REG_ESPACE;
2908 path->array = new_array;
2909 memset (new_array + old_alloc, '\0',
2910 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2913 str_idx = path->next_idx ? path->next_idx : top_str;
2915 /* Temporary modify MCTX. */
2916 backup_state_log = mctx->state_log;
2917 backup_cur_idx = mctx->input.cur_idx;
2918 mctx->state_log = path->array;
2919 mctx->input.cur_idx = str_idx;
2921 /* Setup initial node set. */
2922 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2923 if (str_idx == top_str)
2925 err = re_node_set_init_1 (&next_nodes, top_node);
2926 if (BE (err != REG_NOERROR, 0))
2927 return err;
2928 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2929 if (BE (err != REG_NOERROR, 0))
2931 re_node_set_free (&next_nodes);
2932 return err;
2935 else
2937 cur_state = mctx->state_log[str_idx];
2938 if (cur_state && cur_state->has_backref)
2940 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2941 if (BE (err != REG_NOERROR, 0))
2942 return err;
2944 else
2945 re_node_set_init_empty (&next_nodes);
2947 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2949 if (next_nodes.nelem)
2951 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2952 subexp_num, type);
2953 if (BE (err != REG_NOERROR, 0))
2955 re_node_set_free (&next_nodes);
2956 return err;
2959 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2960 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2962 re_node_set_free (&next_nodes);
2963 return err;
2965 mctx->state_log[str_idx] = cur_state;
2968 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2970 re_node_set_empty (&next_nodes);
2971 if (mctx->state_log[str_idx + 1])
2973 err = re_node_set_merge (&next_nodes,
2974 &mctx->state_log[str_idx + 1]->nodes);
2975 if (BE (err != REG_NOERROR, 0))
2977 re_node_set_free (&next_nodes);
2978 return err;
2981 if (cur_state)
2983 err = check_arrival_add_next_nodes (mctx, str_idx,
2984 &cur_state->non_eps_nodes,
2985 &next_nodes);
2986 if (BE (err != REG_NOERROR, 0))
2988 re_node_set_free (&next_nodes);
2989 return err;
2992 ++str_idx;
2993 if (next_nodes.nelem)
2995 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2996 if (BE (err != REG_NOERROR, 0))
2998 re_node_set_free (&next_nodes);
2999 return err;
3001 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3002 subexp_num, type);
3003 if (BE (err != REG_NOERROR, 0))
3005 re_node_set_free (&next_nodes);
3006 return err;
3009 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3010 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3011 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3013 re_node_set_free (&next_nodes);
3014 return err;
3016 mctx->state_log[str_idx] = cur_state;
3017 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3019 re_node_set_free (&next_nodes);
3020 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3021 : &mctx->state_log[last_str]->nodes);
3022 path->next_idx = str_idx;
3024 /* Fix MCTX. */
3025 mctx->state_log = backup_state_log;
3026 mctx->input.cur_idx = backup_cur_idx;
3028 /* Then check the current node set has the node LAST_NODE. */
3029 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3030 return REG_NOERROR;
3032 return REG_NOMATCH;
3035 /* Helper functions for check_arrival. */
3037 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3038 to NEXT_NODES.
3039 TODO: This function is similar to the functions transit_state*(),
3040 however this function has many additional works.
3041 Can't we unify them? */
3043 static reg_errcode_t
3044 internal_function
3045 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3046 re_node_set *cur_nodes, re_node_set *next_nodes)
3048 const re_dfa_t *const dfa = mctx->dfa;
3049 int result;
3050 int cur_idx;
3051 #ifdef RE_ENABLE_I18N
3052 reg_errcode_t err = REG_NOERROR;
3053 #endif
3054 re_node_set union_set;
3055 re_node_set_init_empty (&union_set);
3056 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3058 int naccepted = 0;
3059 int cur_node = cur_nodes->elems[cur_idx];
3060 #ifdef DEBUG
3061 re_token_type_t type = dfa->nodes[cur_node].type;
3062 assert (!IS_EPSILON_NODE (type));
3063 #endif
3064 #ifdef RE_ENABLE_I18N
3065 /* If the node may accept `multi byte'. */
3066 if (dfa->nodes[cur_node].accept_mb)
3068 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3069 str_idx);
3070 if (naccepted > 1)
3072 re_dfastate_t *dest_state;
3073 int next_node = dfa->nexts[cur_node];
3074 int next_idx = str_idx + naccepted;
3075 dest_state = mctx->state_log[next_idx];
3076 re_node_set_empty (&union_set);
3077 if (dest_state)
3079 err = re_node_set_merge (&union_set, &dest_state->nodes);
3080 if (BE (err != REG_NOERROR, 0))
3082 re_node_set_free (&union_set);
3083 return err;
3086 result = re_node_set_insert (&union_set, next_node);
3087 if (BE (result < 0, 0))
3089 re_node_set_free (&union_set);
3090 return REG_ESPACE;
3092 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3093 &union_set);
3094 if (BE (mctx->state_log[next_idx] == NULL
3095 && err != REG_NOERROR, 0))
3097 re_node_set_free (&union_set);
3098 return err;
3102 #endif /* RE_ENABLE_I18N */
3103 if (naccepted
3104 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3106 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3107 if (BE (result < 0, 0))
3109 re_node_set_free (&union_set);
3110 return REG_ESPACE;
3114 re_node_set_free (&union_set);
3115 return REG_NOERROR;
3118 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3119 CUR_NODES, however exclude the nodes which are:
3120 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3121 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3124 static reg_errcode_t
3125 internal_function
3126 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3127 int ex_subexp, int type)
3129 reg_errcode_t err;
3130 int idx, outside_node;
3131 re_node_set new_nodes;
3132 #ifdef DEBUG
3133 assert (cur_nodes->nelem);
3134 #endif
3135 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3136 if (BE (err != REG_NOERROR, 0))
3137 return err;
3138 /* Create a new node set NEW_NODES with the nodes which are epsilon
3139 closures of the node in CUR_NODES. */
3141 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3143 int cur_node = cur_nodes->elems[idx];
3144 const re_node_set *eclosure = dfa->eclosures + cur_node;
3145 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3146 if (outside_node == -1)
3148 /* There are no problematic nodes, just merge them. */
3149 err = re_node_set_merge (&new_nodes, eclosure);
3150 if (BE (err != REG_NOERROR, 0))
3152 re_node_set_free (&new_nodes);
3153 return err;
3156 else
3158 /* There are problematic nodes, re-calculate incrementally. */
3159 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3160 ex_subexp, type);
3161 if (BE (err != REG_NOERROR, 0))
3163 re_node_set_free (&new_nodes);
3164 return err;
3168 re_node_set_free (cur_nodes);
3169 *cur_nodes = new_nodes;
3170 return REG_NOERROR;
3173 /* Helper function for check_arrival_expand_ecl.
3174 Check incrementally the epsilon closure of TARGET, and if it isn't
3175 problematic append it to DST_NODES. */
3177 static reg_errcode_t
3178 internal_function
3179 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3180 int target, int ex_subexp, int type)
3182 int cur_node;
3183 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3185 int err;
3187 if (dfa->nodes[cur_node].type == type
3188 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3190 if (type == OP_CLOSE_SUBEXP)
3192 err = re_node_set_insert (dst_nodes, cur_node);
3193 if (BE (err == -1, 0))
3194 return REG_ESPACE;
3196 break;
3198 err = re_node_set_insert (dst_nodes, cur_node);
3199 if (BE (err == -1, 0))
3200 return REG_ESPACE;
3201 if (dfa->edests[cur_node].nelem == 0)
3202 break;
3203 if (dfa->edests[cur_node].nelem == 2)
3205 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3206 dfa->edests[cur_node].elems[1],
3207 ex_subexp, type);
3208 if (BE (err != REG_NOERROR, 0))
3209 return err;
3211 cur_node = dfa->edests[cur_node].elems[0];
3213 return REG_NOERROR;
3217 /* For all the back references in the current state, calculate the
3218 destination of the back references by the appropriate entry
3219 in MCTX->BKREF_ENTS. */
3221 static reg_errcode_t
3222 internal_function
3223 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3224 int cur_str, int subexp_num, int type)
3226 const re_dfa_t *const dfa = mctx->dfa;
3227 reg_errcode_t err;
3228 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3229 struct re_backref_cache_entry *ent;
3231 if (cache_idx_start == -1)
3232 return REG_NOERROR;
3234 restart:
3235 ent = mctx->bkref_ents + cache_idx_start;
3238 int to_idx, next_node;
3240 /* Is this entry ENT is appropriate? */
3241 if (!re_node_set_contains (cur_nodes, ent->node))
3242 continue; /* No. */
3244 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3245 /* Calculate the destination of the back reference, and append it
3246 to MCTX->STATE_LOG. */
3247 if (to_idx == cur_str)
3249 /* The backreference did epsilon transit, we must re-check all the
3250 node in the current state. */
3251 re_node_set new_dests;
3252 reg_errcode_t err2, err3;
3253 next_node = dfa->edests[ent->node].elems[0];
3254 if (re_node_set_contains (cur_nodes, next_node))
3255 continue;
3256 err = re_node_set_init_1 (&new_dests, next_node);
3257 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3258 err3 = re_node_set_merge (cur_nodes, &new_dests);
3259 re_node_set_free (&new_dests);
3260 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3261 || err3 != REG_NOERROR, 0))
3263 err = (err != REG_NOERROR ? err
3264 : (err2 != REG_NOERROR ? err2 : err3));
3265 return err;
3267 /* TODO: It is still inefficient... */
3268 goto restart;
3270 else
3272 re_node_set union_set;
3273 next_node = dfa->nexts[ent->node];
3274 if (mctx->state_log[to_idx])
3276 int ret;
3277 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3278 next_node))
3279 continue;
3280 err = re_node_set_init_copy (&union_set,
3281 &mctx->state_log[to_idx]->nodes);
3282 ret = re_node_set_insert (&union_set, next_node);
3283 if (BE (err != REG_NOERROR || ret < 0, 0))
3285 re_node_set_free (&union_set);
3286 err = err != REG_NOERROR ? err : REG_ESPACE;
3287 return err;
3290 else
3292 err = re_node_set_init_1 (&union_set, next_node);
3293 if (BE (err != REG_NOERROR, 0))
3294 return err;
3296 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3297 re_node_set_free (&union_set);
3298 if (BE (mctx->state_log[to_idx] == NULL
3299 && err != REG_NOERROR, 0))
3300 return err;
3303 while (ent++->more);
3304 return REG_NOERROR;
3307 /* Build transition table for the state.
3308 Return 1 if succeeded, otherwise return NULL. */
3310 static int
3311 internal_function
3312 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3314 reg_errcode_t err;
3315 int i, j, ch, need_word_trtable = 0;
3316 bitset_word_t elem, mask;
3317 bool dests_node_malloced = false;
3318 bool dest_states_malloced = false;
3319 int ndests; /* Number of the destination states from `state'. */
3320 re_dfastate_t **trtable;
3321 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3322 re_node_set follows, *dests_node;
3323 bitset_t *dests_ch;
3324 bitset_t acceptable;
3326 struct dests_alloc
3328 re_node_set dests_node[SBC_MAX];
3329 bitset_t dests_ch[SBC_MAX];
3330 } *dests_alloc;
3332 /* We build DFA states which corresponds to the destination nodes
3333 from `state'. `dests_node[i]' represents the nodes which i-th
3334 destination state contains, and `dests_ch[i]' represents the
3335 characters which i-th destination state accepts. */
3336 #ifdef HAVE_ALLOCA
3337 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3338 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3339 else
3340 #endif
3342 dests_alloc = re_malloc (struct dests_alloc, 1);
3343 if (BE (dests_alloc == NULL, 0))
3344 return 0;
3345 dests_node_malloced = true;
3347 dests_node = dests_alloc->dests_node;
3348 dests_ch = dests_alloc->dests_ch;
3350 /* Initialize transiton table. */
3351 state->word_trtable = state->trtable = NULL;
3353 /* At first, group all nodes belonging to `state' into several
3354 destinations. */
3355 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3356 if (BE (ndests <= 0, 0))
3358 if (dests_node_malloced)
3359 free (dests_alloc);
3360 /* Return 0 in case of an error, 1 otherwise. */
3361 if (ndests == 0)
3363 state->trtable = (re_dfastate_t **)
3364 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365 return 1;
3367 return 0;
3370 err = re_node_set_alloc (&follows, ndests + 1);
3371 if (BE (err != REG_NOERROR, 0))
3372 goto out_free;
3374 /* Avoid arithmetic overflow in size calculation. */
3375 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3376 / (3 * sizeof (re_dfastate_t *)))
3377 < ndests),
3379 goto out_free;
3381 #ifdef HAVE_ALLOCA
3382 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3383 + ndests * 3 * sizeof (re_dfastate_t *)))
3384 dest_states = (re_dfastate_t **)
3385 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3386 else
3387 #endif
3389 dest_states = (re_dfastate_t **)
3390 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3391 if (BE (dest_states == NULL, 0))
3393 out_free:
3394 if (dest_states_malloced)
3395 free (dest_states);
3396 re_node_set_free (&follows);
3397 for (i = 0; i < ndests; ++i)
3398 re_node_set_free (dests_node + i);
3399 if (dests_node_malloced)
3400 free (dests_alloc);
3401 return 0;
3403 dest_states_malloced = true;
3405 dest_states_word = dest_states + ndests;
3406 dest_states_nl = dest_states_word + ndests;
3407 bitset_empty (acceptable);
3409 /* Then build the states for all destinations. */
3410 for (i = 0; i < ndests; ++i)
3412 int next_node;
3413 re_node_set_empty (&follows);
3414 /* Merge the follows of this destination states. */
3415 for (j = 0; j < dests_node[i].nelem; ++j)
3417 next_node = dfa->nexts[dests_node[i].elems[j]];
3418 if (next_node != -1)
3420 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3421 if (BE (err != REG_NOERROR, 0))
3422 goto out_free;
3425 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3426 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3427 goto out_free;
3428 /* If the new state has context constraint,
3429 build appropriate states for these contexts. */
3430 if (dest_states[i]->has_constraint)
3432 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3433 CONTEXT_WORD);
3434 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3435 goto out_free;
3437 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3438 need_word_trtable = 1;
3440 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3441 CONTEXT_NEWLINE);
3442 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3443 goto out_free;
3445 else
3447 dest_states_word[i] = dest_states[i];
3448 dest_states_nl[i] = dest_states[i];
3450 bitset_merge (acceptable, dests_ch[i]);
3453 if (!BE (need_word_trtable, 0))
3455 /* We don't care about whether the following character is a word
3456 character, or we are in a single-byte character set so we can
3457 discern by looking at the character code: allocate a
3458 256-entry transition table. */
3459 trtable = state->trtable =
3460 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3461 if (BE (trtable == NULL, 0))
3462 goto out_free;
3464 /* For all characters ch...: */
3465 for (i = 0; i < BITSET_WORDS; ++i)
3466 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3467 elem;
3468 mask <<= 1, elem >>= 1, ++ch)
3469 if (BE (elem & 1, 0))
3471 /* There must be exactly one destination which accepts
3472 character ch. See group_nodes_into_DFAstates. */
3473 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476 /* j-th destination accepts the word character ch. */
3477 if (dfa->word_char[i] & mask)
3478 trtable[ch] = dest_states_word[j];
3479 else
3480 trtable[ch] = dest_states[j];
3483 else
3485 /* We care about whether the following character is a word
3486 character, and we are in a multi-byte character set: discern
3487 by looking at the character code: build two 256-entry
3488 transition tables, one starting at trtable[0] and one
3489 starting at trtable[SBC_MAX]. */
3490 trtable = state->word_trtable =
3491 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3492 if (BE (trtable == NULL, 0))
3493 goto out_free;
3495 /* For all characters ch...: */
3496 for (i = 0; i < BITSET_WORDS; ++i)
3497 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3498 elem;
3499 mask <<= 1, elem >>= 1, ++ch)
3500 if (BE (elem & 1, 0))
3502 /* There must be exactly one destination which accepts
3503 character ch. See group_nodes_into_DFAstates. */
3504 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3507 /* j-th destination accepts the word character ch. */
3508 trtable[ch] = dest_states[j];
3509 trtable[ch + SBC_MAX] = dest_states_word[j];
3513 /* new line */
3514 if (bitset_contain (acceptable, NEWLINE_CHAR))
3516 /* The current state accepts newline character. */
3517 for (j = 0; j < ndests; ++j)
3518 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3520 /* k-th destination accepts newline character. */
3521 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3522 if (need_word_trtable)
3523 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3524 /* There must be only one destination which accepts
3525 newline. See group_nodes_into_DFAstates. */
3526 break;
3530 if (dest_states_malloced)
3531 free (dest_states);
3533 re_node_set_free (&follows);
3534 for (i = 0; i < ndests; ++i)
3535 re_node_set_free (dests_node + i);
3537 if (dests_node_malloced)
3538 free (dests_alloc);
3540 return 1;
3543 /* Group all nodes belonging to STATE into several destinations.
3544 Then for all destinations, set the nodes belonging to the destination
3545 to DESTS_NODE[i] and set the characters accepted by the destination
3546 to DEST_CH[i]. This function return the number of destinations. */
3548 static int
3549 internal_function
3550 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3551 re_node_set *dests_node, bitset_t *dests_ch)
3553 reg_errcode_t err;
3554 int result;
3555 int i, j, k;
3556 int ndests; /* Number of the destinations from `state'. */
3557 bitset_t accepts; /* Characters a node can accept. */
3558 const re_node_set *cur_nodes = &state->nodes;
3559 bitset_empty (accepts);
3560 ndests = 0;
3562 /* For all the nodes belonging to `state', */
3563 for (i = 0; i < cur_nodes->nelem; ++i)
3565 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3566 re_token_type_t type = node->type;
3567 unsigned int constraint = node->constraint;
3569 /* Enumerate all single byte character this node can accept. */
3570 if (type == CHARACTER)
3571 bitset_set (accepts, node->opr.c);
3572 else if (type == SIMPLE_BRACKET)
3574 bitset_merge (accepts, node->opr.sbcset);
3576 else if (type == OP_PERIOD)
3578 #ifdef RE_ENABLE_I18N
3579 if (dfa->mb_cur_max > 1)
3580 bitset_merge (accepts, dfa->sb_char);
3581 else
3582 #endif
3583 bitset_set_all (accepts);
3584 if (!(dfa->syntax & RE_DOT_NEWLINE))
3585 bitset_clear (accepts, '\n');
3586 if (dfa->syntax & RE_DOT_NOT_NULL)
3587 bitset_clear (accepts, '\0');
3589 #ifdef RE_ENABLE_I18N
3590 else if (type == OP_UTF8_PERIOD)
3592 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3593 if (!(dfa->syntax & RE_DOT_NEWLINE))
3594 bitset_clear (accepts, '\n');
3595 if (dfa->syntax & RE_DOT_NOT_NULL)
3596 bitset_clear (accepts, '\0');
3598 #endif
3599 else
3600 continue;
3602 /* Check the `accepts' and sift the characters which are not
3603 match it the context. */
3604 if (constraint)
3606 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3608 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3609 bitset_empty (accepts);
3610 if (accepts_newline)
3611 bitset_set (accepts, NEWLINE_CHAR);
3612 else
3613 continue;
3615 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3617 bitset_empty (accepts);
3618 continue;
3621 if (constraint & NEXT_WORD_CONSTRAINT)
3623 bitset_word_t any_set = 0;
3624 if (type == CHARACTER && !node->word_char)
3626 bitset_empty (accepts);
3627 continue;
3629 #ifdef RE_ENABLE_I18N
3630 if (dfa->mb_cur_max > 1)
3631 for (j = 0; j < BITSET_WORDS; ++j)
3632 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3633 else
3634 #endif
3635 for (j = 0; j < BITSET_WORDS; ++j)
3636 any_set |= (accepts[j] &= dfa->word_char[j]);
3637 if (!any_set)
3638 continue;
3640 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3642 bitset_word_t any_set = 0;
3643 if (type == CHARACTER && node->word_char)
3645 bitset_empty (accepts);
3646 continue;
3648 #ifdef RE_ENABLE_I18N
3649 if (dfa->mb_cur_max > 1)
3650 for (j = 0; j < BITSET_WORDS; ++j)
3651 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3652 else
3653 #endif
3654 for (j = 0; j < BITSET_WORDS; ++j)
3655 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3656 if (!any_set)
3657 continue;
3661 /* Then divide `accepts' into DFA states, or create a new
3662 state. Above, we make sure that accepts is not empty. */
3663 for (j = 0; j < ndests; ++j)
3665 bitset_t intersec; /* Intersection sets, see below. */
3666 bitset_t remains;
3667 /* Flags, see below. */
3668 bitset_word_t has_intersec, not_subset, not_consumed;
3670 /* Optimization, skip if this state doesn't accept the character. */
3671 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3672 continue;
3674 /* Enumerate the intersection set of this state and `accepts'. */
3675 has_intersec = 0;
3676 for (k = 0; k < BITSET_WORDS; ++k)
3677 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3678 /* And skip if the intersection set is empty. */
3679 if (!has_intersec)
3680 continue;
3682 /* Then check if this state is a subset of `accepts'. */
3683 not_subset = not_consumed = 0;
3684 for (k = 0; k < BITSET_WORDS; ++k)
3686 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3687 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3690 /* If this state isn't a subset of `accepts', create a
3691 new group state, which has the `remains'. */
3692 if (not_subset)
3694 bitset_copy (dests_ch[ndests], remains);
3695 bitset_copy (dests_ch[j], intersec);
3696 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3697 if (BE (err != REG_NOERROR, 0))
3698 goto error_return;
3699 ++ndests;
3702 /* Put the position in the current group. */
3703 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3704 if (BE (result < 0, 0))
3705 goto error_return;
3707 /* If all characters are consumed, go to next node. */
3708 if (!not_consumed)
3709 break;
3711 /* Some characters remain, create a new group. */
3712 if (j == ndests)
3714 bitset_copy (dests_ch[ndests], accepts);
3715 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3716 if (BE (err != REG_NOERROR, 0))
3717 goto error_return;
3718 ++ndests;
3719 bitset_empty (accepts);
3722 return ndests;
3723 error_return:
3724 for (j = 0; j < ndests; ++j)
3725 re_node_set_free (dests_node + j);
3726 return -1;
3729 #ifdef RE_ENABLE_I18N
3730 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3731 Return the number of the bytes the node accepts.
3732 STR_IDX is the current index of the input string.
3734 This function handles the nodes which can accept one character, or
3735 one collating element like '.', '[a-z]', opposite to the other nodes
3736 can only accept one byte. */
3738 static int
3739 internal_function
3740 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3741 const re_string_t *input, int str_idx)
3743 const re_token_t *node = dfa->nodes + node_idx;
3744 int char_len, elem_len;
3745 int i;
3746 wint_t wc;
3748 if (BE (node->type == OP_UTF8_PERIOD, 0))
3750 unsigned char c = re_string_byte_at (input, str_idx), d;
3751 if (BE (c < 0xc2, 1))
3752 return 0;
3754 if (str_idx + 2 > input->len)
3755 return 0;
3757 d = re_string_byte_at (input, str_idx + 1);
3758 if (c < 0xe0)
3759 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3760 else if (c < 0xf0)
3762 char_len = 3;
3763 if (c == 0xe0 && d < 0xa0)
3764 return 0;
3766 else if (c < 0xf8)
3768 char_len = 4;
3769 if (c == 0xf0 && d < 0x90)
3770 return 0;
3772 else if (c < 0xfc)
3774 char_len = 5;
3775 if (c == 0xf8 && d < 0x88)
3776 return 0;
3778 else if (c < 0xfe)
3780 char_len = 6;
3781 if (c == 0xfc && d < 0x84)
3782 return 0;
3784 else
3785 return 0;
3787 if (str_idx + char_len > input->len)
3788 return 0;
3790 for (i = 1; i < char_len; ++i)
3792 d = re_string_byte_at (input, str_idx + i);
3793 if (d < 0x80 || d > 0xbf)
3794 return 0;
3796 return char_len;
3799 char_len = re_string_char_size_at (input, str_idx);
3800 if (node->type == OP_PERIOD)
3802 if (char_len <= 1)
3803 return 0;
3804 /* FIXME: I don't think this if is needed, as both '\n'
3805 and '\0' are char_len == 1. */
3806 /* '.' accepts any one character except the following two cases. */
3807 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3808 re_string_byte_at (input, str_idx) == '\n') ||
3809 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3810 re_string_byte_at (input, str_idx) == '\0'))
3811 return 0;
3812 return char_len;
3815 elem_len = re_string_elem_size_at (input, str_idx);
3816 wc = __btowc(*(input->mbs+str_idx));
3817 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3818 return 0;
3820 if (node->type == COMPLEX_BRACKET)
3822 const re_charset_t *cset = node->opr.mbcset;
3823 # ifdef _LIBC
3824 const unsigned char *pin
3825 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3826 int j;
3827 uint32_t nrules;
3828 # endif /* _LIBC */
3829 int match_len = 0;
3830 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3831 ? re_string_wchar_at (input, str_idx) : 0);
3833 /* match with multibyte character? */
3834 for (i = 0; i < cset->nmbchars; ++i)
3835 if (wc == cset->mbchars[i])
3837 match_len = char_len;
3838 goto check_node_accept_bytes_match;
3840 /* match with character_class? */
3841 for (i = 0; i < cset->nchar_classes; ++i)
3843 wctype_t wt = cset->char_classes[i];
3844 if (__iswctype (wc, wt))
3846 match_len = char_len;
3847 goto check_node_accept_bytes_match;