Fix compile for MinGW (continued)
[git/platforms.git] / compat / regex / regexec.c
blobf699b9375e2e406b907d718dca4fb10310791449
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags)
44 internal_function;
45 static int re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, int length1,
47 const char *string2, int length2,
48 int start, int range, struct re_registers *regs,
49 int stop, int ret_len)
50 internal_function;
51 static int re_search_stub (struct re_pattern_buffer *bufp,
52 const char *string, int length, int start,
53 int range, int stop, struct re_registers *regs,
54 int ret_len)
55 internal_function;
56 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
57 int nregs, int regs_allocated)
58 internal_function;
59 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
60 internal_function;
61 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
62 int *p_match_first) internal_function;
63 static int check_halt_state_context (const re_match_context_t *mctx,
64 const re_dfastate_t *state, int idx)
65 internal_function;
66 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
67 regmatch_t *prev_idx_match, int cur_node,
68 int cur_idx, int nmatch) internal_function;
69 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
70 int str_idx, int dest_node, int nregs,
71 regmatch_t *regs,
72 re_node_set *eps_via_nodes)
73 internal_function;
74 static reg_errcode_t set_regs (const regex_t *preg,
75 const re_match_context_t *mctx,
76 size_t nmatch, regmatch_t *pmatch,
77 int fl_backtrack) internal_function;
78 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
79 internal_function;
81 #ifdef RE_ENABLE_I18N
82 static int sift_states_iter_mb (const re_match_context_t *mctx,
83 re_sift_context_t *sctx,
84 int node_idx, int str_idx, int max_str_idx)
85 internal_function;
86 #endif /* RE_ENABLE_I18N */
87 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
88 re_sift_context_t *sctx)
89 internal_function;
90 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
91 re_sift_context_t *sctx, int str_idx,
92 re_node_set *cur_dest)
93 internal_function;
94 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
95 re_sift_context_t *sctx,
96 int str_idx,
97 re_node_set *dest_nodes)
98 internal_function;
99 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
100 re_node_set *dest_nodes,
101 const re_node_set *candidates)
102 internal_function;
103 static int check_dst_limits (const re_match_context_t *mctx,
104 re_node_set *limits,
105 int dst_node, int dst_idx, int src_node,
106 int src_idx) internal_function;
107 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
108 int boundaries, int subexp_idx,
109 int from_node, int bkref_idx)
110 internal_function;
111 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
112 int limit, int subexp_idx,
113 int node, int str_idx,
114 int bkref_idx) internal_function;
115 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
116 re_node_set *dest_nodes,
117 const re_node_set *candidates,
118 re_node_set *limits,
119 struct re_backref_cache_entry *bkref_ents,
120 int str_idx) internal_function;
121 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
122 re_sift_context_t *sctx,
123 int str_idx, const re_node_set *candidates)
124 internal_function;
125 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
126 re_dfastate_t **dst,
127 re_dfastate_t **src, int num)
128 internal_function;
129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
130 re_match_context_t *mctx) internal_function;
131 static re_dfastate_t *transit_state (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *state) internal_function;
134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
135 re_match_context_t *mctx,
136 re_dfastate_t *next_state)
137 internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate)
145 internal_function;
146 #endif
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
149 re_dfastate_t *pstate)
150 internal_function;
151 #endif /* RE_ENABLE_I18N */
152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
153 const re_node_set *nodes)
154 internal_function;
155 static reg_errcode_t get_subexp (re_match_context_t *mctx,
156 int bkref_node, int bkref_str_idx)
157 internal_function;
158 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
159 const re_sub_match_top_t *sub_top,
160 re_sub_match_last_t *sub_last,
161 int bkref_node, int bkref_str)
162 internal_function;
163 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
164 int subexp_idx, int type) internal_function;
165 static reg_errcode_t check_arrival (re_match_context_t *mctx,
166 state_array_t *path, int top_node,
167 int top_str, int last_node, int last_str,
168 int type) internal_function;
169 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
170 int str_idx,
171 re_node_set *cur_nodes,
172 re_node_set *next_nodes)
173 internal_function;
174 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
175 re_node_set *cur_nodes,
176 int ex_subexp, int type)
177 internal_function;
178 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
179 re_node_set *dst_nodes,
180 int target, int ex_subexp,
181 int type) internal_function;
182 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
183 re_node_set *cur_nodes, int cur_str,
184 int subexp_num, int type)
185 internal_function;
186 static int build_trtable (const re_dfa_t *dfa,
187 re_dfastate_t *state) internal_function;
188 #ifdef RE_ENABLE_I18N
189 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
190 const re_string_t *input, int idx)
191 internal_function;
192 # ifdef _LIBC
193 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
194 size_t name_len)
195 internal_function;
196 # endif /* _LIBC */
197 #endif /* RE_ENABLE_I18N */
198 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
199 const re_dfastate_t *state,
200 re_node_set *states_node,
201 bitset_t *states_ch) internal_function;
202 static int check_node_accept (const re_match_context_t *mctx,
203 const re_token_t *node, int idx)
204 internal_function;
205 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
206 internal_function;
208 /* Entry point for POSIX code. */
210 /* regexec searches for a given pattern, specified by PREG, in the
211 string STRING.
213 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
214 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
215 least NMATCH elements, and we set them to the offsets of the
216 corresponding matched substrings.
218 EFLAGS specifies `execution flags' which affect matching: if
219 REG_NOTBOL is set, then ^ does not match at the beginning of the
220 string; if REG_NOTEOL is set, then $ does not match at the end.
222 We return 0 if we find a match and REG_NOMATCH if not. */
225 regexec (
226 const regex_t *__restrict preg,
227 const char *__restrict string,
228 size_t nmatch,
229 regmatch_t pmatch[],
230 int eflags)
232 reg_errcode_t err;
233 int start, length;
235 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
236 return REG_BADPAT;
238 if (eflags & REG_STARTEND)
240 start = pmatch[0].rm_so;
241 length = pmatch[0].rm_eo;
243 else
245 start = 0;
246 length = strlen (string);
249 __libc_lock_lock (dfa->lock);
250 if (preg->no_sub)
251 err = re_search_internal (preg, string, length, start, length - start,
252 length, 0, NULL, eflags);
253 else
254 err = re_search_internal (preg, string, length, start, length - start,
255 length, nmatch, pmatch, eflags);
256 __libc_lock_unlock (dfa->lock);
257 return err != REG_NOERROR;
260 #ifdef _LIBC
261 # include <shlib-compat.h>
262 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
264 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
265 __typeof__ (__regexec) __compat_regexec;
268 attribute_compat_text_section
269 __compat_regexec (const regex_t *__restrict preg,
270 const char *__restrict string, size_t nmatch,
271 regmatch_t pmatch[], int eflags)
273 return regexec (preg, string, nmatch, pmatch,
274 eflags & (REG_NOTBOL | REG_NOTEOL));
276 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
277 # endif
278 #endif
280 /* Entry points for GNU code. */
282 /* re_match, re_search, re_match_2, re_search_2
284 The former two functions operate on STRING with length LENGTH,
285 while the later two operate on concatenation of STRING1 and STRING2
286 with lengths LENGTH1 and LENGTH2, respectively.
288 re_match() matches the compiled pattern in BUFP against the string,
289 starting at index START.
291 re_search() first tries matching at index START, then it tries to match
292 starting from index START + 1, and so on. The last start position tried
293 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
294 way as re_match().)
296 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
297 the first STOP characters of the concatenation of the strings should be
298 concerned.
300 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
301 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
302 computed relative to the concatenation, not relative to the individual
303 strings.)
305 On success, re_match* functions return the length of the match, re_search*
306 return the position of the start of the match. Return value -1 means no
307 match was found and -2 indicates an internal error. */
310 re_match (struct re_pattern_buffer *bufp,
311 const char *string,
312 int length,
313 int start,
314 struct re_registers *regs)
316 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
318 #ifdef _LIBC
319 weak_alias (__re_match, re_match)
320 #endif
323 re_search (struct re_pattern_buffer *bufp,
324 const char *string,
325 int length, int start, int range,
326 struct re_registers *regs)
328 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
330 #ifdef _LIBC
331 weak_alias (__re_search, re_search)
332 #endif
335 re_match_2 (struct re_pattern_buffer *bufp,
336 const char *string1, int length1,
337 const char *string2, int length2, int start,
338 struct re_registers *regs, int stop)
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, 0, regs, stop, 1);
343 #ifdef _LIBC
344 weak_alias (__re_match_2, re_match_2)
345 #endif
348 re_search_2 (struct re_pattern_buffer *bufp,
349 const char *string1, int length1,
350 const char *string2, int length2, int start,
351 int range, struct re_registers *regs, int stop)
353 return re_search_2_stub (bufp, string1, length1, string2, length2,
354 start, range, regs, stop, 0);
356 #ifdef _LIBC
357 weak_alias (__re_search_2, re_search_2)
358 #endif
360 static int internal_function
361 re_search_2_stub (struct re_pattern_buffer *bufp,
362 const char *string1, int length1,
363 const char *string2, int length2, int start,
364 int range, struct re_registers *regs,
365 int stop, int ret_len)
367 const char *str;
368 int rval;
369 int len = length1 + length2;
370 int free_str = 0;
372 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
373 return -2;
375 /* Concatenate the strings. */
376 if (length2 > 0)
377 if (length1 > 0)
379 char *s = re_malloc (char, len);
381 if (BE (s == NULL, 0))
382 return -2;
383 memcpy (s, string1, length1);
384 memcpy (s + length1, string2, length2);
385 str = s;
386 free_str = 1;
388 else
389 str = string2;
390 else
391 str = string1;
393 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
394 if (free_str)
395 re_free ((char *) str);
396 return rval;
399 /* The parameters have the same meaning as those of re_search.
400 Additional parameters:
401 If RET_LEN is nonzero the length of the match is returned (re_match style);
402 otherwise the position of the match is returned. */
404 static int internal_function
405 re_search_stub (struct re_pattern_buffer *bufp,
406 const char *string, int length, int start,
407 int range, int stop,
408 struct re_registers *regs, int ret_len)
410 reg_errcode_t result;
411 regmatch_t *pmatch;
412 int nregs, rval;
413 int eflags = 0;
415 /* Check for out-of-range. */
416 if (BE (start < 0 || start > length, 0))
417 return -1;
418 if (BE (start + range > length, 0))
419 range = length - start;
420 else if (BE (start + range < 0, 0))
421 range = -start;
423 __libc_lock_lock (dfa->lock);
425 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
426 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
428 /* Compile fastmap if we haven't yet. */
429 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
430 re_compile_fastmap (bufp);
432 if (BE (bufp->no_sub, 0))
433 regs = NULL;
435 /* We need at least 1 register. */
436 if (regs == NULL)
437 nregs = 1;
438 else if (BE (bufp->regs_allocated == REGS_FIXED &&
439 regs->num_regs < bufp->re_nsub + 1, 0))
441 nregs = regs->num_regs;
442 if (BE (nregs < 1, 0))
444 /* Nothing can be copied to regs. */
445 regs = NULL;
446 nregs = 1;
449 else
450 nregs = bufp->re_nsub + 1;
451 pmatch = re_malloc (regmatch_t, nregs);
452 if (BE (pmatch == NULL, 0))
454 rval = -2;
455 goto out;
458 result = re_search_internal (bufp, string, length, start, range, stop,
459 nregs, pmatch, eflags);
461 rval = 0;
463 /* I hope we needn't fill ther regs with -1's when no match was found. */
464 if (result != REG_NOERROR)
465 rval = -1;
466 else if (regs != NULL)
468 /* If caller wants register contents data back, copy them. */
469 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
470 bufp->regs_allocated);
471 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
472 rval = -2;
475 if (BE (rval == 0, 1))
477 if (ret_len)
479 assert (pmatch[0].rm_so == start);
480 rval = pmatch[0].rm_eo - start;
482 else
483 rval = pmatch[0].rm_so;
485 re_free (pmatch);
486 out:
487 __libc_lock_unlock (dfa->lock);
488 return rval;
491 static unsigned internal_function
492 re_copy_regs (struct re_registers *regs,
493 regmatch_t *pmatch,
494 int nregs, int regs_allocated)
496 int rval = REGS_REALLOCATE;
497 int i;
498 int need_regs = nregs + 1;
499 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
500 uses. */
502 /* Have the register data arrays been allocated? */
503 if (regs_allocated == REGS_UNALLOCATED)
504 { /* No. So allocate them with malloc. */
505 regs->start = re_malloc (regoff_t, need_regs);
506 if (BE (regs->start == NULL, 0))
507 return REGS_UNALLOCATED;
508 regs->end = re_malloc (regoff_t, need_regs);
509 if (BE (regs->end == NULL, 0))
511 re_free (regs->start);
512 return REGS_UNALLOCATED;
514 regs->num_regs = need_regs;
516 else if (regs_allocated == REGS_REALLOCATE)
517 { /* Yes. If we need more elements than were already
518 allocated, reallocate them. If we need fewer, just
519 leave it alone. */
520 if (BE (need_regs > regs->num_regs, 0))
522 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
523 regoff_t *new_end;
524 if (BE (new_start == NULL, 0))
525 return REGS_UNALLOCATED;
526 new_end = re_realloc (regs->end, regoff_t, need_regs);
527 if (BE (new_end == NULL, 0))
529 re_free (new_start);
530 return REGS_UNALLOCATED;
532 regs->start = new_start;
533 regs->end = new_end;
534 regs->num_regs = need_regs;
537 else
539 assert (regs_allocated == REGS_FIXED);
540 /* This function may not be called with REGS_FIXED and nregs too big. */
541 assert (regs->num_regs >= nregs);
542 rval = REGS_FIXED;
545 /* Copy the regs. */
546 for (i = 0; i < nregs; ++i)
548 regs->start[i] = pmatch[i].rm_so;
549 regs->end[i] = pmatch[i].rm_eo;
551 for ( ; i < regs->num_regs; ++i)
552 regs->start[i] = regs->end[i] = -1;
554 return rval;
557 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
558 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
559 this memory for recording register information. STARTS and ENDS
560 must be allocated using the malloc library routine, and must each
561 be at least NUM_REGS * sizeof (regoff_t) bytes long.
563 If NUM_REGS == 0, then subsequent matches should allocate their own
564 register data.
566 Unless this function is called, the first search or match using
567 PATTERN_BUFFER will allocate its own register data, without
568 freeing the old data. */
570 void
571 re_set_registers (struct re_pattern_buffer *bufp,
572 struct re_registers *regs,
573 unsigned num_regs,
574 regoff_t *starts,
575 regoff_t *ends)
577 if (num_regs)
579 bufp->regs_allocated = REGS_REALLOCATE;
580 regs->num_regs = num_regs;
581 regs->start = starts;
582 regs->end = ends;
584 else
586 bufp->regs_allocated = REGS_UNALLOCATED;
587 regs->num_regs = 0;
588 regs->start = regs->end = (regoff_t *) 0;
591 #ifdef _LIBC
592 weak_alias (__re_set_registers, re_set_registers)
593 #endif
595 /* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
598 #if defined _REGEX_RE_COMP || defined _LIBC
600 # ifdef _LIBC
601 weak_function
602 # endif
603 re_exec (s)
604 const char *s;
606 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
608 #endif /* _REGEX_RE_COMP */
610 /* Internal entry point. */
612 /* Searches for a compiled pattern PREG in the string STRING, whose
613 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
614 mingings with regexec. START, and RANGE have the same meanings
615 with re_search.
616 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
617 otherwise return the error code.
618 Note: We assume front end functions already check ranges.
619 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
621 static reg_errcode_t internal_function
622 re_search_internal (const regex_t *preg,
623 const char *string,
624 int length, int start, int range, int stop,
625 size_t nmatch, regmatch_t pmatch[],
626 int eflags)
628 reg_errcode_t err;
629 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
630 int left_lim, right_lim, incr;
631 int fl_longest_match, match_first, match_kind, match_last = -1;
632 int extra_nmatch;
633 int sb, ch;
634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
635 re_match_context_t mctx = { .dfa = dfa };
636 #else
637 re_match_context_t mctx;
638 #endif
639 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
640 && range && !preg->can_be_null) ? preg->fastmap : NULL;
641 RE_TRANSLATE_TYPE t = preg->translate;
643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
644 memset (&mctx, '\0', sizeof (re_match_context_t));
645 mctx.dfa = dfa;
646 #endif
648 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
649 nmatch -= extra_nmatch;
651 /* Check if the DFA haven't been compiled. */
652 if (BE (preg->used == 0 || dfa->init_state == NULL
653 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
654 || dfa->init_state_begbuf == NULL, 0))
655 return REG_NOMATCH;
657 #ifdef DEBUG
658 /* We assume front-end functions already check them. */
659 assert (start + range >= 0 && start + range <= length);
660 #endif
662 /* If initial states with non-begbuf contexts have no elements,
663 the regex must be anchored. If preg->newline_anchor is set,
664 we'll never use init_state_nl, so do not check it. */
665 if (dfa->init_state->nodes.nelem == 0
666 && dfa->init_state_word->nodes.nelem == 0
667 && (dfa->init_state_nl->nodes.nelem == 0
668 || !preg->newline_anchor))
670 if (start != 0 && start + range != 0)
671 return REG_NOMATCH;
672 start = range = 0;
675 /* We must check the longest matching, if nmatch > 0. */
676 fl_longest_match = (nmatch != 0 || dfa->nbackref);
678 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
679 preg->translate, preg->syntax & RE_ICASE, dfa);
680 if (BE (err != REG_NOERROR, 0))
681 goto free_return;
682 mctx.input.stop = stop;
683 mctx.input.raw_stop = stop;
684 mctx.input.newline_anchor = preg->newline_anchor;
686 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
687 if (BE (err != REG_NOERROR, 0))
688 goto free_return;
690 /* We will log all the DFA states through which the dfa pass,
691 if nmatch > 1, or this dfa has "multibyte node", which is a
692 back-reference or a node which can accept multibyte character or
693 multi character collating element. */
694 if (nmatch > 1 || dfa->has_mb_node)
696 /* Avoid overflow. */
697 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
699 err = REG_ESPACE;
700 goto free_return;
703 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
704 if (BE (mctx.state_log == NULL, 0))
706 err = REG_ESPACE;
707 goto free_return;
710 else
711 mctx.state_log = NULL;
713 match_first = start;
714 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
715 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
717 /* Check incrementally whether of not the input string match. */
718 incr = (range < 0) ? -1 : 1;
719 left_lim = (range < 0) ? start + range : start;
720 right_lim = (range < 0) ? start : start + range;
721 sb = dfa->mb_cur_max == 1;
722 match_kind =
723 (fastmap
724 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
725 | (range >= 0 ? 2 : 0)
726 | (t != NULL ? 1 : 0))
727 : 8);
729 for (;; match_first += incr)
731 err = REG_NOMATCH;
732 if (match_first < left_lim || right_lim < match_first)
733 goto free_return;
735 /* Advance as rapidly as possible through the string, until we
736 find a plausible place to start matching. This may be done
737 with varying efficiency, so there are various possibilities:
738 only the most common of them are specialized, in order to
739 save on code size. We use a switch statement for speed. */
740 switch (match_kind)
742 case 8:
743 /* No fastmap. */
744 break;
746 case 7:
747 /* Fastmap with single-byte translation, match forward. */
748 while (BE (match_first < right_lim, 1)
749 && !fastmap[t[(unsigned char) string[match_first]]])
750 ++match_first;
751 goto forward_match_found_start_or_reached_end;
753 case 6:
754 /* Fastmap without translation, match forward. */
755 while (BE (match_first < right_lim, 1)
756 && !fastmap[(unsigned char) string[match_first]])
757 ++match_first;
759 forward_match_found_start_or_reached_end:
760 if (BE (match_first == right_lim, 0))
762 ch = match_first >= length
763 ? 0 : (unsigned char) string[match_first];
764 if (!fastmap[t ? t[ch] : ch])
765 goto free_return;
767 break;
769 case 4:
770 case 5:
771 /* Fastmap without multi-byte translation, match backwards. */
772 while (match_first >= left_lim)
774 ch = match_first >= length
775 ? 0 : (unsigned char) string[match_first];
776 if (fastmap[t ? t[ch] : ch])
777 break;
778 --match_first;
780 if (match_first < left_lim)
781 goto free_return;
782 break;
784 default:
785 /* In this case, we can't determine easily the current byte,
786 since it might be a component byte of a multibyte
787 character. Then we use the constructed buffer instead. */
788 for (;;)
790 /* If MATCH_FIRST is out of the valid range, reconstruct the
791 buffers. */
792 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
793 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
795 err = re_string_reconstruct (&mctx.input, match_first,
796 eflags);
797 if (BE (err != REG_NOERROR, 0))
798 goto free_return;
800 offset = match_first - mctx.input.raw_mbs_idx;
802 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
803 Note that MATCH_FIRST must not be smaller than 0. */
804 ch = (match_first >= length
805 ? 0 : re_string_byte_at (&mctx.input, offset));
806 if (fastmap[ch])
807 break;
808 match_first += incr;
809 if (match_first < left_lim || match_first > right_lim)
811 err = REG_NOMATCH;
812 goto free_return;
815 break;
818 /* Reconstruct the buffers so that the matcher can assume that
819 the matching starts from the beginning of the buffer. */
820 err = re_string_reconstruct (&mctx.input, match_first, eflags);
821 if (BE (err != REG_NOERROR, 0))
822 goto free_return;
824 #ifdef RE_ENABLE_I18N
825 /* Don't consider this char as a possible match start if it part,
826 yet isn't the head, of a multibyte character. */
827 if (!sb && !re_string_first_byte (&mctx.input, 0))
828 continue;
829 #endif
831 /* It seems to be appropriate one, then use the matcher. */
832 /* We assume that the matching starts from 0. */
833 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
834 match_last = check_matching (&mctx, fl_longest_match,
835 range >= 0 ? &match_first : NULL);
836 if (match_last != -1)
838 if (BE (match_last == -2, 0))
840 err = REG_ESPACE;
841 goto free_return;
843 else
845 mctx.match_last = match_last;
846 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
848 re_dfastate_t *pstate = mctx.state_log[match_last];
849 mctx.last_node = check_halt_state_context (&mctx, pstate,
850 match_last);
852 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
853 || dfa->nbackref)
855 err = prune_impossible_nodes (&mctx);
856 if (err == REG_NOERROR)
857 break;
858 if (BE (err != REG_NOMATCH, 0))
859 goto free_return;
860 match_last = -1;
862 else
863 break; /* We found a match. */
867 match_ctx_clean (&mctx);
870 #ifdef DEBUG
871 assert (match_last != -1);
872 assert (err == REG_NOERROR);
873 #endif
875 /* Set pmatch[] if we need. */
876 if (nmatch > 0)
878 int reg_idx;
880 /* Initialize registers. */
881 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
882 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
884 /* Set the points where matching start/end. */
885 pmatch[0].rm_so = 0;
886 pmatch[0].rm_eo = mctx.match_last;
888 if (!preg->no_sub && nmatch > 1)
890 err = set_regs (preg, &mctx, nmatch, pmatch,
891 dfa->has_plural_match && dfa->nbackref > 0);
892 if (BE (err != REG_NOERROR, 0))
893 goto free_return;
896 /* At last, add the offset to the each registers, since we slided
897 the buffers so that we could assume that the matching starts
898 from 0. */
899 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
900 if (pmatch[reg_idx].rm_so != -1)
902 #ifdef RE_ENABLE_I18N
903 if (BE (mctx.input.offsets_needed != 0, 0))
905 pmatch[reg_idx].rm_so =
906 (pmatch[reg_idx].rm_so == mctx.input.valid_len
907 ? mctx.input.valid_raw_len
908 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
909 pmatch[reg_idx].rm_eo =
910 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
911 ? mctx.input.valid_raw_len
912 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
914 #else
915 assert (mctx.input.offsets_needed == 0);
916 #endif
917 pmatch[reg_idx].rm_so += match_first;
918 pmatch[reg_idx].rm_eo += match_first;
920 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
922 pmatch[nmatch + reg_idx].rm_so = -1;
923 pmatch[nmatch + reg_idx].rm_eo = -1;
926 if (dfa->subexp_map)
927 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
928 if (dfa->subexp_map[reg_idx] != reg_idx)
930 pmatch[reg_idx + 1].rm_so
931 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
932 pmatch[reg_idx + 1].rm_eo
933 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
937 free_return:
938 re_free (mctx.state_log);
939 if (dfa->nbackref)
940 match_ctx_free (&mctx);
941 re_string_destruct (&mctx.input);
942 return err;
945 static reg_errcode_t internal_function
946 prune_impossible_nodes (re_match_context_t *mctx)
948 const re_dfa_t *const dfa = mctx->dfa;
949 int halt_node, match_last;
950 reg_errcode_t ret;
951 re_dfastate_t **sifted_states;
952 re_dfastate_t **lim_states = NULL;
953 re_sift_context_t sctx;
954 #ifdef DEBUG
955 assert (mctx->state_log != NULL);
956 #endif
957 match_last = mctx->match_last;
958 halt_node = mctx->last_node;
960 /* Avoid overflow. */
961 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
962 return REG_ESPACE;
964 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
965 if (BE (sifted_states == NULL, 0))
967 ret = REG_ESPACE;
968 goto free_return;
970 if (dfa->nbackref)
972 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
973 if (BE (lim_states == NULL, 0))
975 ret = REG_ESPACE;
976 goto free_return;
978 while (1)
980 memset (lim_states, '\0',
981 sizeof (re_dfastate_t *) * (match_last + 1));
982 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
983 match_last);
984 ret = sift_states_backward (mctx, &sctx);
985 re_node_set_free (&sctx.limits);
986 if (BE (ret != REG_NOERROR, 0))
987 goto free_return;
988 if (sifted_states[0] != NULL || lim_states[0] != NULL)
989 break;
992 --match_last;
993 if (match_last < 0)
995 ret = REG_NOMATCH;
996 goto free_return;
998 } while (mctx->state_log[match_last] == NULL
999 || !mctx->state_log[match_last]->halt);
1000 halt_node = check_halt_state_context (mctx,
1001 mctx->state_log[match_last],
1002 match_last);
1004 ret = merge_state_array (dfa, sifted_states, lim_states,
1005 match_last + 1);
1006 re_free (lim_states);
1007 lim_states = NULL;
1008 if (BE (ret != REG_NOERROR, 0))
1009 goto free_return;
1011 else
1013 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1014 ret = sift_states_backward (mctx, &sctx);
1015 re_node_set_free (&sctx.limits);
1016 if (BE (ret != REG_NOERROR, 0))
1017 goto free_return;
1018 if (sifted_states[0] == NULL)
1020 ret = REG_NOMATCH;
1021 goto free_return;
1024 re_free (mctx->state_log);
1025 mctx->state_log = sifted_states;
1026 sifted_states = NULL;
1027 mctx->last_node = halt_node;
1028 mctx->match_last = match_last;
1029 ret = REG_NOERROR;
1030 free_return:
1031 re_free (sifted_states);
1032 re_free (lim_states);
1033 return ret;
1036 /* Acquire an initial state and return it.
1037 We must select appropriate initial state depending on the context,
1038 since initial states may have constraints like "\<", "^", etc.. */
1040 static inline re_dfastate_t *
1041 __attribute ((always_inline)) internal_function
1042 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1043 int idx)
1045 const re_dfa_t *const dfa = mctx->dfa;
1046 if (dfa->init_state->has_constraint)
1048 unsigned int context;
1049 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1050 if (IS_WORD_CONTEXT (context))
1051 return dfa->init_state_word;
1052 else if (IS_ORDINARY_CONTEXT (context))
1053 return dfa->init_state;
1054 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1055 return dfa->init_state_begbuf;
1056 else if (IS_NEWLINE_CONTEXT (context))
1057 return dfa->init_state_nl;
1058 else if (IS_BEGBUF_CONTEXT (context))
1060 /* It is relatively rare case, then calculate on demand. */
1061 return re_acquire_state_context (err, dfa,
1062 dfa->init_state->entrance_nodes,
1063 context);
1065 else
1066 /* Must not happen? */
1067 return dfa->init_state;
1069 else
1070 return dfa->init_state;
1073 /* Check whether the regular expression match input string INPUT or not,
1074 and return the index where the matching end, return -1 if not match,
1075 or return -2 in case of an error.
1076 FL_LONGEST_MATCH means we want the POSIX longest matching.
1077 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1078 next place where we may want to try matching.
1079 Note that the matcher assume that the maching starts from the current
1080 index of the buffer. */
1082 static int
1083 internal_function
1084 check_matching (re_match_context_t *mctx, int fl_longest_match,
1085 int *p_match_first)
1087 const re_dfa_t *const dfa = mctx->dfa;
1088 reg_errcode_t err;
1089 int match = 0;
1090 int match_last = -1;
1091 int cur_str_idx = re_string_cur_idx (&mctx->input);
1092 re_dfastate_t *cur_state;
1093 int at_init_state = p_match_first != NULL;
1094 int next_start_idx = cur_str_idx;
1096 err = REG_NOERROR;
1097 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1098 /* An initial state must not be NULL (invalid). */
1099 if (BE (cur_state == NULL, 0))
1101 assert (err == REG_ESPACE);
1102 return -2;
1105 if (mctx->state_log != NULL)
1107 mctx->state_log[cur_str_idx] = cur_state;
1109 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1110 later. E.g. Processing back references. */
1111 if (BE (dfa->nbackref, 0))
1113 at_init_state = 0;
1114 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1115 if (BE (err != REG_NOERROR, 0))
1116 return err;
1118 if (cur_state->has_backref)
1120 err = transit_state_bkref (mctx, &cur_state->nodes);
1121 if (BE (err != REG_NOERROR, 0))
1122 return err;
1127 /* If the RE accepts NULL string. */
1128 if (BE (cur_state->halt, 0))
1130 if (!cur_state->has_constraint
1131 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1133 if (!fl_longest_match)
1134 return cur_str_idx;
1135 else
1137 match_last = cur_str_idx;
1138 match = 1;
1143 while (!re_string_eoi (&mctx->input))
1145 re_dfastate_t *old_state = cur_state;
1146 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1148 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1149 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1150 && mctx->input.valid_len < mctx->input.len))
1152 err = extend_buffers (mctx);
1153 if (BE (err != REG_NOERROR, 0))
1155 assert (err == REG_ESPACE);
1156 return -2;
1160 cur_state = transit_state (&err, mctx, cur_state);
1161 if (mctx->state_log != NULL)
1162 cur_state = merge_state_with_log (&err, mctx, cur_state);
1164 if (cur_state == NULL)
1166 /* Reached the invalid state or an error. Try to recover a valid
1167 state using the state log, if available and if we have not
1168 already found a valid (even if not the longest) match. */
1169 if (BE (err != REG_NOERROR, 0))
1170 return -2;
1172 if (mctx->state_log == NULL
1173 || (match && !fl_longest_match)
1174 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1175 break;
1178 if (BE (at_init_state, 0))
1180 if (old_state == cur_state)
1181 next_start_idx = next_char_idx;
1182 else
1183 at_init_state = 0;
1186 if (cur_state->halt)
1188 /* Reached a halt state.
1189 Check the halt state can satisfy the current context. */
1190 if (!cur_state->has_constraint
1191 || check_halt_state_context (mctx, cur_state,
1192 re_string_cur_idx (&mctx->input)))
1194 /* We found an appropriate halt state. */
1195 match_last = re_string_cur_idx (&mctx->input);
1196 match = 1;
1198 /* We found a match, do not modify match_first below. */
1199 p_match_first = NULL;
1200 if (!fl_longest_match)
1201 break;
1206 if (p_match_first)
1207 *p_match_first += next_start_idx;
1209 return match_last;
1212 /* Check NODE match the current context. */
1214 static int
1215 internal_function
1216 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1218 re_token_type_t type = dfa->nodes[node].type;
1219 unsigned int constraint = dfa->nodes[node].constraint;
1220 if (type != END_OF_RE)
1221 return 0;
1222 if (!constraint)
1223 return 1;
1224 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1225 return 0;
1226 return 1;
1229 /* Check the halt state STATE match the current context.
1230 Return 0 if not match, if the node, STATE has, is a halt node and
1231 match the context, return the node. */
1233 static int
1234 internal_function
1235 check_halt_state_context (const re_match_context_t *mctx,
1236 const re_dfastate_t *state, int idx)
1238 int i;
1239 unsigned int context;
1240 #ifdef DEBUG
1241 assert (state->halt);
1242 #endif
1243 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1244 for (i = 0; i < state->nodes.nelem; ++i)
1245 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1246 return state->nodes.elems[i];
1247 return 0;
1250 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1251 corresponding to the DFA).
1252 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1253 of errors. */
1255 static int
1256 internal_function
1257 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1258 int *pidx, int node, re_node_set *eps_via_nodes,
1259 struct re_fail_stack_t *fs)
1261 const re_dfa_t *const dfa = mctx->dfa;
1262 int i, err;
1263 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1265 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1266 re_node_set *edests = &dfa->edests[node];
1267 int dest_node;
1268 err = re_node_set_insert (eps_via_nodes, node);
1269 if (BE (err < 0, 0))
1270 return -2;
1271 /* Pick up a valid destination, or return -1 if none is found. */
1272 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1274 int candidate = edests->elems[i];
1275 if (!re_node_set_contains (cur_nodes, candidate))
1276 continue;
1277 if (dest_node == -1)
1278 dest_node = candidate;
1280 else
1282 /* In order to avoid infinite loop like "(a*)*", return the second
1283 epsilon-transition if the first was already considered. */
1284 if (re_node_set_contains (eps_via_nodes, dest_node))
1285 return candidate;
1287 /* Otherwise, push the second epsilon-transition on the fail stack. */
1288 else if (fs != NULL
1289 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1290 eps_via_nodes))
1291 return -2;
1293 /* We know we are going to exit. */
1294 break;
1297 return dest_node;
1299 else
1301 int naccepted = 0;
1302 re_token_type_t type = dfa->nodes[node].type;
1304 #ifdef RE_ENABLE_I18N
1305 if (dfa->nodes[node].accept_mb)
1306 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1307 else
1308 #endif /* RE_ENABLE_I18N */
1309 if (type == OP_BACK_REF)
1311 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1312 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1313 if (fs != NULL)
1315 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1316 return -1;
1317 else if (naccepted)
1319 char *buf = (char *) re_string_get_buffer (&mctx->input);
1320 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1321 naccepted) != 0)
1322 return -1;
1326 if (naccepted == 0)
1328 int dest_node;
1329 err = re_node_set_insert (eps_via_nodes, node);
1330 if (BE (err < 0, 0))
1331 return -2;
1332 dest_node = dfa->edests[node].elems[0];
1333 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1334 dest_node))
1335 return dest_node;
1339 if (naccepted != 0
1340 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1342 int dest_node = dfa->nexts[node];
1343 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1344 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1345 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1346 dest_node)))
1347 return -1;
1348 re_node_set_empty (eps_via_nodes);
1349 return dest_node;
1352 return -1;
1355 static reg_errcode_t
1356 internal_function
1357 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1358 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1360 reg_errcode_t err;
1361 int num = fs->num++;
1362 if (fs->num == fs->alloc)
1364 struct re_fail_stack_ent_t *new_array;
1365 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1366 * fs->alloc * 2));
1367 if (new_array == NULL)
1368 return REG_ESPACE;
1369 fs->alloc *= 2;
1370 fs->stack = new_array;
1372 fs->stack[num].idx = str_idx;
1373 fs->stack[num].node = dest_node;
1374 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1375 if (fs->stack[num].regs == NULL)
1376 return REG_ESPACE;
1377 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1378 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1379 return err;
1382 static int
1383 internal_function
1384 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1385 regmatch_t *regs, re_node_set *eps_via_nodes)
1387 int num = --fs->num;
1388 assert (num >= 0);
1389 *pidx = fs->stack[num].idx;
1390 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1391 re_node_set_free (eps_via_nodes);
1392 re_free (fs->stack[num].regs);
1393 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1394 return fs->stack[num].node;
1397 /* Set the positions where the subexpressions are starts/ends to registers
1398 PMATCH.
1399 Note: We assume that pmatch[0] is already set, and
1400 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1402 static reg_errcode_t
1403 internal_function
1404 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1405 regmatch_t *pmatch, int fl_backtrack)
1407 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1408 int idx, cur_node;
1409 re_node_set eps_via_nodes;
1410 struct re_fail_stack_t *fs;
1411 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1412 regmatch_t *prev_idx_match;
1413 int prev_idx_match_malloced = 0;
1415 #ifdef DEBUG
1416 assert (nmatch > 1);
1417 assert (mctx->state_log != NULL);
1418 #endif
1419 if (fl_backtrack)
1421 fs = &fs_body;
1422 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1423 if (fs->stack == NULL)
1424 return REG_ESPACE;
1426 else
1427 fs = NULL;
1429 cur_node = dfa->init_node;
1430 re_node_set_init_empty (&eps_via_nodes);
1432 #ifdef HAVE_ALLOCA
1433 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1434 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1435 else
1436 #endif
1438 prev_idx_match = re_malloc (regmatch_t, nmatch);
1439 if (prev_idx_match == NULL)
1441 free_fail_stack_return (fs);
1442 return REG_ESPACE;
1444 prev_idx_match_malloced = 1;
1446 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1448 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1450 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1452 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1454 int reg_idx;
1455 if (fs)
1457 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1458 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1459 break;
1460 if (reg_idx == nmatch)
1462 re_node_set_free (&eps_via_nodes);
1463 if (prev_idx_match_malloced)
1464 re_free (prev_idx_match);
1465 return free_fail_stack_return (fs);
1467 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1468 &eps_via_nodes);
1470 else
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 return REG_NOERROR;
1479 /* Proceed to next node. */
1480 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1481 &eps_via_nodes, fs);
1483 if (BE (cur_node < 0, 0))
1485 if (BE (cur_node == -2, 0))
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 free_fail_stack_return (fs);
1491 return REG_ESPACE;
1493 if (fs)
1494 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1495 &eps_via_nodes);
1496 else
1498 re_node_set_free (&eps_via_nodes);
1499 if (prev_idx_match_malloced)
1500 re_free (prev_idx_match);
1501 return REG_NOMATCH;
1505 re_node_set_free (&eps_via_nodes);
1506 if (prev_idx_match_malloced)
1507 re_free (prev_idx_match);
1508 return free_fail_stack_return (fs);
1511 static reg_errcode_t
1512 internal_function
1513 free_fail_stack_return (struct re_fail_stack_t *fs)
1515 if (fs)
1517 int fs_idx;
1518 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1520 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1521 re_free (fs->stack[fs_idx].regs);
1523 re_free (fs->stack);
1525 return REG_NOERROR;
1528 static void
1529 internal_function
1530 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1531 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1533 int type = dfa->nodes[cur_node].type;
1534 if (type == OP_OPEN_SUBEXP)
1536 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1538 /* We are at the first node of this sub expression. */
1539 if (reg_num < nmatch)
1541 pmatch[reg_num].rm_so = cur_idx;
1542 pmatch[reg_num].rm_eo = -1;
1545 else if (type == OP_CLOSE_SUBEXP)
1547 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1548 if (reg_num < nmatch)
1550 /* We are at the last node of this sub expression. */
1551 if (pmatch[reg_num].rm_so < cur_idx)
1553 pmatch[reg_num].rm_eo = cur_idx;
1554 /* This is a non-empty match or we are not inside an optional
1555 subexpression. Accept this right away. */
1556 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1558 else
1560 if (dfa->nodes[cur_node].opt_subexp
1561 && prev_idx_match[reg_num].rm_so != -1)
1562 /* We transited through an empty match for an optional
1563 subexpression, like (a?)*, and this is not the subexp's
1564 first match. Copy back the old content of the registers
1565 so that matches of an inner subexpression are undone as
1566 well, like in ((a?))*. */
1567 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1568 else
1569 /* We completed a subexpression, but it may be part of
1570 an optional one, so do not update PREV_IDX_MATCH. */
1571 pmatch[reg_num].rm_eo = cur_idx;
1577 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1578 and sift the nodes in each states according to the following rules.
1579 Updated state_log will be wrote to STATE_LOG.
1581 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1582 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1583 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1584 the LAST_NODE, we throw away the node `a'.
1585 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1586 string `s' and transit to `b':
1587 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1588 away the node `a'.
1589 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1590 thrown away, we throw away the node `a'.
1591 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1592 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1593 node `a'.
1594 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1595 we throw away the node `a'. */
1597 #define STATE_NODE_CONTAINS(state,node) \
1598 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1600 static reg_errcode_t
1601 internal_function
1602 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1604 reg_errcode_t err;
1605 int null_cnt = 0;
1606 int str_idx = sctx->last_str_idx;
1607 re_node_set cur_dest;
1609 #ifdef DEBUG
1610 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1611 #endif
1613 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1614 transit to the last_node and the last_node itself. */
1615 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1616 if (BE (err != REG_NOERROR, 0))
1617 return err;
1618 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1619 if (BE (err != REG_NOERROR, 0))
1620 goto free_return;
1622 /* Then check each states in the state_log. */
1623 while (str_idx > 0)
1625 /* Update counters. */
1626 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1627 if (null_cnt > mctx->max_mb_elem_len)
1629 memset (sctx->sifted_states, '\0',
1630 sizeof (re_dfastate_t *) * str_idx);
1631 re_node_set_free (&cur_dest);
1632 return REG_NOERROR;
1634 re_node_set_empty (&cur_dest);
1635 --str_idx;
1637 if (mctx->state_log[str_idx])
1639 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1640 if (BE (err != REG_NOERROR, 0))
1641 goto free_return;
1644 /* Add all the nodes which satisfy the following conditions:
1645 - It can epsilon transit to a node in CUR_DEST.
1646 - It is in CUR_SRC.
1647 And update state_log. */
1648 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1649 if (BE (err != REG_NOERROR, 0))
1650 goto free_return;
1652 err = REG_NOERROR;
1653 free_return:
1654 re_node_set_free (&cur_dest);
1655 return err;
1658 static reg_errcode_t
1659 internal_function
1660 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1661 int str_idx, re_node_set *cur_dest)
1663 const re_dfa_t *const dfa = mctx->dfa;
1664 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1665 int i;
1667 /* Then build the next sifted state.
1668 We build the next sifted state on `cur_dest', and update
1669 `sifted_states[str_idx]' with `cur_dest'.
1670 Note:
1671 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1672 `cur_src' points the node_set of the old `state_log[str_idx]'
1673 (with the epsilon nodes pre-filtered out). */
1674 for (i = 0; i < cur_src->nelem; i++)
1676 int prev_node = cur_src->elems[i];
1677 int naccepted = 0;
1678 int ret;
1680 #ifdef DEBUG
1681 re_token_type_t type = dfa->nodes[prev_node].type;
1682 assert (!IS_EPSILON_NODE (type));
1683 #endif
1684 #ifdef RE_ENABLE_I18N
1685 /* If the node may accept `multi byte'. */
1686 if (dfa->nodes[prev_node].accept_mb)
1687 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1688 str_idx, sctx->last_str_idx);
1689 #endif /* RE_ENABLE_I18N */
1691 /* We don't check backreferences here.
1692 See update_cur_sifted_state(). */
1693 if (!naccepted
1694 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1695 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1696 dfa->nexts[prev_node]))
1697 naccepted = 1;
1699 if (naccepted == 0)
1700 continue;
1702 if (sctx->limits.nelem)
1704 int to_idx = str_idx + naccepted;
1705 if (check_dst_limits (mctx, &sctx->limits,
1706 dfa->nexts[prev_node], to_idx,
1707 prev_node, str_idx))
1708 continue;
1710 ret = re_node_set_insert (cur_dest, prev_node);
1711 if (BE (ret == -1, 0))
1712 return REG_ESPACE;
1715 return REG_NOERROR;
1718 /* Helper functions. */
1720 static reg_errcode_t
1721 internal_function
1722 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1724 int top = mctx->state_log_top;
1726 if (next_state_log_idx >= mctx->input.bufs_len
1727 || (next_state_log_idx >= mctx->input.valid_len
1728 && mctx->input.valid_len < mctx->input.len))
1730 reg_errcode_t err;
1731 err = extend_buffers (mctx);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1736 if (top < next_state_log_idx)
1738 memset (mctx->state_log + top + 1, '\0',
1739 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1740 mctx->state_log_top = next_state_log_idx;
1742 return REG_NOERROR;
1745 static reg_errcode_t
1746 internal_function
1747 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1748 re_dfastate_t **src, int num)
1750 int st_idx;
1751 reg_errcode_t err;
1752 for (st_idx = 0; st_idx < num; ++st_idx)
1754 if (dst[st_idx] == NULL)
1755 dst[st_idx] = src[st_idx];
1756 else if (src[st_idx] != NULL)
1758 re_node_set merged_set;
1759 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1760 &src[st_idx]->nodes);
1761 if (BE (err != REG_NOERROR, 0))
1762 return err;
1763 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1764 re_node_set_free (&merged_set);
1765 if (BE (err != REG_NOERROR, 0))
1766 return err;
1769 return REG_NOERROR;
1772 static reg_errcode_t
1773 internal_function
1774 update_cur_sifted_state (const re_match_context_t *mctx,
1775 re_sift_context_t *sctx, int str_idx,
1776 re_node_set *dest_nodes)
1778 const re_dfa_t *const dfa = mctx->dfa;
1779 reg_errcode_t err = REG_NOERROR;
1780 const re_node_set *candidates;
1781 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1782 : &mctx->state_log[str_idx]->nodes);
1784 if (dest_nodes->nelem == 0)
1785 sctx->sifted_states[str_idx] = NULL;
1786 else
1788 if (candidates)
1790 /* At first, add the nodes which can epsilon transit to a node in
1791 DEST_NODE. */
1792 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1796 /* Then, check the limitations in the current sift_context. */
1797 if (sctx->limits.nelem)
1799 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1800 mctx->bkref_ents, str_idx);
1801 if (BE (err != REG_NOERROR, 0))
1802 return err;
1806 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1807 if (BE (err != REG_NOERROR, 0))
1808 return err;
1811 if (candidates && mctx->state_log[str_idx]->has_backref)
1813 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1814 if (BE (err != REG_NOERROR, 0))
1815 return err;
1817 return REG_NOERROR;
1820 static reg_errcode_t
1821 internal_function
1822 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1823 const re_node_set *candidates)
1825 reg_errcode_t err = REG_NOERROR;
1826 int i;
1828 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1829 if (BE (err != REG_NOERROR, 0))
1830 return err;
1832 if (!state->inveclosure.alloc)
1834 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1835 if (BE (err != REG_NOERROR, 0))
1836 return REG_ESPACE;
1837 for (i = 0; i < dest_nodes->nelem; i++)
1839 err = re_node_set_merge (&state->inveclosure,
1840 dfa->inveclosures + dest_nodes->elems[i]);
1841 if (BE (err != REG_NOERROR, 0))
1842 return REG_ESPACE;
1845 return re_node_set_add_intersect (dest_nodes, candidates,
1846 &state->inveclosure);
1849 static reg_errcode_t
1850 internal_function
1851 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1852 const re_node_set *candidates)
1854 int ecl_idx;
1855 reg_errcode_t err;
1856 re_node_set *inv_eclosure = dfa->inveclosures + node;
1857 re_node_set except_nodes;
1858 re_node_set_init_empty (&except_nodes);
1859 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1861 int cur_node = inv_eclosure->elems[ecl_idx];
1862 if (cur_node == node)
1863 continue;
1864 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1866 int edst1 = dfa->edests[cur_node].elems[0];
1867 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1868 ? dfa->edests[cur_node].elems[1] : -1);
1869 if ((!re_node_set_contains (inv_eclosure, edst1)
1870 && re_node_set_contains (dest_nodes, edst1))
1871 || (edst2 > 0
1872 && !re_node_set_contains (inv_eclosure, edst2)
1873 && re_node_set_contains (dest_nodes, edst2)))
1875 err = re_node_set_add_intersect (&except_nodes, candidates,
1876 dfa->inveclosures + cur_node);
1877 if (BE (err != REG_NOERROR, 0))
1879 re_node_set_free (&except_nodes);
1880 return err;
1885 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1887 int cur_node = inv_eclosure->elems[ecl_idx];
1888 if (!re_node_set_contains (&except_nodes, cur_node))
1890 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1891 re_node_set_remove_at (dest_nodes, idx);
1894 re_node_set_free (&except_nodes);
1895 return REG_NOERROR;
1898 static int
1899 internal_function
1900 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1901 int dst_node, int dst_idx, int src_node, int src_idx)
1903 const re_dfa_t *const dfa = mctx->dfa;
1904 int lim_idx, src_pos, dst_pos;
1906 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1907 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1908 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1910 int subexp_idx;
1911 struct re_backref_cache_entry *ent;
1912 ent = mctx->bkref_ents + limits->elems[lim_idx];
1913 subexp_idx = dfa->nodes[ent->node].opr.idx;
1915 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1916 subexp_idx, dst_node, dst_idx,
1917 dst_bkref_idx);
1918 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1919 subexp_idx, src_node, src_idx,
1920 src_bkref_idx);
1922 /* In case of:
1923 <src> <dst> ( <subexp> )
1924 ( <subexp> ) <src> <dst>
1925 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1926 if (src_pos == dst_pos)
1927 continue; /* This is unrelated limitation. */
1928 else
1929 return 1;
1931 return 0;
1934 static int
1935 internal_function
1936 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1937 int subexp_idx, int from_node, int bkref_idx)
1939 const re_dfa_t *const dfa = mctx->dfa;
1940 const re_node_set *eclosures = dfa->eclosures + from_node;
1941 int node_idx;
1943 /* Else, we are on the boundary: examine the nodes on the epsilon
1944 closure. */
1945 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1947 int node = eclosures->elems[node_idx];
1948 switch (dfa->nodes[node].type)
1950 case OP_BACK_REF:
1951 if (bkref_idx != -1)
1953 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1956 int dst, cpos;
1958 if (ent->node != node)
1959 continue;
1961 if (subexp_idx < BITSET_WORD_BITS
1962 && !(ent->eps_reachable_subexps_map
1963 & ((bitset_word_t) 1 << subexp_idx)))
1964 continue;
1966 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1967 OP_CLOSE_SUBEXP cases below. But, if the
1968 destination node is the same node as the source
1969 node, don't recurse because it would cause an
1970 infinite loop: a regex that exhibits this behavior
1971 is ()\1*\1* */
1972 dst = dfa->edests[node].elems[0];
1973 if (dst == from_node)
1975 if (boundaries & 1)
1976 return -1;
1977 else /* if (boundaries & 2) */
1978 return 0;
1981 cpos =
1982 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1983 dst, bkref_idx);
1984 if (cpos == -1 /* && (boundaries & 1) */)
1985 return -1;
1986 if (cpos == 0 && (boundaries & 2))
1987 return 0;
1989 if (subexp_idx < BITSET_WORD_BITS)
1990 ent->eps_reachable_subexps_map
1991 &= ~((bitset_word_t) 1 << subexp_idx);
1993 while (ent++->more);
1995 break;
1997 case OP_OPEN_SUBEXP:
1998 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1999 return -1;
2000 break;
2002 case OP_CLOSE_SUBEXP:
2003 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2004 return 0;
2005 break;
2007 default:
2008 break;
2012 return (boundaries & 2) ? 1 : 0;
2015 static int
2016 internal_function
2017 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2018 int subexp_idx, int from_node, int str_idx,
2019 int bkref_idx)
2021 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2022 int boundaries;
2024 /* If we are outside the range of the subexpression, return -1 or 1. */
2025 if (str_idx < lim->subexp_from)
2026 return -1;
2028 if (lim->subexp_to < str_idx)
2029 return 1;
2031 /* If we are within the subexpression, return 0. */
2032 boundaries = (str_idx == lim->subexp_from);
2033 boundaries |= (str_idx == lim->subexp_to) << 1;
2034 if (boundaries == 0)
2035 return 0;
2037 /* Else, examine epsilon closure. */
2038 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2039 from_node, bkref_idx);
2042 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2043 which are against limitations from DEST_NODES. */
2045 static reg_errcode_t
2046 internal_function
2047 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2048 const re_node_set *candidates, re_node_set *limits,
2049 struct re_backref_cache_entry *bkref_ents, int str_idx)
2051 reg_errcode_t err;
2052 int node_idx, lim_idx;
2054 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2056 int subexp_idx;
2057 struct re_backref_cache_entry *ent;
2058 ent = bkref_ents + limits->elems[lim_idx];
2060 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2061 continue; /* This is unrelated limitation. */
2063 subexp_idx = dfa->nodes[ent->node].opr.idx;
2064 if (ent->subexp_to == str_idx)
2066 int ops_node = -1;
2067 int cls_node = -1;
2068 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2070 int node = dest_nodes->elems[node_idx];
2071 re_token_type_t type = dfa->nodes[node].type;
2072 if (type == OP_OPEN_SUBEXP
2073 && subexp_idx == dfa->nodes[node].opr.idx)
2074 ops_node = node;
2075 else if (type == OP_CLOSE_SUBEXP
2076 && subexp_idx == dfa->nodes[node].opr.idx)
2077 cls_node = node;
2080 /* Check the limitation of the open subexpression. */
2081 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2082 if (ops_node >= 0)
2084 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2085 candidates);
2086 if (BE (err != REG_NOERROR, 0))
2087 return err;
2090 /* Check the limitation of the close subexpression. */
2091 if (cls_node >= 0)
2092 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2094 int node = dest_nodes->elems[node_idx];
2095 if (!re_node_set_contains (dfa->inveclosures + node,
2096 cls_node)
2097 && !re_node_set_contains (dfa->eclosures + node,
2098 cls_node))
2100 /* It is against this limitation.
2101 Remove it form the current sifted state. */
2102 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2103 candidates);
2104 if (BE (err != REG_NOERROR, 0))
2105 return err;
2106 --node_idx;
2110 else /* (ent->subexp_to != str_idx) */
2112 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2114 int node = dest_nodes->elems[node_idx];
2115 re_token_type_t type = dfa->nodes[node].type;
2116 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2118 if (subexp_idx != dfa->nodes[node].opr.idx)
2119 continue;
2120 /* It is against this limitation.
2121 Remove it form the current sifted state. */
2122 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2123 candidates);
2124 if (BE (err != REG_NOERROR, 0))
2125 return err;
2130 return REG_NOERROR;
2133 static reg_errcode_t
2134 internal_function
2135 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2136 int str_idx, const re_node_set *candidates)
2138 const re_dfa_t *const dfa = mctx->dfa;
2139 reg_errcode_t err;
2140 int node_idx, node;
2141 re_sift_context_t local_sctx;
2142 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2144 if (first_idx == -1)
2145 return REG_NOERROR;
2147 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2149 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2151 int enabled_idx;
2152 re_token_type_t type;
2153 struct re_backref_cache_entry *entry;
2154 node = candidates->elems[node_idx];
2155 type = dfa->nodes[node].type;
2156 /* Avoid infinite loop for the REs like "()\1+". */
2157 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2158 continue;
2159 if (type != OP_BACK_REF)
2160 continue;
2162 entry = mctx->bkref_ents + first_idx;
2163 enabled_idx = first_idx;
2166 int subexp_len;
2167 int to_idx;
2168 int dst_node;
2169 int ret;
2170 re_dfastate_t *cur_state;
2172 if (entry->node != node)
2173 continue;
2174 subexp_len = entry->subexp_to - entry->subexp_from;
2175 to_idx = str_idx + subexp_len;
2176 dst_node = (subexp_len ? dfa->nexts[node]
2177 : dfa->edests[node].elems[0]);
2179 if (to_idx > sctx->last_str_idx
2180 || sctx->sifted_states[to_idx] == NULL
2181 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2182 || check_dst_limits (mctx, &sctx->limits, node,
2183 str_idx, dst_node, to_idx))
2184 continue;
2186 if (local_sctx.sifted_states == NULL)
2188 local_sctx = *sctx;
2189 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2190 if (BE (err != REG_NOERROR, 0))
2191 goto free_return;
2193 local_sctx.last_node = node;
2194 local_sctx.last_str_idx = str_idx;
2195 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2196 if (BE (ret < 0, 0))
2198 err = REG_ESPACE;
2199 goto free_return;
2201 cur_state = local_sctx.sifted_states[str_idx];
2202 err = sift_states_backward (mctx, &local_sctx);
2203 if (BE (err != REG_NOERROR, 0))
2204 goto free_return;
2205 if (sctx->limited_states != NULL)
2207 err = merge_state_array (dfa, sctx->limited_states,
2208 local_sctx.sifted_states,
2209 str_idx + 1);
2210 if (BE (err != REG_NOERROR, 0))
2211 goto free_return;
2213 local_sctx.sifted_states[str_idx] = cur_state;
2214 re_node_set_remove (&local_sctx.limits, enabled_idx);
2216 /* mctx->bkref_ents may have changed, reload the pointer. */
2217 entry = mctx->bkref_ents + enabled_idx;
2219 while (enabled_idx++, entry++->more);
2221 err = REG_NOERROR;
2222 free_return:
2223 if (local_sctx.sifted_states != NULL)
2225 re_node_set_free (&local_sctx.limits);
2228 return err;
2232 #ifdef RE_ENABLE_I18N
2233 static int
2234 internal_function
2235 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2236 int node_idx, int str_idx, int max_str_idx)
2238 const re_dfa_t *const dfa = mctx->dfa;
2239 int naccepted;
2240 /* Check the node can accept `multi byte'. */
2241 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2242 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2243 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2244 dfa->nexts[node_idx]))
2245 /* The node can't accept the `multi byte', or the
2246 destination was already thrown away, then the node
2247 could't accept the current input `multi byte'. */
2248 naccepted = 0;
2249 /* Otherwise, it is sure that the node could accept
2250 `naccepted' bytes input. */
2251 return naccepted;
2253 #endif /* RE_ENABLE_I18N */
2256 /* Functions for state transition. */
2258 /* Return the next state to which the current state STATE will transit by
2259 accepting the current input byte, and update STATE_LOG if necessary.
2260 If STATE can accept a multibyte char/collating element/back reference
2261 update the destination of STATE_LOG. */
2263 static re_dfastate_t *
2264 internal_function
2265 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2266 re_dfastate_t *state)
2268 re_dfastate_t **trtable;
2269 unsigned char ch;
2271 #ifdef RE_ENABLE_I18N
2272 /* If the current state can accept multibyte. */
2273 if (BE (state->accept_mb, 0))
2275 *err = transit_state_mb (mctx, state);
2276 if (BE (*err != REG_NOERROR, 0))
2277 return NULL;
2279 #endif /* RE_ENABLE_I18N */
2281 /* Then decide the next state with the single byte. */
2282 #if 0
2283 if (0)
2284 /* don't use transition table */
2285 return transit_state_sb (err, mctx, state);
2286 #endif
2288 /* Use transition table */
2289 ch = re_string_fetch_byte (&mctx->input);
2290 for (;;)
2292 trtable = state->trtable;
2293 if (BE (trtable != NULL, 1))
2294 return trtable[ch];
2296 trtable = state->word_trtable;
2297 if (BE (trtable != NULL, 1))
2299 unsigned int context;
2300 context
2301 = re_string_context_at (&mctx->input,
2302 re_string_cur_idx (&mctx->input) - 1,
2303 mctx->eflags);
2304 if (IS_WORD_CONTEXT (context))
2305 return trtable[ch + SBC_MAX];
2306 else
2307 return trtable[ch];
2310 if (!build_trtable (mctx->dfa, state))
2312 *err = REG_ESPACE;
2313 return NULL;
2316 /* Retry, we now have a transition table. */
2320 /* Update the state_log if we need */
2321 re_dfastate_t *
2322 internal_function
2323 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2324 re_dfastate_t *next_state)
2326 const re_dfa_t *const dfa = mctx->dfa;
2327 int cur_idx = re_string_cur_idx (&mctx->input);
2329 if (cur_idx > mctx->state_log_top)
2331 mctx->state_log[cur_idx] = next_state;
2332 mctx->state_log_top = cur_idx;
2334 else if (mctx->state_log[cur_idx] == 0)
2336 mctx->state_log[cur_idx] = next_state;
2338 else
2340 re_dfastate_t *pstate;
2341 unsigned int context;
2342 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2343 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2344 the destination of a multibyte char/collating element/
2345 back reference. Then the next state is the union set of
2346 these destinations and the results of the transition table. */
2347 pstate = mctx->state_log[cur_idx];
2348 log_nodes = pstate->entrance_nodes;
2349 if (next_state != NULL)
2351 table_nodes = next_state->entrance_nodes;
2352 *err = re_node_set_init_union (&next_nodes, table_nodes,
2353 log_nodes);
2354 if (BE (*err != REG_NOERROR, 0))
2355 return NULL;
2357 else
2358 next_nodes = *log_nodes;
2359 /* Note: We already add the nodes of the initial state,
2360 then we don't need to add them here. */
2362 context = re_string_context_at (&mctx->input,
2363 re_string_cur_idx (&mctx->input) - 1,
2364 mctx->eflags);
2365 next_state = mctx->state_log[cur_idx]
2366 = re_acquire_state_context (err, dfa, &next_nodes, context);
2367 /* We don't need to check errors here, since the return value of
2368 this function is next_state and ERR is already set. */
2370 if (table_nodes != NULL)
2371 re_node_set_free (&next_nodes);
2374 if (BE (dfa->nbackref, 0) && next_state != NULL)
2376 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2377 later. We must check them here, since the back references in the
2378 next state might use them. */
2379 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2380 cur_idx);
2381 if (BE (*err != REG_NOERROR, 0))
2382 return NULL;
2384 /* If the next state has back references. */
2385 if (next_state->has_backref)
2387 *err = transit_state_bkref (mctx, &next_state->nodes);
2388 if (BE (*err != REG_NOERROR, 0))
2389 return NULL;
2390 next_state = mctx->state_log[cur_idx];
2394 return next_state;
2397 /* Skip bytes in the input that correspond to part of a
2398 multi-byte match, then look in the log for a state
2399 from which to restart matching. */
2400 re_dfastate_t *
2401 internal_function
2402 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2404 re_dfastate_t *cur_state;
2407 int max = mctx->state_log_top;
2408 int cur_str_idx = re_string_cur_idx (&mctx->input);
2412 if (++cur_str_idx > max)
2413 return NULL;
2414 re_string_skip_bytes (&mctx->input, 1);
2416 while (mctx->state_log[cur_str_idx] == NULL);
2418 cur_state = merge_state_with_log (err, mctx, NULL);
2420 while (*err == REG_NOERROR && cur_state == NULL);
2421 return cur_state;
2424 /* Helper functions for transit_state. */
2426 /* From the node set CUR_NODES, pick up the nodes whose types are
2427 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2428 expression. And register them to use them later for evaluating the
2429 correspoding back references. */
2431 static reg_errcode_t
2432 internal_function
2433 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2434 int str_idx)
2436 const re_dfa_t *const dfa = mctx->dfa;
2437 int node_idx;
2438 reg_errcode_t err;
2440 /* TODO: This isn't efficient.
2441 Because there might be more than one nodes whose types are
2442 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2443 nodes.
2444 E.g. RE: (a){2} */
2445 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2447 int node = cur_nodes->elems[node_idx];
2448 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2449 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2450 && (dfa->used_bkref_map
2451 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2453 err = match_ctx_add_subtop (mctx, node, str_idx);
2454 if (BE (err != REG_NOERROR, 0))
2455 return err;
2458 return REG_NOERROR;
2461 #if 0
2462 /* Return the next state to which the current state STATE will transit by
2463 accepting the current input byte. */
2465 static re_dfastate_t *
2466 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2467 re_dfastate_t *state)
2469 const re_dfa_t *const dfa = mctx->dfa;
2470 re_node_set next_nodes;
2471 re_dfastate_t *next_state;
2472 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2473 unsigned int context;
2475 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2476 if (BE (*err != REG_NOERROR, 0))
2477 return NULL;
2478 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2480 int cur_node = state->nodes.elems[node_cnt];
2481 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2483 *err = re_node_set_merge (&next_nodes,
2484 dfa->eclosures + dfa->nexts[cur_node]);
2485 if (BE (*err != REG_NOERROR, 0))
2487 re_node_set_free (&next_nodes);
2488 return NULL;
2492 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2493 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2494 /* We don't need to check errors here, since the return value of
2495 this function is next_state and ERR is already set. */
2497 re_node_set_free (&next_nodes);
2498 re_string_skip_bytes (&mctx->input, 1);
2499 return next_state;
2501 #endif
2503 #ifdef RE_ENABLE_I18N
2504 static reg_errcode_t
2505 internal_function
2506 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2508 const re_dfa_t *const dfa = mctx->dfa;
2509 reg_errcode_t err;
2510 int i;
2512 for (i = 0; i < pstate->nodes.nelem; ++i)
2514 re_node_set dest_nodes, *new_nodes;
2515 int cur_node_idx = pstate->nodes.elems[i];
2516 int naccepted, dest_idx;
2517 unsigned int context;
2518 re_dfastate_t *dest_state;
2520 if (!dfa->nodes[cur_node_idx].accept_mb)
2521 continue;
2523 if (dfa->nodes[cur_node_idx].constraint)
2525 context = re_string_context_at (&mctx->input,
2526 re_string_cur_idx (&mctx->input),
2527 mctx->eflags);
2528 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2529 context))
2530 continue;
2533 /* How many bytes the node can accept? */
2534 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2535 re_string_cur_idx (&mctx->input));
2536 if (naccepted == 0)
2537 continue;
2539 /* The node can accepts `naccepted' bytes. */
2540 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2541 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2542 : mctx->max_mb_elem_len);
2543 err = clean_state_log_if_needed (mctx, dest_idx);
2544 if (BE (err != REG_NOERROR, 0))
2545 return err;
2546 #ifdef DEBUG
2547 assert (dfa->nexts[cur_node_idx] != -1);
2548 #endif
2549 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2551 dest_state = mctx->state_log[dest_idx];
2552 if (dest_state == NULL)
2553 dest_nodes = *new_nodes;
2554 else
2556 err = re_node_set_init_union (&dest_nodes,
2557 dest_state->entrance_nodes, new_nodes);
2558 if (BE (err != REG_NOERROR, 0))
2559 return err;
2561 context = re_string_context_at (&mctx->input, dest_idx - 1,
2562 mctx->eflags);
2563 mctx->state_log[dest_idx]
2564 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2565 if (dest_state != NULL)
2566 re_node_set_free (&dest_nodes);
2567 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2568 return err;
2570 return REG_NOERROR;
2572 #endif /* RE_ENABLE_I18N */
2574 static reg_errcode_t
2575 internal_function
2576 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2578 const re_dfa_t *const dfa = mctx->dfa;
2579 reg_errcode_t err;
2580 int i;
2581 int cur_str_idx = re_string_cur_idx (&mctx->input);
2583 for (i = 0; i < nodes->nelem; ++i)
2585 int dest_str_idx, prev_nelem, bkc_idx;
2586 int node_idx = nodes->elems[i];
2587 unsigned int context;
2588 const re_token_t *node = dfa->nodes + node_idx;
2589 re_node_set *new_dest_nodes;
2591 /* Check whether `node' is a backreference or not. */
2592 if (node->type != OP_BACK_REF)
2593 continue;
2595 if (node->constraint)
2597 context = re_string_context_at (&mctx->input, cur_str_idx,
2598 mctx->eflags);
2599 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2600 continue;
2603 /* `node' is a backreference.
2604 Check the substring which the substring matched. */
2605 bkc_idx = mctx->nbkref_ents;
2606 err = get_subexp (mctx, node_idx, cur_str_idx);
2607 if (BE (err != REG_NOERROR, 0))
2608 goto free_return;
2610 /* And add the epsilon closures (which is `new_dest_nodes') of
2611 the backreference to appropriate state_log. */
2612 #ifdef DEBUG
2613 assert (dfa->nexts[node_idx] != -1);
2614 #endif
2615 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2617 int subexp_len;
2618 re_dfastate_t *dest_state;
2619 struct re_backref_cache_entry *bkref_ent;
2620 bkref_ent = mctx->bkref_ents + bkc_idx;
2621 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2622 continue;
2623 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2624 new_dest_nodes = (subexp_len == 0
2625 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2626 : dfa->eclosures + dfa->nexts[node_idx]);
2627 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2628 - bkref_ent->subexp_from);
2629 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2630 mctx->eflags);
2631 dest_state = mctx->state_log[dest_str_idx];
2632 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2633 : mctx->state_log[cur_str_idx]->nodes.nelem);
2634 /* Add `new_dest_node' to state_log. */
2635 if (dest_state == NULL)
2637 mctx->state_log[dest_str_idx]
2638 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2639 context);
2640 if (BE (mctx->state_log[dest_str_idx] == NULL
2641 && err != REG_NOERROR, 0))
2642 goto free_return;
2644 else
2646 re_node_set dest_nodes;
2647 err = re_node_set_init_union (&dest_nodes,
2648 dest_state->entrance_nodes,
2649 new_dest_nodes);
2650 if (BE (err != REG_NOERROR, 0))
2652 re_node_set_free (&dest_nodes);
2653 goto free_return;
2655 mctx->state_log[dest_str_idx]
2656 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2657 re_node_set_free (&dest_nodes);
2658 if (BE (mctx->state_log[dest_str_idx] == NULL
2659 && err != REG_NOERROR, 0))
2660 goto free_return;
2662 /* We need to check recursively if the backreference can epsilon
2663 transit. */
2664 if (subexp_len == 0
2665 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2667 err = check_subexp_matching_top (mctx, new_dest_nodes,
2668 cur_str_idx);
2669 if (BE (err != REG_NOERROR, 0))
2670 goto free_return;
2671 err = transit_state_bkref (mctx, new_dest_nodes);
2672 if (BE (err != REG_NOERROR, 0))
2673 goto free_return;
2677 err = REG_NOERROR;
2678 free_return:
2679 return err;
2682 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2683 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2684 Note that we might collect inappropriate candidates here.
2685 However, the cost of checking them strictly here is too high, then we
2686 delay these checking for prune_impossible_nodes(). */
2688 static reg_errcode_t
2689 internal_function
2690 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2692 const re_dfa_t *const dfa = mctx->dfa;
2693 int subexp_num, sub_top_idx;
2694 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2695 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2696 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2697 if (cache_idx != -1)
2699 const struct re_backref_cache_entry *entry
2700 = mctx->bkref_ents + cache_idx;
2702 if (entry->node == bkref_node)
2703 return REG_NOERROR; /* We already checked it. */
2704 while (entry++->more);
2707 subexp_num = dfa->nodes[bkref_node].opr.idx;
2709 /* For each sub expression */
2710 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2712 reg_errcode_t err;
2713 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2714 re_sub_match_last_t *sub_last;
2715 int sub_last_idx, sl_str, bkref_str_off;
2717 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2718 continue; /* It isn't related. */
2720 sl_str = sub_top->str_idx;
2721 bkref_str_off = bkref_str_idx;
2722 /* At first, check the last node of sub expressions we already
2723 evaluated. */
2724 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2726 int sl_str_diff;
2727 sub_last = sub_top->lasts[sub_last_idx];
2728 sl_str_diff = sub_last->str_idx - sl_str;
2729 /* The matched string by the sub expression match with the substring
2730 at the back reference? */
2731 if (sl_str_diff > 0)
2733 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2735 /* Not enough chars for a successful match. */
2736 if (bkref_str_off + sl_str_diff > mctx->input.len)
2737 break;
2739 err = clean_state_log_if_needed (mctx,
2740 bkref_str_off
2741 + sl_str_diff);
2742 if (BE (err != REG_NOERROR, 0))
2743 return err;
2744 buf = (const char *) re_string_get_buffer (&mctx->input);
2746 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2747 /* We don't need to search this sub expression any more. */
2748 break;
2750 bkref_str_off += sl_str_diff;
2751 sl_str += sl_str_diff;
2752 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2753 bkref_str_idx);
2755 /* Reload buf, since the preceding call might have reallocated
2756 the buffer. */
2757 buf = (const char *) re_string_get_buffer (&mctx->input);
2759 if (err == REG_NOMATCH)
2760 continue;
2761 if (BE (err != REG_NOERROR, 0))
2762 return err;
2765 if (sub_last_idx < sub_top->nlasts)
2766 continue;
2767 if (sub_last_idx > 0)
2768 ++sl_str;
2769 /* Then, search for the other last nodes of the sub expression. */
2770 for (; sl_str <= bkref_str_idx; ++sl_str)
2772 int cls_node, sl_str_off;
2773 const re_node_set *nodes;
2774 sl_str_off = sl_str - sub_top->str_idx;
2775 /* The matched string by the sub expression match with the substring
2776 at the back reference? */
2777 if (sl_str_off > 0)
2779 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2781 /* If we are at the end of the input, we cannot match. */
2782 if (bkref_str_off >= mctx->input.len)
2783 break;
2785 err = extend_buffers (mctx);
2786 if (BE (err != REG_NOERROR, 0))
2787 return err;
2789 buf = (const char *) re_string_get_buffer (&mctx->input);
2791 if (buf [bkref_str_off++] != buf[sl_str - 1])
2792 break; /* We don't need to search this sub expression
2793 any more. */
2795 if (mctx->state_log[sl_str] == NULL)
2796 continue;
2797 /* Does this state have a ')' of the sub expression? */
2798 nodes = &mctx->state_log[sl_str]->nodes;
2799 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2800 OP_CLOSE_SUBEXP);
2801 if (cls_node == -1)
2802 continue; /* No. */
2803 if (sub_top->path == NULL)
2805 sub_top->path = calloc (sizeof (state_array_t),
2806 sl_str - sub_top->str_idx + 1);
2807 if (sub_top->path == NULL)
2808 return REG_ESPACE;
2810 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2811 in the current context? */
2812 err = check_arrival (mctx, sub_top->path, sub_top->node,
2813 sub_top->str_idx, cls_node, sl_str,
2814 OP_CLOSE_SUBEXP);
2815 if (err == REG_NOMATCH)
2816 continue;
2817 if (BE (err != REG_NOERROR, 0))
2818 return err;
2819 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2820 if (BE (sub_last == NULL, 0))
2821 return REG_ESPACE;
2822 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2823 bkref_str_idx);
2824 if (err == REG_NOMATCH)
2825 continue;
2828 return REG_NOERROR;
2831 /* Helper functions for get_subexp(). */
2833 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2834 If it can arrive, register the sub expression expressed with SUB_TOP
2835 and SUB_LAST. */
2837 static reg_errcode_t
2838 internal_function
2839 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2840 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2842 reg_errcode_t err;
2843 int to_idx;
2844 /* Can the subexpression arrive the back reference? */
2845 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2846 sub_last->str_idx, bkref_node, bkref_str,
2847 OP_OPEN_SUBEXP);
2848 if (err != REG_NOERROR)
2849 return err;
2850 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2851 sub_last->str_idx);
2852 if (BE (err != REG_NOERROR, 0))
2853 return err;
2854 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2855 return clean_state_log_if_needed (mctx, to_idx);
2858 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2859 Search '(' if FL_OPEN, or search ')' otherwise.
2860 TODO: This function isn't efficient...
2861 Because there might be more than one nodes whose types are
2862 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2863 nodes.
2864 E.g. RE: (a){2} */
2866 static int
2867 internal_function
2868 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2869 int subexp_idx, int type)
2871 int cls_idx;
2872 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2874 int cls_node = nodes->elems[cls_idx];
2875 const re_token_t *node = dfa->nodes + cls_node;
2876 if (node->type == type
2877 && node->opr.idx == subexp_idx)
2878 return cls_node;
2880 return -1;
2883 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2884 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2885 heavily reused.
2886 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2888 static reg_errcode_t
2889 internal_function
2890 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2891 int top_str, int last_node, int last_str, int type)
2893 const re_dfa_t *const dfa = mctx->dfa;
2894 reg_errcode_t err = REG_NOERROR;
2895 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2896 re_dfastate_t *cur_state = NULL;
2897 re_node_set *cur_nodes, next_nodes;
2898 re_dfastate_t **backup_state_log;
2899 unsigned int context;
2901 subexp_num = dfa->nodes[top_node].opr.idx;
2902 /* Extend the buffer if we need. */
2903 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2905 re_dfastate_t **new_array;
2906 int old_alloc = path->alloc;
2907 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2908 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2909 if (BE (new_array == NULL, 0))
2911 path->alloc = old_alloc;
2912 return REG_ESPACE;
2914 path->array = new_array;
2915 memset (new_array + old_alloc, '\0',
2916 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2919 str_idx = path->next_idx ? path->next_idx : top_str;
2921 /* Temporary modify MCTX. */
2922 backup_state_log = mctx->state_log;
2923 backup_cur_idx = mctx->input.cur_idx;
2924 mctx->state_log = path->array;
2925 mctx->input.cur_idx = str_idx;
2927 /* Setup initial node set. */
2928 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2929 if (str_idx == top_str)
2931 err = re_node_set_init_1 (&next_nodes, top_node);
2932 if (BE (err != REG_NOERROR, 0))
2933 return err;
2934 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2935 if (BE (err != REG_NOERROR, 0))
2937 re_node_set_free (&next_nodes);
2938 return err;
2941 else
2943 cur_state = mctx->state_log[str_idx];
2944 if (cur_state && cur_state->has_backref)
2946 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2947 if (BE (err != REG_NOERROR, 0))
2948 return err;
2950 else
2951 re_node_set_init_empty (&next_nodes);
2953 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2955 if (next_nodes.nelem)
2957 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2958 subexp_num, type);
2959 if (BE (err != REG_NOERROR, 0))
2961 re_node_set_free (&next_nodes);
2962 return err;
2965 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2966 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2968 re_node_set_free (&next_nodes);
2969 return err;
2971 mctx->state_log[str_idx] = cur_state;
2974 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2976 re_node_set_empty (&next_nodes);
2977 if (mctx->state_log[str_idx + 1])
2979 err = re_node_set_merge (&next_nodes,
2980 &mctx->state_log[str_idx + 1]->nodes);
2981 if (BE (err != REG_NOERROR, 0))
2983 re_node_set_free (&next_nodes);
2984 return err;
2987 if (cur_state)
2989 err = check_arrival_add_next_nodes (mctx, str_idx,
2990 &cur_state->non_eps_nodes,
2991 &next_nodes);
2992 if (BE (err != REG_NOERROR, 0))
2994 re_node_set_free (&next_nodes);
2995 return err;
2998 ++str_idx;
2999 if (next_nodes.nelem)
3001 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3002 if (BE (err != REG_NOERROR, 0))
3004 re_node_set_free (&next_nodes);
3005 return err;
3007 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3008 subexp_num, type);
3009 if (BE (err != REG_NOERROR, 0))
3011 re_node_set_free (&next_nodes);
3012 return err;
3015 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3016 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3017 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3019 re_node_set_free (&next_nodes);
3020 return err;
3022 mctx->state_log[str_idx] = cur_state;
3023 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3025 re_node_set_free (&next_nodes);
3026 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3027 : &mctx->state_log[last_str]->nodes);
3028 path->next_idx = str_idx;
3030 /* Fix MCTX. */
3031 mctx->state_log = backup_state_log;
3032 mctx->input.cur_idx = backup_cur_idx;
3034 /* Then check the current node set has the node LAST_NODE. */
3035 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3036 return REG_NOERROR;
3038 return REG_NOMATCH;
3041 /* Helper functions for check_arrival. */
3043 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3044 to NEXT_NODES.
3045 TODO: This function is similar to the functions transit_state*(),
3046 however this function has many additional works.
3047 Can't we unify them? */
3049 static reg_errcode_t
3050 internal_function
3051 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3052 re_node_set *cur_nodes, re_node_set *next_nodes)
3054 const re_dfa_t *const dfa = mctx->dfa;
3055 int result;
3056 int cur_idx;
3057 #ifdef RE_ENABLE_I18N
3058 reg_errcode_t err = REG_NOERROR;
3059 #endif
3060 re_node_set union_set;
3061 re_node_set_init_empty (&union_set);
3062 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3064 int naccepted = 0;
3065 int cur_node = cur_nodes->elems[cur_idx];
3066 #ifdef DEBUG
3067 re_token_type_t type = dfa->nodes[cur_node].type;
3068 assert (!IS_EPSILON_NODE (type));
3069 #endif
3070 #ifdef RE_ENABLE_I18N
3071 /* If the node may accept `multi byte'. */
3072 if (dfa->nodes[cur_node].accept_mb)
3074 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3075 str_idx);
3076 if (naccepted > 1)
3078 re_dfastate_t *dest_state;
3079 int next_node = dfa->nexts[cur_node];
3080 int next_idx = str_idx + naccepted;
3081 dest_state = mctx->state_log[next_idx];
3082 re_node_set_empty (&union_set);
3083 if (dest_state)
3085 err = re_node_set_merge (&union_set, &dest_state->nodes);
3086 if (BE (err != REG_NOERROR, 0))
3088 re_node_set_free (&union_set);
3089 return err;
3092 result = re_node_set_insert (&union_set, next_node);
3093 if (BE (result < 0, 0))
3095 re_node_set_free (&union_set);
3096 return REG_ESPACE;
3098 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3099 &union_set);
3100 if (BE (mctx->state_log[next_idx] == NULL
3101 && err != REG_NOERROR, 0))
3103 re_node_set_free (&union_set);
3104 return err;
3108 #endif /* RE_ENABLE_I18N */
3109 if (naccepted
3110 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3112 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3113 if (BE (result < 0, 0))
3115 re_node_set_free (&union_set);
3116 return REG_ESPACE;
3120 re_node_set_free (&union_set);
3121 return REG_NOERROR;
3124 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3125 CUR_NODES, however exclude the nodes which are:
3126 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3127 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3130 static reg_errcode_t
3131 internal_function
3132 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3133 int ex_subexp, int type)
3135 reg_errcode_t err;
3136 int idx, outside_node;
3137 re_node_set new_nodes;
3138 #ifdef DEBUG
3139 assert (cur_nodes->nelem);
3140 #endif
3141 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3142 if (BE (err != REG_NOERROR, 0))
3143 return err;
3144 /* Create a new node set NEW_NODES with the nodes which are epsilon
3145 closures of the node in CUR_NODES. */
3147 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3149 int cur_node = cur_nodes->elems[idx];
3150 const re_node_set *eclosure = dfa->eclosures + cur_node;
3151 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3152 if (outside_node == -1)
3154 /* There are no problematic nodes, just merge them. */
3155 err = re_node_set_merge (&new_nodes, eclosure);
3156 if (BE (err != REG_NOERROR, 0))
3158 re_node_set_free (&new_nodes);
3159 return err;
3162 else
3164 /* There are problematic nodes, re-calculate incrementally. */
3165 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3166 ex_subexp, type);
3167 if (BE (err != REG_NOERROR, 0))
3169 re_node_set_free (&new_nodes);
3170 return err;
3174 re_node_set_free (cur_nodes);
3175 *cur_nodes = new_nodes;
3176 return REG_NOERROR;
3179 /* Helper function for check_arrival_expand_ecl.
3180 Check incrementally the epsilon closure of TARGET, and if it isn't
3181 problematic append it to DST_NODES. */
3183 static reg_errcode_t
3184 internal_function
3185 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3186 int target, int ex_subexp, int type)
3188 int cur_node;
3189 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3191 int err;
3193 if (dfa->nodes[cur_node].type == type
3194 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3196 if (type == OP_CLOSE_SUBEXP)
3198 err = re_node_set_insert (dst_nodes, cur_node);
3199 if (BE (err == -1, 0))
3200 return REG_ESPACE;
3202 break;
3204 err = re_node_set_insert (dst_nodes, cur_node);
3205 if (BE (err == -1, 0))
3206 return REG_ESPACE;
3207 if (dfa->edests[cur_node].nelem == 0)
3208 break;
3209 if (dfa->edests[cur_node].nelem == 2)
3211 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3212 dfa->edests[cur_node].elems[1],
3213 ex_subexp, type);
3214 if (BE (err != REG_NOERROR, 0))
3215 return err;
3217 cur_node = dfa->edests[cur_node].elems[0];
3219 return REG_NOERROR;
3223 /* For all the back references in the current state, calculate the
3224 destination of the back references by the appropriate entry
3225 in MCTX->BKREF_ENTS. */
3227 static reg_errcode_t
3228 internal_function
3229 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3230 int cur_str, int subexp_num, int type)
3232 const re_dfa_t *const dfa = mctx->dfa;
3233 reg_errcode_t err;
3234 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3235 struct re_backref_cache_entry *ent;
3237 if (cache_idx_start == -1)
3238 return REG_NOERROR;
3240 restart:
3241 ent = mctx->bkref_ents + cache_idx_start;
3244 int to_idx, next_node;
3246 /* Is this entry ENT is appropriate? */
3247 if (!re_node_set_contains (cur_nodes, ent->node))
3248 continue; /* No. */
3250 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3251 /* Calculate the destination of the back reference, and append it
3252 to MCTX->STATE_LOG. */
3253 if (to_idx == cur_str)
3255 /* The backreference did epsilon transit, we must re-check all the
3256 node in the current state. */
3257 re_node_set new_dests;
3258 reg_errcode_t err2, err3;
3259 next_node = dfa->edests[ent->node].elems[0];
3260 if (re_node_set_contains (cur_nodes, next_node))
3261 continue;
3262 err = re_node_set_init_1 (&new_dests, next_node);
3263 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3264 err3 = re_node_set_merge (cur_nodes, &new_dests);
3265 re_node_set_free (&new_dests);
3266 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3267 || err3 != REG_NOERROR, 0))
3269 err = (err != REG_NOERROR ? err
3270 : (err2 != REG_NOERROR ? err2 : err3));
3271 return err;
3273 /* TODO: It is still inefficient... */
3274 goto restart;
3276 else
3278 re_node_set union_set;
3279 next_node = dfa->nexts[ent->node];
3280 if (mctx->state_log[to_idx])
3282 int ret;
3283 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3284 next_node))
3285 continue;
3286 err = re_node_set_init_copy (&union_set,
3287 &mctx->state_log[to_idx]->nodes);
3288 ret = re_node_set_insert (&union_set, next_node);
3289 if (BE (err != REG_NOERROR || ret < 0, 0))
3291 re_node_set_free (&union_set);
3292 err = err != REG_NOERROR ? err : REG_ESPACE;
3293 return err;
3296 else
3298 err = re_node_set_init_1 (&union_set, next_node);
3299 if (BE (err != REG_NOERROR, 0))
3300 return err;
3302 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3303 re_node_set_free (&union_set);
3304 if (BE (mctx->state_log[to_idx] == NULL
3305 && err != REG_NOERROR, 0))
3306 return err;
3309 while (ent++->more);
3310 return REG_NOERROR;
3313 /* Build transition table for the state.
3314 Return 1 if succeeded, otherwise return NULL. */
3316 static int
3317 internal_function
3318 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3320 reg_errcode_t err;
3321 int i, j, ch, need_word_trtable = 0;
3322 bitset_word_t elem, mask;
3323 bool dests_node_malloced = false;
3324 bool dest_states_malloced = false;
3325 int ndests; /* Number of the destination states from `state'. */
3326 re_dfastate_t **trtable;
3327 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3328 re_node_set follows, *dests_node;
3329 bitset_t *dests_ch;
3330 bitset_t acceptable;
3332 struct dests_alloc
3334 re_node_set dests_node[SBC_MAX];
3335 bitset_t dests_ch[SBC_MAX];
3336 } *dests_alloc;
3338 /* We build DFA states which corresponds to the destination nodes
3339 from `state'. `dests_node[i]' represents the nodes which i-th
3340 destination state contains, and `dests_ch[i]' represents the
3341 characters which i-th destination state accepts. */
3342 #ifdef HAVE_ALLOCA
3343 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3344 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3345 else
3346 #endif
3348 dests_alloc = re_malloc (struct dests_alloc, 1);
3349 if (BE (dests_alloc == NULL, 0))
3350 return 0;
3351 dests_node_malloced = true;
3353 dests_node = dests_alloc->dests_node;
3354 dests_ch = dests_alloc->dests_ch;
3356 /* Initialize transiton table. */
3357 state->word_trtable = state->trtable = NULL;
3359 /* At first, group all nodes belonging to `state' into several
3360 destinations. */
3361 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3362 if (BE (ndests <= 0, 0))
3364 if (dests_node_malloced)
3365 free (dests_alloc);
3366 /* Return 0 in case of an error, 1 otherwise. */
3367 if (ndests == 0)
3369 state->trtable = (re_dfastate_t **)
3370 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3371 return 1;
3373 return 0;
3376 err = re_node_set_alloc (&follows, ndests + 1);
3377 if (BE (err != REG_NOERROR, 0))
3378 goto out_free;
3380 /* Avoid arithmetic overflow in size calculation. */
3381 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3382 / (3 * sizeof (re_dfastate_t *)))
3383 < ndests),
3385 goto out_free;
3387 #ifdef HAVE_ALLOCA
3388 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3389 + ndests * 3 * sizeof (re_dfastate_t *)))
3390 dest_states = (re_dfastate_t **)
3391 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3392 else
3393 #endif
3395 dest_states = (re_dfastate_t **)
3396 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3397 if (BE (dest_states == NULL, 0))
3399 out_free:
3400 if (dest_states_malloced)
3401 free (dest_states);
3402 re_node_set_free (&follows);
3403 for (i = 0; i < ndests; ++i)
3404 re_node_set_free (dests_node + i);
3405 if (dests_node_malloced)
3406 free (dests_alloc);
3407 return 0;
3409 dest_states_malloced = true;
3411 dest_states_word = dest_states + ndests;
3412 dest_states_nl = dest_states_word + ndests;
3413 bitset_empty (acceptable);
3415 /* Then build the states for all destinations. */
3416 for (i = 0; i < ndests; ++i)
3418 int next_node;
3419 re_node_set_empty (&follows);
3420 /* Merge the follows of this destination states. */
3421 for (j = 0; j < dests_node[i].nelem; ++j)
3423 next_node = dfa->nexts[dests_node[i].elems[j]];
3424 if (next_node != -1)
3426 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3427 if (BE (err != REG_NOERROR, 0))
3428 goto out_free;
3431 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3432 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3433 goto out_free;
3434 /* If the new state has context constraint,
3435 build appropriate states for these contexts. */
3436 if (dest_states[i]->has_constraint)
3438 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3439 CONTEXT_WORD);
3440 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3441 goto out_free;
3443 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3444 need_word_trtable = 1;
3446 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3447 CONTEXT_NEWLINE);
3448 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3449 goto out_free;
3451 else
3453 dest_states_word[i] = dest_states[i];
3454 dest_states_nl[i] = dest_states[i];
3456 bitset_merge (acceptable, dests_ch[i]);
3459 if (!BE (need_word_trtable, 0))
3461 /* We don't care about whether the following character is a word
3462 character, or we are in a single-byte character set so we can
3463 discern by looking at the character code: allocate a
3464 256-entry transition table. */
3465 trtable = state->trtable =
3466 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3467 if (BE (trtable == NULL, 0))
3468 goto out_free;
3470 /* For all characters ch...: */
3471 for (i = 0; i < BITSET_WORDS; ++i)
3472 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3473 elem;
3474 mask <<= 1, elem >>= 1, ++ch)
3475 if (BE (elem & 1, 0))
3477 /* There must be exactly one destination which accepts
3478 character ch. See group_nodes_into_DFAstates. */
3479 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3482 /* j-th destination accepts the word character ch. */
3483 if (dfa->word_char[i] & mask)
3484 trtable[ch] = dest_states_word[j];
3485 else
3486 trtable[ch] = dest_states[j];
3489 else
3491 /* We care about whether the following character is a word
3492 character, and we are in a multi-byte character set: discern
3493 by looking at the character code: build two 256-entry
3494 transition tables, one starting at trtable[0] and one
3495 starting at trtable[SBC_MAX]. */
3496 trtable = state->word_trtable =
3497 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3498 if (BE (trtable == NULL, 0))
3499 goto out_free;
3501 /* For all characters ch...: */
3502 for (i = 0; i < BITSET_WORDS; ++i)
3503 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3504 elem;
3505 mask <<= 1, elem >>= 1, ++ch)
3506 if (BE (elem & 1, 0))
3508 /* There must be exactly one destination which accepts
3509 character ch. See group_nodes_into_DFAstates. */
3510 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3513 /* j-th destination accepts the word character ch. */
3514 trtable[ch] = dest_states[j];
3515 trtable[ch + SBC_MAX] = dest_states_word[j];
3519 /* new line */
3520 if (bitset_contain (acceptable, NEWLINE_CHAR))
3522 /* The current state accepts newline character. */
3523 for (j = 0; j < ndests; ++j)
3524 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3526 /* k-th destination accepts newline character. */
3527 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3528 if (need_word_trtable)
3529 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3530 /* There must be only one destination which accepts
3531 newline. See group_nodes_into_DFAstates. */
3532 break;
3536 if (dest_states_malloced)
3537 free (dest_states);
3539 re_node_set_free (&follows);
3540 for (i = 0; i < ndests; ++i)
3541 re_node_set_free (dests_node + i);
3543 if (dests_node_malloced)
3544 free (dests_alloc);
3546 return 1;
3549 /* Group all nodes belonging to STATE into several destinations.
3550 Then for all destinations, set the nodes belonging to the destination
3551 to DESTS_NODE[i] and set the characters accepted by the destination
3552 to DEST_CH[i]. This function return the number of destinations. */
3554 static int
3555 internal_function
3556 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3557 re_node_set *dests_node, bitset_t *dests_ch)
3559 reg_errcode_t err;
3560 int result;
3561 int i, j, k;
3562 int ndests; /* Number of the destinations from `state'. */
3563 bitset_t accepts; /* Characters a node can accept. */
3564 const re_node_set *cur_nodes = &state->nodes;
3565 bitset_empty (accepts);
3566 ndests = 0;
3568 /* For all the nodes belonging to `state', */
3569 for (i = 0; i < cur_nodes->nelem; ++i)
3571 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3572 re_token_type_t type = node->type;
3573 unsigned int constraint = node->constraint;
3575 /* Enumerate all single byte character this node can accept. */
3576 if (type == CHARACTER)
3577 bitset_set (accepts, node->opr.c);
3578 else if (type == SIMPLE_BRACKET)
3580 bitset_merge (accepts, node->opr.sbcset);
3582 else if (type == OP_PERIOD)
3584 #ifdef RE_ENABLE_I18N
3585 if (dfa->mb_cur_max > 1)
3586 bitset_merge (accepts, dfa->sb_char);
3587 else
3588 #endif
3589 bitset_set_all (accepts);
3590 if (!(dfa->syntax & RE_DOT_NEWLINE))
3591 bitset_clear (accepts, '\n');
3592 if (dfa->syntax & RE_DOT_NOT_NULL)
3593 bitset_clear (accepts, '\0');
3595 #ifdef RE_ENABLE_I18N
3596 else if (type == OP_UTF8_PERIOD)
3598 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3599 if (!(dfa->syntax & RE_DOT_NEWLINE))
3600 bitset_clear (accepts, '\n');
3601 if (dfa->syntax & RE_DOT_NOT_NULL)
3602 bitset_clear (accepts, '\0');
3604 #endif
3605 else
3606 continue;
3608 /* Check the `accepts' and sift the characters which are not
3609 match it the context. */
3610 if (constraint)
3612 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3614 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3615 bitset_empty (accepts);
3616 if (accepts_newline)
3617 bitset_set (accepts, NEWLINE_CHAR);
3618 else
3619 continue;
3621 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3623 bitset_empty (accepts);
3624 continue;
3627 if (constraint & NEXT_WORD_CONSTRAINT)
3629 bitset_word_t any_set = 0;
3630 if (type == CHARACTER && !node->word_char)
3632 bitset_empty (accepts);
3633 continue;
3635 #ifdef RE_ENABLE_I18N
3636 if (dfa->mb_cur_max > 1)
3637 for (j = 0; j < BITSET_WORDS; ++j)
3638 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3639 else
3640 #endif
3641 for (j = 0; j < BITSET_WORDS; ++j)
3642 any_set |= (accepts[j] &= dfa->word_char[j]);
3643 if (!any_set)
3644 continue;
3646 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3648 bitset_word_t any_set = 0;
3649 if (type == CHARACTER && node->word_char)
3651 bitset_empty (accepts);
3652 continue;
3654 #ifdef RE_ENABLE_I18N
3655 if (dfa->mb_cur_max > 1)
3656 for (j = 0; j < BITSET_WORDS; ++j)
3657 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3658 else
3659 #endif
3660 for (j = 0; j < BITSET_WORDS; ++j)
3661 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3662 if (!any_set)
3663 continue;
3667 /* Then divide `accepts' into DFA states, or create a new
3668 state. Above, we make sure that accepts is not empty. */
3669 for (j = 0; j < ndests; ++j)
3671 bitset_t intersec; /* Intersection sets, see below. */
3672 bitset_t remains;
3673 /* Flags, see below. */
3674 bitset_word_t has_intersec, not_subset, not_consumed;
3676 /* Optimization, skip if this state doesn't accept the character. */
3677 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3678 continue;
3680 /* Enumerate the intersection set of this state and `accepts'. */
3681 has_intersec = 0;
3682 for (k = 0; k < BITSET_WORDS; ++k)
3683 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3684 /* And skip if the intersection set is empty. */
3685 if (!has_intersec)
3686 continue;
3688 /* Then check if this state is a subset of `accepts'. */
3689 not_subset = not_consumed = 0;
3690 for (k = 0; k < BITSET_WORDS; ++k)
3692 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3693 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3696 /* If this state isn't a subset of `accepts', create a
3697 new group state, which has the `remains'. */
3698 if (not_subset)
3700 bitset_copy (dests_ch[ndests], remains);
3701 bitset_copy (dests_ch[j], intersec);
3702 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3703 if (BE (err != REG_NOERROR, 0))
3704 goto error_return;
3705 ++ndests;
3708 /* Put the position in the current group. */
3709 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3710 if (BE (result < 0, 0))
3711 goto error_return;
3713 /* If all characters are consumed, go to next node. */
3714 if (!not_consumed)
3715 break;
3717 /* Some characters remain, create a new group. */
3718 if (j == ndests)
3720 bitset_copy (dests_ch[ndests], accepts);
3721 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3722 if (BE (err != REG_NOERROR, 0))
3723 goto error_return;
3724 ++ndests;
3725 bitset_empty (accepts);
3728 return ndests;
3729 error_return:
3730 for (j = 0; j < ndests; ++j)
3731 re_node_set_free (dests_node + j);
3732 return -1;
3735 #ifdef RE_ENABLE_I18N
3736 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3737 Return the number of the bytes the node accepts.
3738 STR_IDX is the current index of the input string.
3740 This function handles the nodes which can accept one character, or
3741 one collating element like '.', '[a-z]', opposite to the other nodes
3742 can only accept one byte. */
3744 static int
3745 internal_function
3746 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3747 const re_string_t *input, int str_idx)
3749 const re_token_t *node = dfa->nodes + node_idx;
3750 int char_len, elem_len;
3751 int i;
3752 wint_t wc;
3754 if (BE (node->type == OP_UTF8_PERIOD, 0))
3756 unsigned char c = re_string_byte_at (input, str_idx), d;
3757 if (BE (c < 0xc2, 1))
3758 return 0;
3760 if (str_idx + 2 > input->len)
3761 return 0;
3763 d = re_string_byte_at (input, str_idx + 1);
3764 if (c < 0xe0)
3765 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3766 else if (c < 0xf0)
3768 char_len = 3;
3769 if (c == 0xe0 && d < 0xa0)
3770 return 0;
3772 else if (c < 0xf8)
3774 char_len = 4;
3775 if (c == 0xf0 && d < 0x90)
3776 return 0;
3778 else if (c < 0xfc)
3780 char_len = 5;
3781 if (c == 0xf8 && d < 0x88)
3782 return 0;
3784 else if (c < 0xfe)
3786 char_len = 6;
3787 if (c == 0xfc && d < 0x84)
3788 return 0;
3790 else
3791 return 0;
3793 if (str_idx + char_len > input->len)
3794 return 0;
3796 for (i = 1; i < char_len; ++i)
3798 d = re_string_byte_at (input, str_idx + i);
3799 if (d < 0x80 || d > 0xbf)
3800 return 0;
3802 return char_len;
3805 char_len = re_string_char_size_at (input, str_idx);
3806 if (node->type == OP_PERIOD)
3808 if (char_len <= 1)
3809 return 0;
3810 /* FIXME: I don't think this if is needed, as both '\n'
3811 and '\0' are char_len == 1. */
3812 /* '.' accepts any one character except the following two cases. */
3813 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3814 re_string_byte_at (input, str_idx) == '\n') ||
3815 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3816 re_string_byte_at (input, str_idx) == '\0'))
3817 return 0;
3818 return char_len;
3821 elem_len = re_string_elem_size_at (input, str_idx);
3822 wc = __btowc(*(input->mbs+str_idx));
3823 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3824 return 0;
3826 if (node->type == COMPLEX_BRACKET)
3828 const re_charset_t *cset = node->opr.mbcset;
3829 # ifdef _LIBC
3830 const unsigned char *pin
3831 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3832 int j;
3833 uint32_t nrules;
3834 # endif /* _LIBC */
3835 int match_len = 0;
3836 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3837 ? re_string_wchar_at (input, str_idx) : 0);
3839 /* match with multibyte character? */
3840 for (i = 0; i < cset->nmbchars; ++i)
3841 if (wc == cset->mbchars[i])
3843 match_len = char_len;
3844 goto check_node_accept_bytes_match;
3846 /* match with character_class? */
3847 for (i = 0; i < cset->nchar_classes; ++i)
3849 wctype_t wt = cset->char_classes[i];
3850 if (__iswctype (wc, wt))
3852 match_len = char_len;
3853 goto check_node_accept_bytes_match;
3857 # ifdef _LIBC
3858 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3859 if (nrules != 0)
3861 unsigned int in_collseq = 0;
3862 const int32_t *table, *indirect;
3863 const unsigned char *weights, *extra;
3864 const char *collseqwc;
3865 /* This #include defines a local function! */
3866 # include <locale/weight.h>
3868 /* match with collating_symbol? */
3869 if (cset->ncoll_syms)
3870 extra = (const unsigned char *)
3871 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3872 for (i = 0; i < cset->ncoll_syms; ++i)
3874 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3875 /* Compare the length of input collating element and
3876 the length of current collating element. */
3877 if (*coll_sym != elem_len)
3878 continue;
3879 /* Compare each bytes. */
3880 for (j = 0; j < *coll_sym; j++)
3881 if (pin[j] != coll_sym[1 + j])
3882 break;
3883 if (j == *coll_sym)
3885 /* Match if every bytes is equal. */
3886 match_len = j;
3887 goto check_node_accept_bytes_match;
3891 if (cset->nranges)
3893 if (elem_len <= char_len)
3895 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3896 in_collseq = __collseq_table_lookup (collseqwc, wc);
3898 else
3899 in_collseq = find_collation_sequence_value (pin, elem_len);
3901 /* match with range expression? */
3902 for (i = 0; i < cset->nranges; ++i)
3903 if (cset->range_starts[i] <= in_collseq
3904 && in_collseq <= cset->range_ends[i])
3906 match_len = elem_len;
3907 goto check_node_accept_bytes_match;
3910 /* match with equivalence_class? */
3911 if (cset->nequiv_classes)
3913 const unsigned char *cp = pin;
3914 table = (const int32_t *)
3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3916 weights = (const unsigned char *)
3917 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3918 extra = (const unsigned char *)
3919 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3920 indirect = (const int32_t *)
3921 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3922 int32_t idx = findidx (&cp);
3923 if (idx > 0)
3924 for (i = 0; i < cset->nequiv_classes; ++i)
3926 int32_t equiv_class_idx = cset->equiv_classes[i];
3927 size_t weight_len = weights[idx & 0xffffff];
3928 if (weight_len == weights[equiv_class_idx & 0xffffff]
3929 && (idx >> 24) == (equiv_class_idx >> 24))
3931 int cnt = 0;
3933 idx &= 0xffffff;
3934 equiv_class_idx &= 0xffffff;
3936 while (cnt <= weight_len
3937 && (weights[equiv_class_idx + 1 + cnt]
3938 == weights[idx + 1 + cnt]))
3939 ++cnt;
3940 if (cnt > weight_len)
3942 match_len = elem_len;
3943 goto check_node_accept_bytes_match;
3949 else
3950 # endif /* _LIBC */
3952 /* match with range expression? */
3953 #if __GNUC__ >= 2
3954 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3955 #else
3956 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3957 cmp_buf[2] = wc;
3958 #endif
3959 for (i = 0; i < cset->nranges; ++i)
3961 cmp_buf[0] = cset->range_starts[i];
3962 cmp_buf[4] = cset->range_ends[i];
3963 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3964 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3966 match_len = char_len;
3967 goto check_node_accept_bytes_match;
3971 check_node_accept_bytes_match:
3972 if (!cset->non_match)
3973 return match_len;
3974 else
3976 if (match_len > 0)
3977 return 0;
3978 else
3979 return (elem_len > char_len) ? elem_len : char_len;
3982 return 0;
3985 # ifdef _LIBC
3986 static unsigned int
3987 internal_function
3988 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3990 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3991 if (nrules == 0)
3993 if (mbs_len == 1)
3995 /* No valid character. Match it as a single byte character. */
3996 const unsigned char *collseq = (const unsigned char *)
3997 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3998 return collseq[mbs[0]];
4000 return UINT_MAX;
4002 else
4004 int32_t idx;
4005 const unsigned char *extra = (const unsigned char *)
4006 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4007 int32_t extrasize = (const unsigned char *)
4008 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4010 for (idx = 0; idx < extrasize;)
4012 int mbs_cnt, found = 0;
4013 int32_t elem_mbs_len;
4014 /* Skip the name of collating element name. */
4015 idx = idx + extra[idx] + 1;
4016 elem_mbs_len = extra[idx++];
4017 if (mbs_len == elem_mbs_len)
4019 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4020 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4021 break;
4022 if (mbs_cnt == elem_mbs_len)
4023 /* Found the entry. */
4024 found = 1;
4026 /* Skip the byte sequence of the collating element. */
4027 idx += elem_mbs_len;
4028 /* Adjust for the alignment. */
4029 idx = (idx + 3) & ~3;
4030 /* Skip the collation sequence value. */
4031 idx += sizeof (uint32_t);
4032 /* Skip the wide char sequence of the collating element. */
4033 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4034 /* If we found the entry, return the sequence value. */
4035 if (found)
4036 return *(uint32_t *) (extra + idx);
4037 /* Skip the collation sequence value. */
4038 idx += sizeof (uint32_t);
4040 return UINT_MAX;
4043 # endif /* _LIBC */
4044 #endif /* RE_ENABLE_I18N */
4046 /* Check whether the node accepts the byte which is IDX-th
4047 byte of the INPUT. */
4049 static int
4050 internal_function
4051 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4052 int idx)
4054 unsigned char ch;
4055 ch = re_string_byte_at (&mctx->input, idx);
4056 switch (node->type)
4058 case CHARACTER:
4059 if (node->opr.c != ch)
4060 return 0;
4061 break;
4063 case SIMPLE_BRACKET:
4064 if (!bitset_contain (node->opr.sbcset, ch))
4065 return 0;
4066 break;
4068 #ifdef RE_ENABLE_I18N
4069 case OP_UTF8_PERIOD:
4070 if (ch >= 0x80)
4071 return 0;
4072 /* FALLTHROUGH */
4073 #endif
4074 case OP_PERIOD:
4075 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4076 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4077 return 0;
4078 break;
4080 default:
4081 return 0;
4084 if (node->constraint)
4086 /* The node has constraints. Check whether the current context
4087 satisfies the constraints. */
4088 unsigned int context = re_string_context_at (&mctx->input, idx,
4089 mctx->eflags);
4090 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4091 return 0;
4094 return 1;
4097 /* Extend the buffers, if the buffers have run out. */
4099 static reg_errcode_t
4100 internal_function
4101 extend_buffers (re_match_context_t *mctx)
4103 reg_errcode_t ret;
4104 re_string_t *pstr = &mctx->input;
4106 /* Avoid overflow. */
4107 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4108 return REG_ESPACE;
4110 /* Double the lengthes of the buffers. */
4111 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4112 if (BE (ret != REG_NOERROR, 0))
4113 return ret;
4115 if (mctx->state_log != NULL)
4117 /* And double the length of state_log. */
4118 /* XXX We have no indication of the size of this buffer. If this
4119 allocation fail we have no indication that the state_log array
4120 does not have the right size. */
4121 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4122 pstr->bufs_len + 1);
4123 if (BE (new_array == NULL, 0))
4124 return REG_ESPACE;
4125 mctx->state_log = new_array;
4128 /* Then reconstruct the buffers. */
4129 if (pstr->icase)
4131 #ifdef RE_ENABLE_I18N
4132 if (pstr->mb_cur_max > 1)
4134 ret = build_wcs_upper_buffer (pstr);
4135 if (BE (ret != REG_NOERROR, 0))
4136 return ret;
4138 else
4139 #endif /* RE_ENABLE_I18N */
4140 build_upper_buffer (pstr);
4142 else
4144 #ifdef RE_ENABLE_I18N
4145 if (pstr->mb_cur_max > 1)
4146 build_wcs_buffer (pstr);
4147 else
4148 #endif /* RE_ENABLE_I18N */
4150 if (pstr->trans != NULL)
4151 re_string_translate_buffer (pstr);
4154 return REG_NOERROR;
4158 /* Functions for matching context. */
4160 /* Initialize MCTX. */
4162 static reg_errcode_t
4163 internal_function
4164 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4166 mctx->eflags = eflags;
4167 mctx->match_last = -1;
4168 if (n > 0)
4170 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4171 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4172 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4173 return REG_ESPACE;
4175 /* Already zero-ed by the caller.
4176 else
4177 mctx->bkref_ents = NULL;
4178 mctx->nbkref_ents = 0;
4179 mctx->nsub_tops = 0; */
4180 mctx->abkref_ents = n;
4181 mctx->max_mb_elem_len = 1;
4182 mctx->asub_tops = n;
4183 return REG_NOERROR;
4186 /* Clean the entries which depend on the current input in MCTX.
4187 This function must be invoked when the matcher changes the start index
4188 of the input, or changes the input string. */
4190 static void
4191 internal_function
4192 match_ctx_clean (re_match_context_t *mctx)
4194 int st_idx;
4195 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4197 int sl_idx;
4198 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4199 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4201 re_sub_match_last_t *last = top->lasts[sl_idx];
4202 re_free (last->path.array);
4203 re_free (last);
4205 re_free (top->lasts);
4206 if (top->path)
4208 re_free (top->path->array);
4209 re_free (top->path);
4211 free (top);
4214 mctx->nsub_tops = 0;
4215 mctx->nbkref_ents = 0;
4218 /* Free all the memory associated with MCTX. */
4220 static void
4221 internal_function
4222 match_ctx_free (re_match_context_t *mctx)
4224 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4225 match_ctx_clean (mctx);
4226 re_free (mctx->sub_tops);
4227 re_free (mctx->bkref_ents);
4230 /* Add a new backreference entry to MCTX.
4231 Note that we assume that caller never call this function with duplicate
4232 entry, and call with STR_IDX which isn't smaller than any existing entry.
4235 static reg_errcode_t
4236 internal_function
4237 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4238 int to)
4240 if (mctx->nbkref_ents >= mctx->abkref_ents)
4242 struct re_backref_cache_entry* new_entry;
4243 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4244 mctx->abkref_ents * 2);
4245 if (BE (new_entry == NULL, 0))
4247 re_free (mctx->bkref_ents);
4248 return REG_ESPACE;
4250 mctx->bkref_ents = new_entry;
4251 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4252 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4253 mctx->abkref_ents *= 2;
4255 if (mctx->nbkref_ents > 0
4256 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4257 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4259 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4260 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4261 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4262 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4264 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4265 If bit N is clear, means that this entry won't epsilon-transition to
4266 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4267 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4268 such node.
4270 A backreference does not epsilon-transition unless it is empty, so set
4271 to all zeros if FROM != TO. */
4272 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4273 = (from == to ? ~0 : 0);
4275 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4276 if (mctx->max_mb_elem_len < to - from)
4277 mctx->max_mb_elem_len = to - from;
4278 return REG_NOERROR;
4281 /* Search for the first entry which has the same str_idx, or -1 if none is
4282 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4284 static int
4285 internal_function
4286 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4288 int left, right, mid, last;
4289 last = right = mctx->nbkref_ents;
4290 for (left = 0; left < right;)
4292 mid = (left + right) / 2;
4293 if (mctx->bkref_ents[mid].str_idx < str_idx)
4294 left = mid + 1;
4295 else
4296 right = mid;
4298 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4299 return left;
4300 else
4301 return -1;
4304 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4305 at STR_IDX. */
4307 static reg_errcode_t
4308 internal_function
4309 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4311 #ifdef DEBUG
4312 assert (mctx->sub_tops != NULL);
4313 assert (mctx->asub_tops > 0);
4314 #endif
4315 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4317 int new_asub_tops = mctx->asub_tops * 2;
4318 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4319 re_sub_match_top_t *,
4320 new_asub_tops);
4321 if (BE (new_array == NULL, 0))
4322 return REG_ESPACE;
4323 mctx->sub_tops = new_array;
4324 mctx->asub_tops = new_asub_tops;
4326 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4327 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4328 return REG_ESPACE;
4329 mctx->sub_tops[mctx->nsub_tops]->node = node;
4330 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4331 return REG_NOERROR;
4334 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4335 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4337 static re_sub_match_last_t *
4338 internal_function
4339 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4341 re_sub_match_last_t *new_entry;
4342 if (BE (subtop->nlasts == subtop->alasts, 0))
4344 int new_alasts = 2 * subtop->alasts + 1;
4345 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4346 re_sub_match_last_t *,
4347 new_alasts);
4348 if (BE (new_array == NULL, 0))
4349 return NULL;
4350 subtop->lasts = new_array;
4351 subtop->alasts = new_alasts;
4353 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4354 if (BE (new_entry != NULL, 1))
4356 subtop->lasts[subtop->nlasts] = new_entry;
4357 new_entry->node = node;
4358 new_entry->str_idx = str_idx;
4359 ++subtop->nlasts;
4361 return new_entry;
4364 static void
4365 internal_function
4366 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4367 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4369 sctx->sifted_states = sifted_sts;
4370 sctx->limited_states = limited_sts;
4371 sctx->last_node = last_node;
4372 sctx->last_str_idx = last_str_idx;
4373 re_node_set_init_empty (&sctx->limits);