Remove "[Add new features here]" for 2.27
[glibc.git] / posix / regexec.c
blob8eb09dcfa0d329a3100bf74b67229d0db87b2d18
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 int n) internal_function;
24 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
25 static void match_ctx_free (re_match_context_t *cache) internal_function;
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to)
28 internal_function;
29 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 internal_function;
31 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
32 int str_idx) internal_function;
33 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
34 int node, int str_idx)
35 internal_function;
36 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
37 re_dfastate_t **limited_sts, int last_node,
38 int last_str_idx)
39 internal_function;
40 static reg_errcode_t re_search_internal (const regex_t *preg,
41 const char *string, int length,
42 int start, int range, int stop,
43 size_t nmatch, regmatch_t pmatch[],
44 int eflags) internal_function;
45 static int re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, int length1,
47 const char *string2, int length2,
48 int start, int range, struct re_registers *regs,
49 int stop, int ret_len) internal_function;
50 static int re_search_stub (struct re_pattern_buffer *bufp,
51 const char *string, int length, int start,
52 int range, int stop, struct re_registers *regs,
53 int ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 int nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
59 int *p_match_first) internal_function;
60 static int check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, int idx)
62 internal_function;
63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, int cur_node,
65 int cur_idx, int nmatch) internal_function;
66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 int str_idx, int dest_node, int nregs,
68 regmatch_t *regs,
69 re_node_set *eps_via_nodes)
70 internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76 internal_function;
78 #ifdef RE_ENABLE_I18N
79 static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 int node_idx, int str_idx, int max_str_idx)
82 internal_function;
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
86 internal_function;
87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, int str_idx,
89 re_node_set *cur_dest)
90 internal_function;
91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 int str_idx,
94 re_node_set *dest_nodes)
95 internal_function;
96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
99 internal_function;
100 static int check_dst_limits (const re_match_context_t *mctx,
101 re_node_set *limits,
102 int dst_node, int dst_idx, int src_node,
103 int src_idx) internal_function;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, int subexp_idx,
106 int from_node, int bkref_idx)
107 internal_function;
108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 int limit, int subexp_idx,
110 int node, int str_idx,
111 int bkref_idx) internal_function;
112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
115 re_node_set *limits,
116 struct re_backref_cache_entry *bkref_ents,
117 int str_idx) internal_function;
118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 int str_idx, const re_node_set *candidates)
121 internal_function;
122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **dst,
124 re_dfastate_t **src, int num)
125 internal_function;
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
134 internal_function;
135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 int str_idx) internal_function;
138 #if 0
139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
142 internal_function;
143 #endif
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
147 internal_function;
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
151 internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx)
154 internal_function;
155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 int bkref_node, int bkref_str)
159 internal_function;
160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 int subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, int top_node,
164 int top_str, int last_node, int last_str,
165 int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 int str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
170 internal_function;
171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 int ex_subexp, int type)
174 internal_function;
175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 int target, int ex_subexp,
178 int type) internal_function;
179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, int cur_str,
181 int subexp_num, int type)
182 internal_function;
183 static int build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
187 const re_string_t *input, int idx)
188 internal_function;
189 # ifdef _LIBC
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 size_t name_len)
192 internal_function;
193 # endif /* _LIBC */
194 #endif /* RE_ENABLE_I18N */
195 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199 static int check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, int idx)
201 internal_function;
202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203 internal_function;
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (const regex_t *__restrict preg, const char *__restrict string,
223 size_t nmatch, regmatch_t pmatch[], int eflags)
225 reg_errcode_t err;
226 int start, length;
227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
237 else
239 start = 0;
240 length = strlen (string);
243 __libc_lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
254 #ifdef _LIBC
255 # include <shlib-compat.h>
256 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec) __compat_regexec;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t *__restrict preg,
264 const char *__restrict string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
270 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271 # endif
272 #endif
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
304 re_match (struct re_pattern_buffer *bufp, const char *string, int length,
305 int start, struct re_registers *regs)
307 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
309 #ifdef _LIBC
310 weak_alias (__re_match, re_match)
311 #endif
314 re_search (struct re_pattern_buffer *bufp, const char *string, int length,
315 int start, int range, struct re_registers *regs)
317 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
319 #ifdef _LIBC
320 weak_alias (__re_search, re_search)
321 #endif
324 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
325 const char *string2, int length2, int start,
326 struct re_registers *regs, int stop)
328 return re_search_2_stub (bufp, string1, length1, string2, length2,
329 start, 0, regs, stop, 1);
331 #ifdef _LIBC
332 weak_alias (__re_match_2, re_match_2)
333 #endif
336 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
337 const char *string2, int length2, int start, int range,
338 struct re_registers *regs, int stop)
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, range, regs, stop, 0);
343 #ifdef _LIBC
344 weak_alias (__re_search_2, re_search_2)
345 #endif
347 static int
348 internal_function
349 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
350 int length1, const char *string2, int length2, int start,
351 int range, struct re_registers *regs,
352 int stop, int ret_len)
354 const char *str;
355 int rval;
356 int len = length1 + length2;
357 char *s = NULL;
359 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
360 return -2;
362 /* Concatenate the strings. */
363 if (length2 > 0)
364 if (length1 > 0)
366 s = re_malloc (char, len);
368 if (BE (s == NULL, 0))
369 return -2;
370 #ifdef _LIBC
371 memcpy (__mempcpy (s, string1, length1), string2, length2);
372 #else
373 memcpy (s, string1, length1);
374 memcpy (s + length1, string2, length2);
375 #endif
376 str = s;
378 else
379 str = string2;
380 else
381 str = string1;
383 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
384 re_free (s);
385 return rval;
388 /* The parameters have the same meaning as those of re_search.
389 Additional parameters:
390 If RET_LEN is nonzero the length of the match is returned (re_match style);
391 otherwise the position of the match is returned. */
393 static int
394 internal_function
395 re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
396 int start, int range, int stop, struct re_registers *regs,
397 int ret_len)
399 reg_errcode_t result;
400 regmatch_t *pmatch;
401 int nregs, rval;
402 int eflags = 0;
403 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
405 /* Check for out-of-range. */
406 if (BE (start < 0 || start > length, 0))
407 return -1;
408 if (BE (start + range > length, 0))
409 range = length - start;
410 else if (BE (start + range < 0, 0))
411 range = -start;
413 __libc_lock_lock (dfa->lock);
415 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
416 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
418 /* Compile fastmap if we haven't yet. */
419 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
420 re_compile_fastmap (bufp);
422 if (BE (bufp->no_sub, 0))
423 regs = NULL;
425 /* We need at least 1 register. */
426 if (regs == NULL)
427 nregs = 1;
428 else if (BE (bufp->regs_allocated == REGS_FIXED &&
429 regs->num_regs < bufp->re_nsub + 1, 0))
431 nregs = regs->num_regs;
432 if (BE (nregs < 1, 0))
434 /* Nothing can be copied to regs. */
435 regs = NULL;
436 nregs = 1;
439 else
440 nregs = bufp->re_nsub + 1;
441 pmatch = re_malloc (regmatch_t, nregs);
442 if (BE (pmatch == NULL, 0))
444 rval = -2;
445 goto out;
448 result = re_search_internal (bufp, string, length, start, range, stop,
449 nregs, pmatch, eflags);
451 rval = 0;
453 /* I hope we needn't fill ther regs with -1's when no match was found. */
454 if (result != REG_NOERROR)
455 rval = -1;
456 else if (regs != NULL)
458 /* If caller wants register contents data back, copy them. */
459 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
460 bufp->regs_allocated);
461 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
462 rval = -2;
465 if (BE (rval == 0, 1))
467 if (ret_len)
469 assert (pmatch[0].rm_so == start);
470 rval = pmatch[0].rm_eo - start;
472 else
473 rval = pmatch[0].rm_so;
475 re_free (pmatch);
476 out:
477 __libc_lock_unlock (dfa->lock);
478 return rval;
481 static unsigned
482 internal_function
483 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
484 int regs_allocated)
486 int rval = REGS_REALLOCATE;
487 int i;
488 int need_regs = nregs + 1;
489 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
490 uses. */
492 /* Have the register data arrays been allocated? */
493 if (regs_allocated == REGS_UNALLOCATED)
494 { /* No. So allocate them with malloc. */
495 regs->start = re_malloc (regoff_t, need_regs);
496 if (BE (regs->start == NULL, 0))
497 return REGS_UNALLOCATED;
498 regs->end = re_malloc (regoff_t, need_regs);
499 if (BE (regs->end == NULL, 0))
501 re_free (regs->start);
502 return REGS_UNALLOCATED;
504 regs->num_regs = need_regs;
506 else if (regs_allocated == REGS_REALLOCATE)
507 { /* Yes. If we need more elements than were already
508 allocated, reallocate them. If we need fewer, just
509 leave it alone. */
510 if (BE (need_regs > regs->num_regs, 0))
512 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
513 regoff_t *new_end;
514 if (BE (new_start == NULL, 0))
515 return REGS_UNALLOCATED;
516 new_end = re_realloc (regs->end, regoff_t, need_regs);
517 if (BE (new_end == NULL, 0))
519 re_free (new_start);
520 return REGS_UNALLOCATED;
522 regs->start = new_start;
523 regs->end = new_end;
524 regs->num_regs = need_regs;
527 else
529 assert (regs_allocated == REGS_FIXED);
530 /* This function may not be called with REGS_FIXED and nregs too big. */
531 assert (regs->num_regs >= nregs);
532 rval = REGS_FIXED;
535 /* Copy the regs. */
536 for (i = 0; i < nregs; ++i)
538 regs->start[i] = pmatch[i].rm_so;
539 regs->end[i] = pmatch[i].rm_eo;
541 for ( ; i < regs->num_regs; ++i)
542 regs->start[i] = regs->end[i] = -1;
544 return rval;
547 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
548 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
549 this memory for recording register information. STARTS and ENDS
550 must be allocated using the malloc library routine, and must each
551 be at least NUM_REGS * sizeof (regoff_t) bytes long.
553 If NUM_REGS == 0, then subsequent matches should allocate their own
554 register data.
556 Unless this function is called, the first search or match using
557 PATTERN_BUFFER will allocate its own register data, without
558 freeing the old data. */
560 void
561 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
562 unsigned num_regs, regoff_t *starts, regoff_t *ends)
564 if (num_regs)
566 bufp->regs_allocated = REGS_REALLOCATE;
567 regs->num_regs = num_regs;
568 regs->start = starts;
569 regs->end = ends;
571 else
573 bufp->regs_allocated = REGS_UNALLOCATED;
574 regs->num_regs = 0;
575 regs->start = regs->end = (regoff_t *) 0;
578 #ifdef _LIBC
579 weak_alias (__re_set_registers, re_set_registers)
580 #endif
582 /* Entry points compatible with 4.2 BSD regex library. We don't define
583 them unless specifically requested. */
585 #if defined _REGEX_RE_COMP || defined _LIBC
587 # ifdef _LIBC
588 weak_function
589 # endif
590 re_exec (const char *s)
592 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
594 #endif /* _REGEX_RE_COMP */
596 /* Internal entry point. */
598 /* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
601 with re_search.
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
607 static reg_errcode_t
608 __attribute_warn_unused_result__ internal_function
609 re_search_internal (const regex_t *preg, const char *string, int length,
610 int start, int range, int stop, size_t nmatch,
611 regmatch_t pmatch[], int eflags)
613 reg_errcode_t err;
614 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
615 int left_lim, right_lim, incr;
616 int fl_longest_match, match_first, match_kind, match_last = -1;
617 int extra_nmatch;
618 int sb, ch;
619 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
620 re_match_context_t mctx = { .dfa = dfa };
621 #else
622 re_match_context_t mctx;
623 #endif
624 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
625 && range && !preg->can_be_null) ? preg->fastmap : NULL;
626 RE_TRANSLATE_TYPE t = preg->translate;
628 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
629 memset (&mctx, '\0', sizeof (re_match_context_t));
630 mctx.dfa = dfa;
631 #endif
633 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
634 nmatch -= extra_nmatch;
636 /* Check if the DFA haven't been compiled. */
637 if (BE (preg->used == 0 || dfa->init_state == NULL
638 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
639 || dfa->init_state_begbuf == NULL, 0))
640 return REG_NOMATCH;
642 #ifdef DEBUG
643 /* We assume front-end functions already check them. */
644 assert (start + range >= 0 && start + range <= length);
645 #endif
647 /* If initial states with non-begbuf contexts have no elements,
648 the regex must be anchored. If preg->newline_anchor is set,
649 we'll never use init_state_nl, so do not check it. */
650 if (dfa->init_state->nodes.nelem == 0
651 && dfa->init_state_word->nodes.nelem == 0
652 && (dfa->init_state_nl->nodes.nelem == 0
653 || !preg->newline_anchor))
655 if (start != 0 && start + range != 0)
656 return REG_NOMATCH;
657 start = range = 0;
660 /* We must check the longest matching, if nmatch > 0. */
661 fl_longest_match = (nmatch != 0 || dfa->nbackref);
663 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
664 preg->translate, preg->syntax & RE_ICASE, dfa);
665 if (BE (err != REG_NOERROR, 0))
666 goto free_return;
667 mctx.input.stop = stop;
668 mctx.input.raw_stop = stop;
669 mctx.input.newline_anchor = preg->newline_anchor;
671 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
672 if (BE (err != REG_NOERROR, 0))
673 goto free_return;
675 /* We will log all the DFA states through which the dfa pass,
676 if nmatch > 1, or this dfa has "multibyte node", which is a
677 back-reference or a node which can accept multibyte character or
678 multi character collating element. */
679 if (nmatch > 1 || dfa->has_mb_node)
681 /* Avoid overflow. */
682 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
684 err = REG_ESPACE;
685 goto free_return;
688 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
691 err = REG_ESPACE;
692 goto free_return;
695 else
696 mctx.state_log = NULL;
698 match_first = start;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
702 /* Check incrementally whether of not the input string match. */
703 incr = (range < 0) ? -1 : 1;
704 left_lim = (range < 0) ? start + range : start;
705 right_lim = (range < 0) ? start : start + range;
706 sb = dfa->mb_cur_max == 1;
707 match_kind =
708 (fastmap
709 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
710 | (range >= 0 ? 2 : 0)
711 | (t != NULL ? 1 : 0))
712 : 8);
714 for (;; match_first += incr)
716 err = REG_NOMATCH;
717 if (match_first < left_lim || right_lim < match_first)
718 goto free_return;
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
725 switch (match_kind)
727 case 8:
728 /* No fastmap. */
729 break;
731 case 7:
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
735 ++match_first;
736 goto forward_match_found_start_or_reached_end;
738 case 6:
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
742 ++match_first;
744 forward_match_found_start_or_reached_end:
745 if (BE (match_first == right_lim, 0))
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
750 goto free_return;
752 break;
754 case 4:
755 case 5:
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
762 break;
763 --match_first;
765 if (match_first < left_lim)
766 goto free_return;
767 break;
769 default:
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
773 for (;;)
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
776 buffers. */
777 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
780 err = re_string_reconstruct (&mctx.input, match_first,
781 eflags);
782 if (BE (err != REG_NOERROR, 0))
783 goto free_return;
785 offset = match_first - mctx.input.raw_mbs_idx;
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
791 if (fastmap[ch])
792 break;
793 match_first += incr;
794 if (match_first < left_lim || match_first > right_lim)
796 err = REG_NOMATCH;
797 goto free_return;
800 break;
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
809 #ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
813 continue;
814 #endif
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 range >= 0 ? &match_first : NULL);
821 if (match_last != -1)
823 if (BE (match_last == -2, 0))
825 err = REG_ESPACE;
826 goto free_return;
828 else
830 mctx.match_last = match_last;
831 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
835 match_last);
837 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
838 || dfa->nbackref)
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
842 break;
843 if (BE (err != REG_NOMATCH, 0))
844 goto free_return;
845 match_last = -1;
847 else
848 break; /* We found a match. */
852 match_ctx_clean (&mctx);
855 #ifdef DEBUG
856 assert (match_last != -1);
857 assert (err == REG_NOERROR);
858 #endif
860 /* Set pmatch[] if we need. */
861 if (nmatch > 0)
863 int reg_idx;
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
869 /* Set the points where matching start/end. */
870 pmatch[0].rm_so = 0;
871 pmatch[0].rm_eo = mctx.match_last;
873 if (!preg->no_sub && nmatch > 1)
875 err = set_regs (preg, &mctx, nmatch, pmatch,
876 dfa->has_plural_match && dfa->nbackref > 0);
877 if (BE (err != REG_NOERROR, 0))
878 goto free_return;
881 /* At last, add the offset to the each registers, since we slided
882 the buffers so that we could assume that the matching starts
883 from 0. */
884 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
885 if (pmatch[reg_idx].rm_so != -1)
887 #ifdef RE_ENABLE_I18N
888 if (BE (mctx.input.offsets_needed != 0, 0))
890 pmatch[reg_idx].rm_so =
891 (pmatch[reg_idx].rm_so == mctx.input.valid_len
892 ? mctx.input.valid_raw_len
893 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
894 pmatch[reg_idx].rm_eo =
895 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
896 ? mctx.input.valid_raw_len
897 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
899 #else
900 assert (mctx.input.offsets_needed == 0);
901 #endif
902 pmatch[reg_idx].rm_so += match_first;
903 pmatch[reg_idx].rm_eo += match_first;
905 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
907 pmatch[nmatch + reg_idx].rm_so = -1;
908 pmatch[nmatch + reg_idx].rm_eo = -1;
911 if (dfa->subexp_map)
912 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
913 if (dfa->subexp_map[reg_idx] != reg_idx)
915 pmatch[reg_idx + 1].rm_so
916 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
917 pmatch[reg_idx + 1].rm_eo
918 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
922 free_return:
923 re_free (mctx.state_log);
924 if (dfa->nbackref)
925 match_ctx_free (&mctx);
926 re_string_destruct (&mctx.input);
927 return err;
930 static reg_errcode_t
931 internal_function __attribute_warn_unused_result__
932 prune_impossible_nodes (re_match_context_t *mctx)
934 const re_dfa_t *const dfa = mctx->dfa;
935 int halt_node, match_last;
936 reg_errcode_t ret;
937 re_dfastate_t **sifted_states;
938 re_dfastate_t **lim_states = NULL;
939 re_sift_context_t sctx;
940 #ifdef DEBUG
941 assert (mctx->state_log != NULL);
942 #endif
943 match_last = mctx->match_last;
944 halt_node = mctx->last_node;
946 /* Avoid overflow. */
947 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
948 return REG_ESPACE;
950 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
951 if (BE (sifted_states == NULL, 0))
953 ret = REG_ESPACE;
954 goto free_return;
956 if (dfa->nbackref)
958 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
959 if (BE (lim_states == NULL, 0))
961 ret = REG_ESPACE;
962 goto free_return;
964 while (1)
966 memset (lim_states, '\0',
967 sizeof (re_dfastate_t *) * (match_last + 1));
968 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
969 match_last);
970 ret = sift_states_backward (mctx, &sctx);
971 re_node_set_free (&sctx.limits);
972 if (BE (ret != REG_NOERROR, 0))
973 goto free_return;
974 if (sifted_states[0] != NULL || lim_states[0] != NULL)
975 break;
978 --match_last;
979 if (match_last < 0)
981 ret = REG_NOMATCH;
982 goto free_return;
984 } while (mctx->state_log[match_last] == NULL
985 || !mctx->state_log[match_last]->halt);
986 halt_node = check_halt_state_context (mctx,
987 mctx->state_log[match_last],
988 match_last);
990 ret = merge_state_array (dfa, sifted_states, lim_states,
991 match_last + 1);
992 re_free (lim_states);
993 lim_states = NULL;
994 if (BE (ret != REG_NOERROR, 0))
995 goto free_return;
997 else
999 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1000 ret = sift_states_backward (mctx, &sctx);
1001 re_node_set_free (&sctx.limits);
1002 if (BE (ret != REG_NOERROR, 0))
1003 goto free_return;
1004 if (sifted_states[0] == NULL)
1006 ret = REG_NOMATCH;
1007 goto free_return;
1010 re_free (mctx->state_log);
1011 mctx->state_log = sifted_states;
1012 sifted_states = NULL;
1013 mctx->last_node = halt_node;
1014 mctx->match_last = match_last;
1015 ret = REG_NOERROR;
1016 free_return:
1017 re_free (sifted_states);
1018 re_free (lim_states);
1019 return ret;
1022 /* Acquire an initial state and return it.
1023 We must select appropriate initial state depending on the context,
1024 since initial states may have constraints like "\<", "^", etc.. */
1026 static inline re_dfastate_t *
1027 __attribute ((always_inline)) internal_function
1028 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1029 int idx)
1031 const re_dfa_t *const dfa = mctx->dfa;
1032 if (dfa->init_state->has_constraint)
1034 unsigned int context;
1035 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1036 if (IS_WORD_CONTEXT (context))
1037 return dfa->init_state_word;
1038 else if (IS_ORDINARY_CONTEXT (context))
1039 return dfa->init_state;
1040 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1041 return dfa->init_state_begbuf;
1042 else if (IS_NEWLINE_CONTEXT (context))
1043 return dfa->init_state_nl;
1044 else if (IS_BEGBUF_CONTEXT (context))
1046 /* It is relatively rare case, then calculate on demand. */
1047 return re_acquire_state_context (err, dfa,
1048 dfa->init_state->entrance_nodes,
1049 context);
1051 else
1052 /* Must not happen? */
1053 return dfa->init_state;
1055 else
1056 return dfa->init_state;
1059 /* Check whether the regular expression match input string INPUT or not,
1060 and return the index where the matching end, return -1 if not match,
1061 or return -2 in case of an error.
1062 FL_LONGEST_MATCH means we want the POSIX longest matching.
1063 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1064 next place where we may want to try matching.
1065 Note that the matcher assume that the maching starts from the current
1066 index of the buffer. */
1068 static int
1069 internal_function __attribute_warn_unused_result__
1070 check_matching (re_match_context_t *mctx, int fl_longest_match,
1071 int *p_match_first)
1073 const re_dfa_t *const dfa = mctx->dfa;
1074 reg_errcode_t err;
1075 int match = 0;
1076 int match_last = -1;
1077 int cur_str_idx = re_string_cur_idx (&mctx->input);
1078 re_dfastate_t *cur_state;
1079 int at_init_state = p_match_first != NULL;
1080 int next_start_idx = cur_str_idx;
1082 err = REG_NOERROR;
1083 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1084 /* An initial state must not be NULL (invalid). */
1085 if (BE (cur_state == NULL, 0))
1087 assert (err == REG_ESPACE);
1088 return -2;
1091 if (mctx->state_log != NULL)
1093 mctx->state_log[cur_str_idx] = cur_state;
1095 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1096 later. E.g. Processing back references. */
1097 if (BE (dfa->nbackref, 0))
1099 at_init_state = 0;
1100 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1101 if (BE (err != REG_NOERROR, 0))
1102 return err;
1104 if (cur_state->has_backref)
1106 err = transit_state_bkref (mctx, &cur_state->nodes);
1107 if (BE (err != REG_NOERROR, 0))
1108 return err;
1113 /* If the RE accepts NULL string. */
1114 if (BE (cur_state->halt, 0))
1116 if (!cur_state->has_constraint
1117 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1119 if (!fl_longest_match)
1120 return cur_str_idx;
1121 else
1123 match_last = cur_str_idx;
1124 match = 1;
1129 while (!re_string_eoi (&mctx->input))
1131 re_dfastate_t *old_state = cur_state;
1132 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1134 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1135 && mctx->input.bufs_len < mctx->input.len)
1136 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1137 && mctx->input.valid_len < mctx->input.len))
1139 err = extend_buffers (mctx, next_char_idx + 1);
1140 if (BE (err != REG_NOERROR, 0))
1142 assert (err == REG_ESPACE);
1143 return -2;
1147 cur_state = transit_state (&err, mctx, cur_state);
1148 if (mctx->state_log != NULL)
1149 cur_state = merge_state_with_log (&err, mctx, cur_state);
1151 if (cur_state == NULL)
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err != REG_NOERROR, 0))
1157 return -2;
1159 if (mctx->state_log == NULL
1160 || (match && !fl_longest_match)
1161 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1162 break;
1165 if (BE (at_init_state, 0))
1167 if (old_state == cur_state)
1168 next_start_idx = next_char_idx;
1169 else
1170 at_init_state = 0;
1173 if (cur_state->halt)
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state->has_constraint
1178 || check_halt_state_context (mctx, cur_state,
1179 re_string_cur_idx (&mctx->input)))
1181 /* We found an appropriate halt state. */
1182 match_last = re_string_cur_idx (&mctx->input);
1183 match = 1;
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first = NULL;
1187 if (!fl_longest_match)
1188 break;
1193 if (p_match_first)
1194 *p_match_first += next_start_idx;
1196 return match_last;
1199 /* Check NODE match the current context. */
1201 static int
1202 internal_function
1203 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1205 re_token_type_t type = dfa->nodes[node].type;
1206 unsigned int constraint = dfa->nodes[node].constraint;
1207 if (type != END_OF_RE)
1208 return 0;
1209 if (!constraint)
1210 return 1;
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1212 return 0;
1213 return 1;
1216 /* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1220 static int
1221 internal_function
1222 check_halt_state_context (const re_match_context_t *mctx,
1223 const re_dfastate_t *state, int idx)
1225 int i;
1226 unsigned int context;
1227 #ifdef DEBUG
1228 assert (state->halt);
1229 #endif
1230 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1231 for (i = 0; i < state->nodes.nelem; ++i)
1232 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1233 return state->nodes.elems[i];
1234 return 0;
1237 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1240 of errors. */
1242 static int
1243 internal_function
1244 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1245 int *pidx, int node, re_node_set *eps_via_nodes,
1246 struct re_fail_stack_t *fs)
1248 const re_dfa_t *const dfa = mctx->dfa;
1249 int i, err;
1250 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1252 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1253 re_node_set *edests = &dfa->edests[node];
1254 int dest_node;
1255 err = re_node_set_insert (eps_via_nodes, node);
1256 if (BE (err < 0, 0))
1257 return -2;
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1261 int candidate = edests->elems[i];
1262 if (!re_node_set_contains (cur_nodes, candidate))
1263 continue;
1264 if (dest_node == -1)
1265 dest_node = candidate;
1267 else
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes, dest_node))
1272 return candidate;
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1275 else if (fs != NULL
1276 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1277 eps_via_nodes))
1278 return -2;
1280 /* We know we are going to exit. */
1281 break;
1284 return dest_node;
1286 else
1288 int naccepted = 0;
1289 re_token_type_t type = dfa->nodes[node].type;
1291 #ifdef RE_ENABLE_I18N
1292 if (dfa->nodes[node].accept_mb)
1293 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1294 else
1295 #endif /* RE_ENABLE_I18N */
1296 if (type == OP_BACK_REF)
1298 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1299 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1300 if (fs != NULL)
1302 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1303 return -1;
1304 else if (naccepted)
1306 char *buf = (char *) re_string_get_buffer (&mctx->input);
1307 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1308 naccepted) != 0)
1309 return -1;
1313 if (naccepted == 0)
1315 int dest_node;
1316 err = re_node_set_insert (eps_via_nodes, node);
1317 if (BE (err < 0, 0))
1318 return -2;
1319 dest_node = dfa->edests[node].elems[0];
1320 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 dest_node))
1322 return dest_node;
1326 if (naccepted != 0
1327 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1329 int dest_node = dfa->nexts[node];
1330 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1331 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1332 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1333 dest_node)))
1334 return -1;
1335 re_node_set_empty (eps_via_nodes);
1336 return dest_node;
1339 return -1;
1342 static reg_errcode_t
1343 internal_function __attribute_warn_unused_result__
1344 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1345 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1347 reg_errcode_t err;
1348 int num = fs->num++;
1349 if (fs->num == fs->alloc)
1351 struct re_fail_stack_ent_t *new_array;
1352 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1353 * fs->alloc * 2));
1354 if (new_array == NULL)
1355 return REG_ESPACE;
1356 fs->alloc *= 2;
1357 fs->stack = new_array;
1359 fs->stack[num].idx = str_idx;
1360 fs->stack[num].node = dest_node;
1361 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1362 if (fs->stack[num].regs == NULL)
1363 return REG_ESPACE;
1364 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1365 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1366 return err;
1369 static int
1370 internal_function
1371 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1372 regmatch_t *regs, re_node_set *eps_via_nodes)
1374 int num = --fs->num;
1375 assert (num >= 0);
1376 *pidx = fs->stack[num].idx;
1377 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1378 re_node_set_free (eps_via_nodes);
1379 re_free (fs->stack[num].regs);
1380 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1381 return fs->stack[num].node;
1384 /* Set the positions where the subexpressions are starts/ends to registers
1385 PMATCH.
1386 Note: We assume that pmatch[0] is already set, and
1387 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1389 static reg_errcode_t
1390 internal_function __attribute_warn_unused_result__
1391 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1392 regmatch_t *pmatch, int fl_backtrack)
1394 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1395 int idx, cur_node;
1396 re_node_set eps_via_nodes;
1397 struct re_fail_stack_t *fs;
1398 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1399 regmatch_t *prev_idx_match;
1400 int prev_idx_match_malloced = 0;
1402 #ifdef DEBUG
1403 assert (nmatch > 1);
1404 assert (mctx->state_log != NULL);
1405 #endif
1406 if (fl_backtrack)
1408 fs = &fs_body;
1409 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1410 if (fs->stack == NULL)
1411 return REG_ESPACE;
1413 else
1414 fs = NULL;
1416 cur_node = dfa->init_node;
1417 re_node_set_init_empty (&eps_via_nodes);
1419 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1420 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1421 else
1423 prev_idx_match = re_malloc (regmatch_t, nmatch);
1424 if (prev_idx_match == NULL)
1426 free_fail_stack_return (fs);
1427 return REG_ESPACE;
1429 prev_idx_match_malloced = 1;
1431 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1433 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1435 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1437 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1439 int reg_idx;
1440 if (fs)
1442 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1443 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1444 break;
1445 if (reg_idx == nmatch)
1447 re_node_set_free (&eps_via_nodes);
1448 if (prev_idx_match_malloced)
1449 re_free (prev_idx_match);
1450 return free_fail_stack_return (fs);
1452 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1453 &eps_via_nodes);
1455 else
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return REG_NOERROR;
1464 /* Proceed to next node. */
1465 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1466 &eps_via_nodes, fs);
1468 if (BE (cur_node < 0, 0))
1470 if (BE (cur_node == -2, 0))
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 free_fail_stack_return (fs);
1476 return REG_ESPACE;
1478 if (fs)
1479 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 &eps_via_nodes);
1481 else
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 return REG_NOMATCH;
1490 re_node_set_free (&eps_via_nodes);
1491 if (prev_idx_match_malloced)
1492 re_free (prev_idx_match);
1493 return free_fail_stack_return (fs);
1496 static reg_errcode_t
1497 internal_function
1498 free_fail_stack_return (struct re_fail_stack_t *fs)
1500 if (fs)
1502 int fs_idx;
1503 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1505 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1506 re_free (fs->stack[fs_idx].regs);
1508 re_free (fs->stack);
1510 return REG_NOERROR;
1513 static void
1514 internal_function
1515 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1516 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1518 int type = dfa->nodes[cur_node].type;
1519 if (type == OP_OPEN_SUBEXP)
1521 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1523 /* We are at the first node of this sub expression. */
1524 if (reg_num < nmatch)
1526 pmatch[reg_num].rm_so = cur_idx;
1527 pmatch[reg_num].rm_eo = -1;
1530 else if (type == OP_CLOSE_SUBEXP)
1532 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533 if (reg_num < nmatch)
1535 /* We are at the last node of this sub expression. */
1536 if (pmatch[reg_num].rm_so < cur_idx)
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 /* This is a non-empty match or we are not inside an optional
1540 subexpression. Accept this right away. */
1541 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1543 else
1545 if (dfa->nodes[cur_node].opt_subexp
1546 && prev_idx_match[reg_num].rm_so != -1)
1547 /* We transited through an empty match for an optional
1548 subexpression, like (a?)*, and this is not the subexp's
1549 first match. Copy back the old content of the registers
1550 so that matches of an inner subexpression are undone as
1551 well, like in ((a?))*. */
1552 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1553 else
1554 /* We completed a subexpression, but it may be part of
1555 an optional one, so do not update PREV_IDX_MATCH. */
1556 pmatch[reg_num].rm_eo = cur_idx;
1562 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1563 and sift the nodes in each states according to the following rules.
1564 Updated state_log will be wrote to STATE_LOG.
1566 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1567 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1568 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1569 the LAST_NODE, we throw away the node `a'.
1570 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1571 string `s' and transit to `b':
1572 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1573 away the node `a'.
1574 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1575 thrown away, we throw away the node `a'.
1576 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1577 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1578 node `a'.
1579 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1580 we throw away the node `a'. */
1582 #define STATE_NODE_CONTAINS(state,node) \
1583 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1585 static reg_errcode_t
1586 internal_function
1587 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1589 reg_errcode_t err;
1590 int null_cnt = 0;
1591 int str_idx = sctx->last_str_idx;
1592 re_node_set cur_dest;
1594 #ifdef DEBUG
1595 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1596 #endif
1598 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1599 transit to the last_node and the last_node itself. */
1600 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1601 if (BE (err != REG_NOERROR, 0))
1602 return err;
1603 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1604 if (BE (err != REG_NOERROR, 0))
1605 goto free_return;
1607 /* Then check each states in the state_log. */
1608 while (str_idx > 0)
1610 /* Update counters. */
1611 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1612 if (null_cnt > mctx->max_mb_elem_len)
1614 memset (sctx->sifted_states, '\0',
1615 sizeof (re_dfastate_t *) * str_idx);
1616 re_node_set_free (&cur_dest);
1617 return REG_NOERROR;
1619 re_node_set_empty (&cur_dest);
1620 --str_idx;
1622 if (mctx->state_log[str_idx])
1624 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1625 if (BE (err != REG_NOERROR, 0))
1626 goto free_return;
1629 /* Add all the nodes which satisfy the following conditions:
1630 - It can epsilon transit to a node in CUR_DEST.
1631 - It is in CUR_SRC.
1632 And update state_log. */
1633 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1637 err = REG_NOERROR;
1638 free_return:
1639 re_node_set_free (&cur_dest);
1640 return err;
1643 static reg_errcode_t
1644 internal_function __attribute_warn_unused_result__
1645 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1646 int str_idx, re_node_set *cur_dest)
1648 const re_dfa_t *const dfa = mctx->dfa;
1649 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1650 int i;
1652 /* Then build the next sifted state.
1653 We build the next sifted state on `cur_dest', and update
1654 `sifted_states[str_idx]' with `cur_dest'.
1655 Note:
1656 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1657 `cur_src' points the node_set of the old `state_log[str_idx]'
1658 (with the epsilon nodes pre-filtered out). */
1659 for (i = 0; i < cur_src->nelem; i++)
1661 int prev_node = cur_src->elems[i];
1662 int naccepted = 0;
1663 int ret;
1665 #ifdef DEBUG
1666 re_token_type_t type = dfa->nodes[prev_node].type;
1667 assert (!IS_EPSILON_NODE (type));
1668 #endif
1669 #ifdef RE_ENABLE_I18N
1670 /* If the node may accept `multi byte'. */
1671 if (dfa->nodes[prev_node].accept_mb)
1672 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1673 str_idx, sctx->last_str_idx);
1674 #endif /* RE_ENABLE_I18N */
1676 /* We don't check backreferences here.
1677 See update_cur_sifted_state(). */
1678 if (!naccepted
1679 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1680 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1681 dfa->nexts[prev_node]))
1682 naccepted = 1;
1684 if (naccepted == 0)
1685 continue;
1687 if (sctx->limits.nelem)
1689 int to_idx = str_idx + naccepted;
1690 if (check_dst_limits (mctx, &sctx->limits,
1691 dfa->nexts[prev_node], to_idx,
1692 prev_node, str_idx))
1693 continue;
1695 ret = re_node_set_insert (cur_dest, prev_node);
1696 if (BE (ret == -1, 0))
1697 return REG_ESPACE;
1700 return REG_NOERROR;
1703 /* Helper functions. */
1705 static reg_errcode_t
1706 internal_function
1707 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1709 int top = mctx->state_log_top;
1711 if ((next_state_log_idx >= mctx->input.bufs_len
1712 && mctx->input.bufs_len < mctx->input.len)
1713 || (next_state_log_idx >= mctx->input.valid_len
1714 && mctx->input.valid_len < mctx->input.len))
1716 reg_errcode_t err;
1717 err = extend_buffers (mctx, next_state_log_idx + 1);
1718 if (BE (err != REG_NOERROR, 0))
1719 return err;
1722 if (top < next_state_log_idx)
1724 memset (mctx->state_log + top + 1, '\0',
1725 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1726 mctx->state_log_top = next_state_log_idx;
1728 return REG_NOERROR;
1731 static reg_errcode_t
1732 internal_function
1733 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1734 re_dfastate_t **src, int num)
1736 int st_idx;
1737 reg_errcode_t err;
1738 for (st_idx = 0; st_idx < num; ++st_idx)
1740 if (dst[st_idx] == NULL)
1741 dst[st_idx] = src[st_idx];
1742 else if (src[st_idx] != NULL)
1744 re_node_set merged_set;
1745 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1746 &src[st_idx]->nodes);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1749 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1750 re_node_set_free (&merged_set);
1751 if (BE (err != REG_NOERROR, 0))
1752 return err;
1755 return REG_NOERROR;
1758 static reg_errcode_t
1759 internal_function
1760 update_cur_sifted_state (const re_match_context_t *mctx,
1761 re_sift_context_t *sctx, int str_idx,
1762 re_node_set *dest_nodes)
1764 const re_dfa_t *const dfa = mctx->dfa;
1765 reg_errcode_t err = REG_NOERROR;
1766 const re_node_set *candidates;
1767 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1768 : &mctx->state_log[str_idx]->nodes);
1770 if (dest_nodes->nelem == 0)
1771 sctx->sifted_states[str_idx] = NULL;
1772 else
1774 if (candidates)
1776 /* At first, add the nodes which can epsilon transit to a node in
1777 DEST_NODE. */
1778 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1782 /* Then, check the limitations in the current sift_context. */
1783 if (sctx->limits.nelem)
1785 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1786 mctx->bkref_ents, str_idx);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1792 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1797 if (candidates && mctx->state_log[str_idx]->has_backref)
1799 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1800 if (BE (err != REG_NOERROR, 0))
1801 return err;
1803 return REG_NOERROR;
1806 static reg_errcode_t
1807 internal_function __attribute_warn_unused_result__
1808 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1809 const re_node_set *candidates)
1811 reg_errcode_t err = REG_NOERROR;
1812 int i;
1814 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1815 if (BE (err != REG_NOERROR, 0))
1816 return err;
1818 if (!state->inveclosure.alloc)
1820 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1821 if (BE (err != REG_NOERROR, 0))
1822 return REG_ESPACE;
1823 for (i = 0; i < dest_nodes->nelem; i++)
1825 err = re_node_set_merge (&state->inveclosure,
1826 dfa->inveclosures + dest_nodes->elems[i]);
1827 if (BE (err != REG_NOERROR, 0))
1828 return REG_ESPACE;
1831 return re_node_set_add_intersect (dest_nodes, candidates,
1832 &state->inveclosure);
1835 static reg_errcode_t
1836 internal_function
1837 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1838 const re_node_set *candidates)
1840 int ecl_idx;
1841 reg_errcode_t err;
1842 re_node_set *inv_eclosure = dfa->inveclosures + node;
1843 re_node_set except_nodes;
1844 re_node_set_init_empty (&except_nodes);
1845 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1847 int cur_node = inv_eclosure->elems[ecl_idx];
1848 if (cur_node == node)
1849 continue;
1850 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1852 int edst1 = dfa->edests[cur_node].elems[0];
1853 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1854 ? dfa->edests[cur_node].elems[1] : -1);
1855 if ((!re_node_set_contains (inv_eclosure, edst1)
1856 && re_node_set_contains (dest_nodes, edst1))
1857 || (edst2 > 0
1858 && !re_node_set_contains (inv_eclosure, edst2)
1859 && re_node_set_contains (dest_nodes, edst2)))
1861 err = re_node_set_add_intersect (&except_nodes, candidates,
1862 dfa->inveclosures + cur_node);
1863 if (BE (err != REG_NOERROR, 0))
1865 re_node_set_free (&except_nodes);
1866 return err;
1871 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1873 int cur_node = inv_eclosure->elems[ecl_idx];
1874 if (!re_node_set_contains (&except_nodes, cur_node))
1876 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1877 re_node_set_remove_at (dest_nodes, idx);
1880 re_node_set_free (&except_nodes);
1881 return REG_NOERROR;
1884 static int
1885 internal_function
1886 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1887 int dst_node, int dst_idx, int src_node, int src_idx)
1889 const re_dfa_t *const dfa = mctx->dfa;
1890 int lim_idx, src_pos, dst_pos;
1892 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1893 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1894 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1896 int subexp_idx;
1897 struct re_backref_cache_entry *ent;
1898 ent = mctx->bkref_ents + limits->elems[lim_idx];
1899 subexp_idx = dfa->nodes[ent->node].opr.idx;
1901 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1902 subexp_idx, dst_node, dst_idx,
1903 dst_bkref_idx);
1904 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1905 subexp_idx, src_node, src_idx,
1906 src_bkref_idx);
1908 /* In case of:
1909 <src> <dst> ( <subexp> )
1910 ( <subexp> ) <src> <dst>
1911 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1912 if (src_pos == dst_pos)
1913 continue; /* This is unrelated limitation. */
1914 else
1915 return 1;
1917 return 0;
1920 static int
1921 internal_function
1922 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1923 int subexp_idx, int from_node, int bkref_idx)
1925 const re_dfa_t *const dfa = mctx->dfa;
1926 const re_node_set *eclosures = dfa->eclosures + from_node;
1927 int node_idx;
1929 /* Else, we are on the boundary: examine the nodes on the epsilon
1930 closure. */
1931 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1933 int node = eclosures->elems[node_idx];
1934 switch (dfa->nodes[node].type)
1936 case OP_BACK_REF:
1937 if (bkref_idx != -1)
1939 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1942 int dst, cpos;
1944 if (ent->node != node)
1945 continue;
1947 if (subexp_idx < BITSET_WORD_BITS
1948 && !(ent->eps_reachable_subexps_map
1949 & ((bitset_word_t) 1 << subexp_idx)))
1950 continue;
1952 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1953 OP_CLOSE_SUBEXP cases below. But, if the
1954 destination node is the same node as the source
1955 node, don't recurse because it would cause an
1956 infinite loop: a regex that exhibits this behavior
1957 is ()\1*\1* */
1958 dst = dfa->edests[node].elems[0];
1959 if (dst == from_node)
1961 if (boundaries & 1)
1962 return -1;
1963 else /* if (boundaries & 2) */
1964 return 0;
1967 cpos =
1968 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1969 dst, bkref_idx);
1970 if (cpos == -1 /* && (boundaries & 1) */)
1971 return -1;
1972 if (cpos == 0 && (boundaries & 2))
1973 return 0;
1975 if (subexp_idx < BITSET_WORD_BITS)
1976 ent->eps_reachable_subexps_map
1977 &= ~((bitset_word_t) 1 << subexp_idx);
1979 while (ent++->more);
1981 break;
1983 case OP_OPEN_SUBEXP:
1984 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1985 return -1;
1986 break;
1988 case OP_CLOSE_SUBEXP:
1989 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1990 return 0;
1991 break;
1993 default:
1994 break;
1998 return (boundaries & 2) ? 1 : 0;
2001 static int
2002 internal_function
2003 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2004 int subexp_idx, int from_node, int str_idx,
2005 int bkref_idx)
2007 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2008 int boundaries;
2010 /* If we are outside the range of the subexpression, return -1 or 1. */
2011 if (str_idx < lim->subexp_from)
2012 return -1;
2014 if (lim->subexp_to < str_idx)
2015 return 1;
2017 /* If we are within the subexpression, return 0. */
2018 boundaries = (str_idx == lim->subexp_from);
2019 boundaries |= (str_idx == lim->subexp_to) << 1;
2020 if (boundaries == 0)
2021 return 0;
2023 /* Else, examine epsilon closure. */
2024 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2025 from_node, bkref_idx);
2028 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2029 which are against limitations from DEST_NODES. */
2031 static reg_errcode_t
2032 internal_function
2033 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2034 const re_node_set *candidates, re_node_set *limits,
2035 struct re_backref_cache_entry *bkref_ents, int str_idx)
2037 reg_errcode_t err;
2038 int node_idx, lim_idx;
2040 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2042 int subexp_idx;
2043 struct re_backref_cache_entry *ent;
2044 ent = bkref_ents + limits->elems[lim_idx];
2046 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2047 continue; /* This is unrelated limitation. */
2049 subexp_idx = dfa->nodes[ent->node].opr.idx;
2050 if (ent->subexp_to == str_idx)
2052 int ops_node = -1;
2053 int cls_node = -1;
2054 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2056 int node = dest_nodes->elems[node_idx];
2057 re_token_type_t type = dfa->nodes[node].type;
2058 if (type == OP_OPEN_SUBEXP
2059 && subexp_idx == dfa->nodes[node].opr.idx)
2060 ops_node = node;
2061 else if (type == OP_CLOSE_SUBEXP
2062 && subexp_idx == dfa->nodes[node].opr.idx)
2063 cls_node = node;
2066 /* Check the limitation of the open subexpression. */
2067 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2068 if (ops_node >= 0)
2070 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2071 candidates);
2072 if (BE (err != REG_NOERROR, 0))
2073 return err;
2076 /* Check the limitation of the close subexpression. */
2077 if (cls_node >= 0)
2078 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2080 int node = dest_nodes->elems[node_idx];
2081 if (!re_node_set_contains (dfa->inveclosures + node,
2082 cls_node)
2083 && !re_node_set_contains (dfa->eclosures + node,
2084 cls_node))
2086 /* It is against this limitation.
2087 Remove it form the current sifted state. */
2088 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2089 candidates);
2090 if (BE (err != REG_NOERROR, 0))
2091 return err;
2092 --node_idx;
2096 else /* (ent->subexp_to != str_idx) */
2098 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2100 int node = dest_nodes->elems[node_idx];
2101 re_token_type_t type = dfa->nodes[node].type;
2102 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2104 if (subexp_idx != dfa->nodes[node].opr.idx)
2105 continue;
2106 /* It is against this limitation.
2107 Remove it form the current sifted state. */
2108 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2109 candidates);
2110 if (BE (err != REG_NOERROR, 0))
2111 return err;
2116 return REG_NOERROR;
2119 static reg_errcode_t
2120 internal_function __attribute_warn_unused_result__
2121 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2122 int str_idx, const re_node_set *candidates)
2124 const re_dfa_t *const dfa = mctx->dfa;
2125 reg_errcode_t err;
2126 int node_idx, node;
2127 re_sift_context_t local_sctx;
2128 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2130 if (first_idx == -1)
2131 return REG_NOERROR;
2133 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2135 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2137 int enabled_idx;
2138 re_token_type_t type;
2139 struct re_backref_cache_entry *entry;
2140 node = candidates->elems[node_idx];
2141 type = dfa->nodes[node].type;
2142 /* Avoid infinite loop for the REs like "()\1+". */
2143 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2144 continue;
2145 if (type != OP_BACK_REF)
2146 continue;
2148 entry = mctx->bkref_ents + first_idx;
2149 enabled_idx = first_idx;
2152 int subexp_len;
2153 int to_idx;
2154 int dst_node;
2155 int ret;
2156 re_dfastate_t *cur_state;
2158 if (entry->node != node)
2159 continue;
2160 subexp_len = entry->subexp_to - entry->subexp_from;
2161 to_idx = str_idx + subexp_len;
2162 dst_node = (subexp_len ? dfa->nexts[node]
2163 : dfa->edests[node].elems[0]);
2165 if (to_idx > sctx->last_str_idx
2166 || sctx->sifted_states[to_idx] == NULL
2167 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2168 || check_dst_limits (mctx, &sctx->limits, node,
2169 str_idx, dst_node, to_idx))
2170 continue;
2172 if (local_sctx.sifted_states == NULL)
2174 local_sctx = *sctx;
2175 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2176 if (BE (err != REG_NOERROR, 0))
2177 goto free_return;
2179 local_sctx.last_node = node;
2180 local_sctx.last_str_idx = str_idx;
2181 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2182 if (BE (ret < 0, 0))
2184 err = REG_ESPACE;
2185 goto free_return;
2187 cur_state = local_sctx.sifted_states[str_idx];
2188 err = sift_states_backward (mctx, &local_sctx);
2189 if (BE (err != REG_NOERROR, 0))
2190 goto free_return;
2191 if (sctx->limited_states != NULL)
2193 err = merge_state_array (dfa, sctx->limited_states,
2194 local_sctx.sifted_states,
2195 str_idx + 1);
2196 if (BE (err != REG_NOERROR, 0))
2197 goto free_return;
2199 local_sctx.sifted_states[str_idx] = cur_state;
2200 re_node_set_remove (&local_sctx.limits, enabled_idx);
2202 /* mctx->bkref_ents may have changed, reload the pointer. */
2203 entry = mctx->bkref_ents + enabled_idx;
2205 while (enabled_idx++, entry++->more);
2207 err = REG_NOERROR;
2208 free_return:
2209 if (local_sctx.sifted_states != NULL)
2211 re_node_set_free (&local_sctx.limits);
2214 return err;
2218 #ifdef RE_ENABLE_I18N
2219 static int
2220 internal_function
2221 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2222 int node_idx, int str_idx, int max_str_idx)
2224 const re_dfa_t *const dfa = mctx->dfa;
2225 int naccepted;
2226 /* Check the node can accept `multi byte'. */
2227 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2228 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2229 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2230 dfa->nexts[node_idx]))
2231 /* The node can't accept the `multi byte', or the
2232 destination was already thrown away, then the node
2233 could't accept the current input `multi byte'. */
2234 naccepted = 0;
2235 /* Otherwise, it is sure that the node could accept
2236 `naccepted' bytes input. */
2237 return naccepted;
2239 #endif /* RE_ENABLE_I18N */
2242 /* Functions for state transition. */
2244 /* Return the next state to which the current state STATE will transit by
2245 accepting the current input byte, and update STATE_LOG if necessary.
2246 If STATE can accept a multibyte char/collating element/back reference
2247 update the destination of STATE_LOG. */
2249 static re_dfastate_t *
2250 internal_function __attribute_warn_unused_result__
2251 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2252 re_dfastate_t *state)
2254 re_dfastate_t **trtable;
2255 unsigned char ch;
2257 #ifdef RE_ENABLE_I18N
2258 /* If the current state can accept multibyte. */
2259 if (BE (state->accept_mb, 0))
2261 *err = transit_state_mb (mctx, state);
2262 if (BE (*err != REG_NOERROR, 0))
2263 return NULL;
2265 #endif /* RE_ENABLE_I18N */
2267 /* Then decide the next state with the single byte. */
2268 #if 0
2269 if (0)
2270 /* don't use transition table */
2271 return transit_state_sb (err, mctx, state);
2272 #endif
2274 /* Use transition table */
2275 ch = re_string_fetch_byte (&mctx->input);
2276 for (;;)
2278 trtable = state->trtable;
2279 if (BE (trtable != NULL, 1))
2280 return trtable[ch];
2282 trtable = state->word_trtable;
2283 if (BE (trtable != NULL, 1))
2285 unsigned int context;
2286 context
2287 = re_string_context_at (&mctx->input,
2288 re_string_cur_idx (&mctx->input) - 1,
2289 mctx->eflags);
2290 if (IS_WORD_CONTEXT (context))
2291 return trtable[ch + SBC_MAX];
2292 else
2293 return trtable[ch];
2296 if (!build_trtable (mctx->dfa, state))
2298 *err = REG_ESPACE;
2299 return NULL;
2302 /* Retry, we now have a transition table. */
2306 /* Update the state_log if we need */
2307 re_dfastate_t *
2308 internal_function
2309 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2310 re_dfastate_t *next_state)
2312 const re_dfa_t *const dfa = mctx->dfa;
2313 int cur_idx = re_string_cur_idx (&mctx->input);
2315 if (cur_idx > mctx->state_log_top)
2317 mctx->state_log[cur_idx] = next_state;
2318 mctx->state_log_top = cur_idx;
2320 else if (mctx->state_log[cur_idx] == 0)
2322 mctx->state_log[cur_idx] = next_state;
2324 else
2326 re_dfastate_t *pstate;
2327 unsigned int context;
2328 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2329 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2330 the destination of a multibyte char/collating element/
2331 back reference. Then the next state is the union set of
2332 these destinations and the results of the transition table. */
2333 pstate = mctx->state_log[cur_idx];
2334 log_nodes = pstate->entrance_nodes;
2335 if (next_state != NULL)
2337 table_nodes = next_state->entrance_nodes;
2338 *err = re_node_set_init_union (&next_nodes, table_nodes,
2339 log_nodes);
2340 if (BE (*err != REG_NOERROR, 0))
2341 return NULL;
2343 else
2344 next_nodes = *log_nodes;
2345 /* Note: We already add the nodes of the initial state,
2346 then we don't need to add them here. */
2348 context = re_string_context_at (&mctx->input,
2349 re_string_cur_idx (&mctx->input) - 1,
2350 mctx->eflags);
2351 next_state = mctx->state_log[cur_idx]
2352 = re_acquire_state_context (err, dfa, &next_nodes, context);
2353 /* We don't need to check errors here, since the return value of
2354 this function is next_state and ERR is already set. */
2356 if (table_nodes != NULL)
2357 re_node_set_free (&next_nodes);
2360 if (BE (dfa->nbackref, 0) && next_state != NULL)
2362 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2363 later. We must check them here, since the back references in the
2364 next state might use them. */
2365 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2366 cur_idx);
2367 if (BE (*err != REG_NOERROR, 0))
2368 return NULL;
2370 /* If the next state has back references. */
2371 if (next_state->has_backref)
2373 *err = transit_state_bkref (mctx, &next_state->nodes);
2374 if (BE (*err != REG_NOERROR, 0))
2375 return NULL;
2376 next_state = mctx->state_log[cur_idx];
2380 return next_state;
2383 /* Skip bytes in the input that correspond to part of a
2384 multi-byte match, then look in the log for a state
2385 from which to restart matching. */
2386 re_dfastate_t *
2387 internal_function
2388 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2390 re_dfastate_t *cur_state;
2393 int max = mctx->state_log_top;
2394 int cur_str_idx = re_string_cur_idx (&mctx->input);
2398 if (++cur_str_idx > max)
2399 return NULL;
2400 re_string_skip_bytes (&mctx->input, 1);
2402 while (mctx->state_log[cur_str_idx] == NULL);
2404 cur_state = merge_state_with_log (err, mctx, NULL);
2406 while (*err == REG_NOERROR && cur_state == NULL);
2407 return cur_state;
2410 /* Helper functions for transit_state. */
2412 /* From the node set CUR_NODES, pick up the nodes whose types are
2413 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2414 expression. And register them to use them later for evaluating the
2415 correspoding back references. */
2417 static reg_errcode_t
2418 internal_function
2419 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2420 int str_idx)
2422 const re_dfa_t *const dfa = mctx->dfa;
2423 int node_idx;
2424 reg_errcode_t err;
2426 /* TODO: This isn't efficient.
2427 Because there might be more than one nodes whose types are
2428 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2429 nodes.
2430 E.g. RE: (a){2} */
2431 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2433 int node = cur_nodes->elems[node_idx];
2434 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2435 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2436 && (dfa->used_bkref_map
2437 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2439 err = match_ctx_add_subtop (mctx, node, str_idx);
2440 if (BE (err != REG_NOERROR, 0))
2441 return err;
2444 return REG_NOERROR;
2447 #if 0
2448 /* Return the next state to which the current state STATE will transit by
2449 accepting the current input byte. */
2451 static re_dfastate_t *
2452 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2453 re_dfastate_t *state)
2455 const re_dfa_t *const dfa = mctx->dfa;
2456 re_node_set next_nodes;
2457 re_dfastate_t *next_state;
2458 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2459 unsigned int context;
2461 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2462 if (BE (*err != REG_NOERROR, 0))
2463 return NULL;
2464 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2466 int cur_node = state->nodes.elems[node_cnt];
2467 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2469 *err = re_node_set_merge (&next_nodes,
2470 dfa->eclosures + dfa->nexts[cur_node]);
2471 if (BE (*err != REG_NOERROR, 0))
2473 re_node_set_free (&next_nodes);
2474 return NULL;
2478 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2479 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2480 /* We don't need to check errors here, since the return value of
2481 this function is next_state and ERR is already set. */
2483 re_node_set_free (&next_nodes);
2484 re_string_skip_bytes (&mctx->input, 1);
2485 return next_state;
2487 #endif
2489 #ifdef RE_ENABLE_I18N
2490 static reg_errcode_t
2491 internal_function
2492 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2494 const re_dfa_t *const dfa = mctx->dfa;
2495 reg_errcode_t err;
2496 int i;
2498 for (i = 0; i < pstate->nodes.nelem; ++i)
2500 re_node_set dest_nodes, *new_nodes;
2501 int cur_node_idx = pstate->nodes.elems[i];
2502 int naccepted, dest_idx;
2503 unsigned int context;
2504 re_dfastate_t *dest_state;
2506 if (!dfa->nodes[cur_node_idx].accept_mb)
2507 continue;
2509 if (dfa->nodes[cur_node_idx].constraint)
2511 context = re_string_context_at (&mctx->input,
2512 re_string_cur_idx (&mctx->input),
2513 mctx->eflags);
2514 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2515 context))
2516 continue;
2519 /* How many bytes the node can accept? */
2520 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2521 re_string_cur_idx (&mctx->input));
2522 if (naccepted == 0)
2523 continue;
2525 /* The node can accepts `naccepted' bytes. */
2526 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2527 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2528 : mctx->max_mb_elem_len);
2529 err = clean_state_log_if_needed (mctx, dest_idx);
2530 if (BE (err != REG_NOERROR, 0))
2531 return err;
2532 #ifdef DEBUG
2533 assert (dfa->nexts[cur_node_idx] != -1);
2534 #endif
2535 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2537 dest_state = mctx->state_log[dest_idx];
2538 if (dest_state == NULL)
2539 dest_nodes = *new_nodes;
2540 else
2542 err = re_node_set_init_union (&dest_nodes,
2543 dest_state->entrance_nodes, new_nodes);
2544 if (BE (err != REG_NOERROR, 0))
2545 return err;
2547 context = re_string_context_at (&mctx->input, dest_idx - 1,
2548 mctx->eflags);
2549 mctx->state_log[dest_idx]
2550 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2551 if (dest_state != NULL)
2552 re_node_set_free (&dest_nodes);
2553 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2554 return err;
2556 return REG_NOERROR;
2558 #endif /* RE_ENABLE_I18N */
2560 static reg_errcode_t
2561 internal_function
2562 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2564 const re_dfa_t *const dfa = mctx->dfa;
2565 reg_errcode_t err;
2566 int i;
2567 int cur_str_idx = re_string_cur_idx (&mctx->input);
2569 for (i = 0; i < nodes->nelem; ++i)
2571 int dest_str_idx, prev_nelem, bkc_idx;
2572 int node_idx = nodes->elems[i];
2573 unsigned int context;
2574 const re_token_t *node = dfa->nodes + node_idx;
2575 re_node_set *new_dest_nodes;
2577 /* Check whether `node' is a backreference or not. */
2578 if (node->type != OP_BACK_REF)
2579 continue;
2581 if (node->constraint)
2583 context = re_string_context_at (&mctx->input, cur_str_idx,
2584 mctx->eflags);
2585 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2586 continue;
2589 /* `node' is a backreference.
2590 Check the substring which the substring matched. */
2591 bkc_idx = mctx->nbkref_ents;
2592 err = get_subexp (mctx, node_idx, cur_str_idx);
2593 if (BE (err != REG_NOERROR, 0))
2594 goto free_return;
2596 /* And add the epsilon closures (which is `new_dest_nodes') of
2597 the backreference to appropriate state_log. */
2598 #ifdef DEBUG
2599 assert (dfa->nexts[node_idx] != -1);
2600 #endif
2601 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2603 int subexp_len;
2604 re_dfastate_t *dest_state;
2605 struct re_backref_cache_entry *bkref_ent;
2606 bkref_ent = mctx->bkref_ents + bkc_idx;
2607 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2608 continue;
2609 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2610 new_dest_nodes = (subexp_len == 0
2611 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2612 : dfa->eclosures + dfa->nexts[node_idx]);
2613 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2614 - bkref_ent->subexp_from);
2615 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2616 mctx->eflags);
2617 dest_state = mctx->state_log[dest_str_idx];
2618 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2619 : mctx->state_log[cur_str_idx]->nodes.nelem);
2620 /* Add `new_dest_node' to state_log. */
2621 if (dest_state == NULL)
2623 mctx->state_log[dest_str_idx]
2624 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2625 context);
2626 if (BE (mctx->state_log[dest_str_idx] == NULL
2627 && err != REG_NOERROR, 0))
2628 goto free_return;
2630 else
2632 re_node_set dest_nodes;
2633 err = re_node_set_init_union (&dest_nodes,
2634 dest_state->entrance_nodes,
2635 new_dest_nodes);
2636 if (BE (err != REG_NOERROR, 0))
2638 re_node_set_free (&dest_nodes);
2639 goto free_return;
2641 mctx->state_log[dest_str_idx]
2642 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2643 re_node_set_free (&dest_nodes);
2644 if (BE (mctx->state_log[dest_str_idx] == NULL
2645 && err != REG_NOERROR, 0))
2646 goto free_return;
2648 /* We need to check recursively if the backreference can epsilon
2649 transit. */
2650 if (subexp_len == 0
2651 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2653 err = check_subexp_matching_top (mctx, new_dest_nodes,
2654 cur_str_idx);
2655 if (BE (err != REG_NOERROR, 0))
2656 goto free_return;
2657 err = transit_state_bkref (mctx, new_dest_nodes);
2658 if (BE (err != REG_NOERROR, 0))
2659 goto free_return;
2663 err = REG_NOERROR;
2664 free_return:
2665 return err;
2668 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2669 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2670 Note that we might collect inappropriate candidates here.
2671 However, the cost of checking them strictly here is too high, then we
2672 delay these checking for prune_impossible_nodes(). */
2674 static reg_errcode_t
2675 internal_function __attribute_warn_unused_result__
2676 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2678 const re_dfa_t *const dfa = mctx->dfa;
2679 int subexp_num, sub_top_idx;
2680 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2681 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2682 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2683 if (cache_idx != -1)
2685 const struct re_backref_cache_entry *entry
2686 = mctx->bkref_ents + cache_idx;
2688 if (entry->node == bkref_node)
2689 return REG_NOERROR; /* We already checked it. */
2690 while (entry++->more);
2693 subexp_num = dfa->nodes[bkref_node].opr.idx;
2695 /* For each sub expression */
2696 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2698 reg_errcode_t err;
2699 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2700 re_sub_match_last_t *sub_last;
2701 int sub_last_idx, sl_str, bkref_str_off;
2703 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2704 continue; /* It isn't related. */
2706 sl_str = sub_top->str_idx;
2707 bkref_str_off = bkref_str_idx;
2708 /* At first, check the last node of sub expressions we already
2709 evaluated. */
2710 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2712 int sl_str_diff;
2713 sub_last = sub_top->lasts[sub_last_idx];
2714 sl_str_diff = sub_last->str_idx - sl_str;
2715 /* The matched string by the sub expression match with the substring
2716 at the back reference? */
2717 if (sl_str_diff > 0)
2719 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2721 /* Not enough chars for a successful match. */
2722 if (bkref_str_off + sl_str_diff > mctx->input.len)
2723 break;
2725 err = clean_state_log_if_needed (mctx,
2726 bkref_str_off
2727 + sl_str_diff);
2728 if (BE (err != REG_NOERROR, 0))
2729 return err;
2730 buf = (const char *) re_string_get_buffer (&mctx->input);
2732 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2733 /* We don't need to search this sub expression any more. */
2734 break;
2736 bkref_str_off += sl_str_diff;
2737 sl_str += sl_str_diff;
2738 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2739 bkref_str_idx);
2741 /* Reload buf, since the preceding call might have reallocated
2742 the buffer. */
2743 buf = (const char *) re_string_get_buffer (&mctx->input);
2745 if (err == REG_NOMATCH)
2746 continue;
2747 if (BE (err != REG_NOERROR, 0))
2748 return err;
2751 if (sub_last_idx < sub_top->nlasts)
2752 continue;
2753 if (sub_last_idx > 0)
2754 ++sl_str;
2755 /* Then, search for the other last nodes of the sub expression. */
2756 for (; sl_str <= bkref_str_idx; ++sl_str)
2758 int cls_node, sl_str_off;
2759 const re_node_set *nodes;
2760 sl_str_off = sl_str - sub_top->str_idx;
2761 /* The matched string by the sub expression match with the substring
2762 at the back reference? */
2763 if (sl_str_off > 0)
2765 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2767 /* If we are at the end of the input, we cannot match. */
2768 if (bkref_str_off >= mctx->input.len)
2769 break;
2771 err = extend_buffers (mctx, bkref_str_off + 1);
2772 if (BE (err != REG_NOERROR, 0))
2773 return err;
2775 buf = (const char *) re_string_get_buffer (&mctx->input);
2777 if (buf [bkref_str_off++] != buf[sl_str - 1])
2778 break; /* We don't need to search this sub expression
2779 any more. */
2781 if (mctx->state_log[sl_str] == NULL)
2782 continue;
2783 /* Does this state have a ')' of the sub expression? */
2784 nodes = &mctx->state_log[sl_str]->nodes;
2785 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2786 OP_CLOSE_SUBEXP);
2787 if (cls_node == -1)
2788 continue; /* No. */
2789 if (sub_top->path == NULL)
2791 sub_top->path = calloc (sizeof (state_array_t),
2792 sl_str - sub_top->str_idx + 1);
2793 if (sub_top->path == NULL)
2794 return REG_ESPACE;
2796 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2797 in the current context? */
2798 err = check_arrival (mctx, sub_top->path, sub_top->node,
2799 sub_top->str_idx, cls_node, sl_str,
2800 OP_CLOSE_SUBEXP);
2801 if (err == REG_NOMATCH)
2802 continue;
2803 if (BE (err != REG_NOERROR, 0))
2804 return err;
2805 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2806 if (BE (sub_last == NULL, 0))
2807 return REG_ESPACE;
2808 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2809 bkref_str_idx);
2810 if (err == REG_NOMATCH)
2811 continue;
2814 return REG_NOERROR;
2817 /* Helper functions for get_subexp(). */
2819 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2820 If it can arrive, register the sub expression expressed with SUB_TOP
2821 and SUB_LAST. */
2823 static reg_errcode_t
2824 internal_function
2825 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2826 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2828 reg_errcode_t err;
2829 int to_idx;
2830 /* Can the subexpression arrive the back reference? */
2831 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2832 sub_last->str_idx, bkref_node, bkref_str,
2833 OP_OPEN_SUBEXP);
2834 if (err != REG_NOERROR)
2835 return err;
2836 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2837 sub_last->str_idx);
2838 if (BE (err != REG_NOERROR, 0))
2839 return err;
2840 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2841 return clean_state_log_if_needed (mctx, to_idx);
2844 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2845 Search '(' if FL_OPEN, or search ')' otherwise.
2846 TODO: This function isn't efficient...
2847 Because there might be more than one nodes whose types are
2848 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2849 nodes.
2850 E.g. RE: (a){2} */
2852 static int
2853 internal_function
2854 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2855 int subexp_idx, int type)
2857 int cls_idx;
2858 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2860 int cls_node = nodes->elems[cls_idx];
2861 const re_token_t *node = dfa->nodes + cls_node;
2862 if (node->type == type
2863 && node->opr.idx == subexp_idx)
2864 return cls_node;
2866 return -1;
2869 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2870 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2871 heavily reused.
2872 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2874 static reg_errcode_t
2875 internal_function __attribute_warn_unused_result__
2876 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2877 int top_str, int last_node, int last_str, int type)
2879 const re_dfa_t *const dfa = mctx->dfa;
2880 reg_errcode_t err = REG_NOERROR;
2881 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2882 re_dfastate_t *cur_state = NULL;
2883 re_node_set *cur_nodes, next_nodes;
2884 re_dfastate_t **backup_state_log;
2885 unsigned int context;
2887 subexp_num = dfa->nodes[top_node].opr.idx;
2888 /* Extend the buffer if we need. */
2889 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2891 re_dfastate_t **new_array;
2892 int old_alloc = path->alloc;
2893 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2894 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2895 if (BE (new_array == NULL, 0))
2897 path->alloc = old_alloc;
2898 return REG_ESPACE;
2900 path->array = new_array;
2901 memset (new_array + old_alloc, '\0',
2902 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2905 str_idx = path->next_idx ?: top_str;
2907 /* Temporary modify MCTX. */
2908 backup_state_log = mctx->state_log;
2909 backup_cur_idx = mctx->input.cur_idx;
2910 mctx->state_log = path->array;
2911 mctx->input.cur_idx = str_idx;
2913 /* Setup initial node set. */
2914 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2915 if (str_idx == top_str)
2917 err = re_node_set_init_1 (&next_nodes, top_node);
2918 if (BE (err != REG_NOERROR, 0))
2919 return err;
2920 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2921 if (BE (err != REG_NOERROR, 0))
2923 re_node_set_free (&next_nodes);
2924 return err;
2927 else
2929 cur_state = mctx->state_log[str_idx];
2930 if (cur_state && cur_state->has_backref)
2932 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2933 if (BE (err != REG_NOERROR, 0))
2934 return err;
2936 else
2937 re_node_set_init_empty (&next_nodes);
2939 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2941 if (next_nodes.nelem)
2943 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2944 subexp_num, type);
2945 if (BE (err != REG_NOERROR, 0))
2947 re_node_set_free (&next_nodes);
2948 return err;
2951 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2952 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2954 re_node_set_free (&next_nodes);
2955 return err;
2957 mctx->state_log[str_idx] = cur_state;
2960 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2962 re_node_set_empty (&next_nodes);
2963 if (mctx->state_log[str_idx + 1])
2965 err = re_node_set_merge (&next_nodes,
2966 &mctx->state_log[str_idx + 1]->nodes);
2967 if (BE (err != REG_NOERROR, 0))
2969 re_node_set_free (&next_nodes);
2970 return err;
2973 if (cur_state)
2975 err = check_arrival_add_next_nodes (mctx, str_idx,
2976 &cur_state->non_eps_nodes,
2977 &next_nodes);
2978 if (BE (err != REG_NOERROR, 0))
2980 re_node_set_free (&next_nodes);
2981 return err;
2984 ++str_idx;
2985 if (next_nodes.nelem)
2987 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2988 if (BE (err != REG_NOERROR, 0))
2990 re_node_set_free (&next_nodes);
2991 return err;
2993 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2994 subexp_num, type);
2995 if (BE (err != REG_NOERROR, 0))
2997 re_node_set_free (&next_nodes);
2998 return err;
3001 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3002 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3003 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3005 re_node_set_free (&next_nodes);
3006 return err;
3008 mctx->state_log[str_idx] = cur_state;
3009 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3011 re_node_set_free (&next_nodes);
3012 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3013 : &mctx->state_log[last_str]->nodes);
3014 path->next_idx = str_idx;
3016 /* Fix MCTX. */
3017 mctx->state_log = backup_state_log;
3018 mctx->input.cur_idx = backup_cur_idx;
3020 /* Then check the current node set has the node LAST_NODE. */
3021 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3022 return REG_NOERROR;
3024 return REG_NOMATCH;
3027 /* Helper functions for check_arrival. */
3029 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3030 to NEXT_NODES.
3031 TODO: This function is similar to the functions transit_state*(),
3032 however this function has many additional works.
3033 Can't we unify them? */
3035 static reg_errcode_t
3036 internal_function __attribute_warn_unused_result__
3037 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3038 re_node_set *cur_nodes, re_node_set *next_nodes)
3040 const re_dfa_t *const dfa = mctx->dfa;
3041 int result;
3042 int cur_idx;
3043 reg_errcode_t err = REG_NOERROR;
3044 re_node_set union_set;
3045 re_node_set_init_empty (&union_set);
3046 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3048 int naccepted = 0;
3049 int cur_node = cur_nodes->elems[cur_idx];
3050 #ifdef DEBUG
3051 re_token_type_t type = dfa->nodes[cur_node].type;
3052 assert (!IS_EPSILON_NODE (type));
3053 #endif
3054 #ifdef RE_ENABLE_I18N
3055 /* If the node may accept `multi byte'. */
3056 if (dfa->nodes[cur_node].accept_mb)
3058 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3059 str_idx);
3060 if (naccepted > 1)
3062 re_dfastate_t *dest_state;
3063 int next_node = dfa->nexts[cur_node];
3064 int next_idx = str_idx + naccepted;
3065 dest_state = mctx->state_log[next_idx];
3066 re_node_set_empty (&union_set);
3067 if (dest_state)
3069 err = re_node_set_merge (&union_set, &dest_state->nodes);
3070 if (BE (err != REG_NOERROR, 0))
3072 re_node_set_free (&union_set);
3073 return err;
3076 result = re_node_set_insert (&union_set, next_node);
3077 if (BE (result < 0, 0))
3079 re_node_set_free (&union_set);
3080 return REG_ESPACE;
3082 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3083 &union_set);
3084 if (BE (mctx->state_log[next_idx] == NULL
3085 && err != REG_NOERROR, 0))
3087 re_node_set_free (&union_set);
3088 return err;
3092 #endif /* RE_ENABLE_I18N */
3093 if (naccepted
3094 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3096 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3097 if (BE (result < 0, 0))
3099 re_node_set_free (&union_set);
3100 return REG_ESPACE;
3104 re_node_set_free (&union_set);
3105 return REG_NOERROR;
3108 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3109 CUR_NODES, however exclude the nodes which are:
3110 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3111 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3114 static reg_errcode_t
3115 internal_function
3116 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3117 int ex_subexp, int type)
3119 reg_errcode_t err;
3120 int idx, outside_node;
3121 re_node_set new_nodes;
3122 #ifdef DEBUG
3123 assert (cur_nodes->nelem);
3124 #endif
3125 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3126 if (BE (err != REG_NOERROR, 0))
3127 return err;
3128 /* Create a new node set NEW_NODES with the nodes which are epsilon
3129 closures of the node in CUR_NODES. */
3131 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3133 int cur_node = cur_nodes->elems[idx];
3134 const re_node_set *eclosure = dfa->eclosures + cur_node;
3135 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3136 if (outside_node == -1)
3138 /* There are no problematic nodes, just merge them. */
3139 err = re_node_set_merge (&new_nodes, eclosure);
3140 if (BE (err != REG_NOERROR, 0))
3142 re_node_set_free (&new_nodes);
3143 return err;
3146 else
3148 /* There are problematic nodes, re-calculate incrementally. */
3149 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3150 ex_subexp, type);
3151 if (BE (err != REG_NOERROR, 0))
3153 re_node_set_free (&new_nodes);
3154 return err;
3158 re_node_set_free (cur_nodes);
3159 *cur_nodes = new_nodes;
3160 return REG_NOERROR;
3163 /* Helper function for check_arrival_expand_ecl.
3164 Check incrementally the epsilon closure of TARGET, and if it isn't
3165 problematic append it to DST_NODES. */
3167 static reg_errcode_t
3168 internal_function __attribute_warn_unused_result__
3169 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3170 int target, int ex_subexp, int type)
3172 int cur_node;
3173 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3175 int err;
3177 if (dfa->nodes[cur_node].type == type
3178 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3180 if (type == OP_CLOSE_SUBEXP)
3182 err = re_node_set_insert (dst_nodes, cur_node);
3183 if (BE (err == -1, 0))
3184 return REG_ESPACE;
3186 break;
3188 err = re_node_set_insert (dst_nodes, cur_node);
3189 if (BE (err == -1, 0))
3190 return REG_ESPACE;
3191 if (dfa->edests[cur_node].nelem == 0)
3192 break;
3193 if (dfa->edests[cur_node].nelem == 2)
3195 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3196 dfa->edests[cur_node].elems[1],
3197 ex_subexp, type);
3198 if (BE (err != REG_NOERROR, 0))
3199 return err;
3201 cur_node = dfa->edests[cur_node].elems[0];
3203 return REG_NOERROR;
3207 /* For all the back references in the current state, calculate the
3208 destination of the back references by the appropriate entry
3209 in MCTX->BKREF_ENTS. */
3211 static reg_errcode_t
3212 internal_function __attribute_warn_unused_result__
3213 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3214 int cur_str, int subexp_num, int type)
3216 const re_dfa_t *const dfa = mctx->dfa;
3217 reg_errcode_t err;
3218 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3219 struct re_backref_cache_entry *ent;
3221 if (cache_idx_start == -1)
3222 return REG_NOERROR;
3224 restart:
3225 ent = mctx->bkref_ents + cache_idx_start;
3228 int to_idx, next_node;
3230 /* Is this entry ENT is appropriate? */
3231 if (!re_node_set_contains (cur_nodes, ent->node))
3232 continue; /* No. */
3234 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3235 /* Calculate the destination of the back reference, and append it
3236 to MCTX->STATE_LOG. */
3237 if (to_idx == cur_str)
3239 /* The backreference did epsilon transit, we must re-check all the
3240 node in the current state. */
3241 re_node_set new_dests;
3242 reg_errcode_t err2, err3;
3243 next_node = dfa->edests[ent->node].elems[0];
3244 if (re_node_set_contains (cur_nodes, next_node))
3245 continue;
3246 err = re_node_set_init_1 (&new_dests, next_node);
3247 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3248 err3 = re_node_set_merge (cur_nodes, &new_dests);
3249 re_node_set_free (&new_dests);
3250 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3251 || err3 != REG_NOERROR, 0))
3253 err = (err != REG_NOERROR ? err
3254 : (err2 != REG_NOERROR ? err2 : err3));
3255 return err;
3257 /* TODO: It is still inefficient... */
3258 goto restart;
3260 else
3262 re_node_set union_set;
3263 next_node = dfa->nexts[ent->node];
3264 if (mctx->state_log[to_idx])
3266 int ret;
3267 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3268 next_node))
3269 continue;
3270 err = re_node_set_init_copy (&union_set,
3271 &mctx->state_log[to_idx]->nodes);
3272 ret = re_node_set_insert (&union_set, next_node);
3273 if (BE (err != REG_NOERROR || ret < 0, 0))
3275 re_node_set_free (&union_set);
3276 err = err != REG_NOERROR ? err : REG_ESPACE;
3277 return err;
3280 else
3282 err = re_node_set_init_1 (&union_set, next_node);
3283 if (BE (err != REG_NOERROR, 0))
3284 return err;
3286 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3287 re_node_set_free (&union_set);
3288 if (BE (mctx->state_log[to_idx] == NULL
3289 && err != REG_NOERROR, 0))
3290 return err;
3293 while (ent++->more);
3294 return REG_NOERROR;
3297 /* Build transition table for the state.
3298 Return 1 if succeeded, otherwise return NULL. */
3300 static int
3301 internal_function
3302 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3304 reg_errcode_t err;
3305 int i, j, ch, need_word_trtable = 0;
3306 bitset_word_t elem, mask;
3307 bool dests_node_malloced = false;
3308 bool dest_states_malloced = false;
3309 int ndests; /* Number of the destination states from `state'. */
3310 re_dfastate_t **trtable;
3311 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3312 re_node_set follows, *dests_node;
3313 bitset_t *dests_ch;
3314 bitset_t acceptable;
3316 struct dests_alloc
3318 re_node_set dests_node[SBC_MAX];
3319 bitset_t dests_ch[SBC_MAX];
3320 } *dests_alloc;
3322 /* We build DFA states which corresponds to the destination nodes
3323 from `state'. `dests_node[i]' represents the nodes which i-th
3324 destination state contains, and `dests_ch[i]' represents the
3325 characters which i-th destination state accepts. */
3326 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3327 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3328 else
3330 dests_alloc = re_malloc (struct dests_alloc, 1);
3331 if (BE (dests_alloc == NULL, 0))
3332 return 0;
3333 dests_node_malloced = true;
3335 dests_node = dests_alloc->dests_node;
3336 dests_ch = dests_alloc->dests_ch;
3338 /* Initialize transiton table. */
3339 state->word_trtable = state->trtable = NULL;
3341 /* At first, group all nodes belonging to `state' into several
3342 destinations. */
3343 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3344 if (BE (ndests <= 0, 0))
3346 if (dests_node_malloced)
3347 free (dests_alloc);
3348 /* Return 0 in case of an error, 1 otherwise. */
3349 if (ndests == 0)
3351 state->trtable = (re_dfastate_t **)
3352 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3353 if (BE (state->trtable == NULL, 0))
3354 return 0;
3355 return 1;
3357 return 0;
3360 err = re_node_set_alloc (&follows, ndests + 1);
3361 if (BE (err != REG_NOERROR, 0))
3362 goto out_free;
3364 /* Avoid arithmetic overflow in size calculation. */
3365 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3366 / (3 * sizeof (re_dfastate_t *)))
3367 < ndests),
3369 goto out_free;
3371 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3372 + ndests * 3 * sizeof (re_dfastate_t *)))
3373 dest_states = (re_dfastate_t **)
3374 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3375 else
3377 dest_states = (re_dfastate_t **)
3378 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3379 if (BE (dest_states == NULL, 0))
3381 out_free:
3382 if (dest_states_malloced)
3383 free (dest_states);
3384 re_node_set_free (&follows);
3385 for (i = 0; i < ndests; ++i)
3386 re_node_set_free (dests_node + i);
3387 if (dests_node_malloced)
3388 free (dests_alloc);
3389 return 0;
3391 dest_states_malloced = true;
3393 dest_states_word = dest_states + ndests;
3394 dest_states_nl = dest_states_word + ndests;
3395 bitset_empty (acceptable);
3397 /* Then build the states for all destinations. */
3398 for (i = 0; i < ndests; ++i)
3400 int next_node;
3401 re_node_set_empty (&follows);
3402 /* Merge the follows of this destination states. */
3403 for (j = 0; j < dests_node[i].nelem; ++j)
3405 next_node = dfa->nexts[dests_node[i].elems[j]];
3406 if (next_node != -1)
3408 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3409 if (BE (err != REG_NOERROR, 0))
3410 goto out_free;
3413 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3414 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3415 goto out_free;
3416 /* If the new state has context constraint,
3417 build appropriate states for these contexts. */
3418 if (dest_states[i]->has_constraint)
3420 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3421 CONTEXT_WORD);
3422 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3423 goto out_free;
3425 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3426 need_word_trtable = 1;
3428 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3429 CONTEXT_NEWLINE);
3430 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3431 goto out_free;
3433 else
3435 dest_states_word[i] = dest_states[i];
3436 dest_states_nl[i] = dest_states[i];
3438 bitset_merge (acceptable, dests_ch[i]);
3441 if (!BE (need_word_trtable, 0))
3443 /* We don't care about whether the following character is a word
3444 character, or we are in a single-byte character set so we can
3445 discern by looking at the character code: allocate a
3446 256-entry transition table. */
3447 trtable = state->trtable =
3448 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3449 if (BE (trtable == NULL, 0))
3450 goto out_free;
3452 /* For all characters ch...: */
3453 for (i = 0; i < BITSET_WORDS; ++i)
3454 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3455 elem;
3456 mask <<= 1, elem >>= 1, ++ch)
3457 if (BE (elem & 1, 0))
3459 /* There must be exactly one destination which accepts
3460 character ch. See group_nodes_into_DFAstates. */
3461 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3464 /* j-th destination accepts the word character ch. */
3465 if (dfa->word_char[i] & mask)
3466 trtable[ch] = dest_states_word[j];
3467 else
3468 trtable[ch] = dest_states[j];
3471 else
3473 /* We care about whether the following character is a word
3474 character, and we are in a multi-byte character set: discern
3475 by looking at the character code: build two 256-entry
3476 transition tables, one starting at trtable[0] and one
3477 starting at trtable[SBC_MAX]. */
3478 trtable = state->word_trtable =
3479 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3480 if (BE (trtable == NULL, 0))
3481 goto out_free;
3483 /* For all characters ch...: */
3484 for (i = 0; i < BITSET_WORDS; ++i)
3485 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3486 elem;
3487 mask <<= 1, elem >>= 1, ++ch)
3488 if (BE (elem & 1, 0))
3490 /* There must be exactly one destination which accepts
3491 character ch. See group_nodes_into_DFAstates. */
3492 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3495 /* j-th destination accepts the word character ch. */
3496 trtable[ch] = dest_states[j];
3497 trtable[ch + SBC_MAX] = dest_states_word[j];
3501 /* new line */
3502 if (bitset_contain (acceptable, NEWLINE_CHAR))
3504 /* The current state accepts newline character. */
3505 for (j = 0; j < ndests; ++j)
3506 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3508 /* k-th destination accepts newline character. */
3509 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3510 if (need_word_trtable)
3511 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3512 /* There must be only one destination which accepts
3513 newline. See group_nodes_into_DFAstates. */
3514 break;
3518 if (dest_states_malloced)
3519 free (dest_states);
3521 re_node_set_free (&follows);
3522 for (i = 0; i < ndests; ++i)
3523 re_node_set_free (dests_node + i);
3525 if (dests_node_malloced)
3526 free (dests_alloc);
3528 return 1;
3531 /* Group all nodes belonging to STATE into several destinations.
3532 Then for all destinations, set the nodes belonging to the destination
3533 to DESTS_NODE[i] and set the characters accepted by the destination
3534 to DEST_CH[i]. This function return the number of destinations. */
3536 static int
3537 internal_function
3538 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3539 re_node_set *dests_node, bitset_t *dests_ch)
3541 reg_errcode_t err;
3542 int result;
3543 int i, j, k;
3544 int ndests; /* Number of the destinations from `state'. */
3545 bitset_t accepts; /* Characters a node can accept. */
3546 const re_node_set *cur_nodes = &state->nodes;
3547 bitset_empty (accepts);
3548 ndests = 0;
3550 /* For all the nodes belonging to `state', */
3551 for (i = 0; i < cur_nodes->nelem; ++i)
3553 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3554 re_token_type_t type = node->type;
3555 unsigned int constraint = node->constraint;
3557 /* Enumerate all single byte character this node can accept. */
3558 if (type == CHARACTER)
3559 bitset_set (accepts, node->opr.c);
3560 else if (type == SIMPLE_BRACKET)
3562 bitset_merge (accepts, node->opr.sbcset);
3564 else if (type == OP_PERIOD)
3566 #ifdef RE_ENABLE_I18N
3567 if (dfa->mb_cur_max > 1)
3568 bitset_merge (accepts, dfa->sb_char);
3569 else
3570 #endif
3571 bitset_set_all (accepts);
3572 if (!(dfa->syntax & RE_DOT_NEWLINE))
3573 bitset_clear (accepts, '\n');
3574 if (dfa->syntax & RE_DOT_NOT_NULL)
3575 bitset_clear (accepts, '\0');
3577 #ifdef RE_ENABLE_I18N
3578 else if (type == OP_UTF8_PERIOD)
3580 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3581 if (!(dfa->syntax & RE_DOT_NEWLINE))
3582 bitset_clear (accepts, '\n');
3583 if (dfa->syntax & RE_DOT_NOT_NULL)
3584 bitset_clear (accepts, '\0');
3586 #endif
3587 else
3588 continue;
3590 /* Check the `accepts' and sift the characters which are not
3591 match it the context. */
3592 if (constraint)
3594 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3596 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3597 bitset_empty (accepts);
3598 if (accepts_newline)
3599 bitset_set (accepts, NEWLINE_CHAR);
3600 else
3601 continue;
3603 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3605 bitset_empty (accepts);
3606 continue;
3609 if (constraint & NEXT_WORD_CONSTRAINT)
3611 bitset_word_t any_set = 0;
3612 if (type == CHARACTER && !node->word_char)
3614 bitset_empty (accepts);
3615 continue;
3617 #ifdef RE_ENABLE_I18N
3618 if (dfa->mb_cur_max > 1)
3619 for (j = 0; j < BITSET_WORDS; ++j)
3620 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3621 else
3622 #endif
3623 for (j = 0; j < BITSET_WORDS; ++j)
3624 any_set |= (accepts[j] &= dfa->word_char[j]);
3625 if (!any_set)
3626 continue;
3628 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3630 bitset_word_t any_set = 0;
3631 if (type == CHARACTER && node->word_char)
3633 bitset_empty (accepts);
3634 continue;
3636 #ifdef RE_ENABLE_I18N
3637 if (dfa->mb_cur_max > 1)
3638 for (j = 0; j < BITSET_WORDS; ++j)
3639 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3640 else
3641 #endif
3642 for (j = 0; j < BITSET_WORDS; ++j)
3643 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3644 if (!any_set)
3645 continue;
3649 /* Then divide `accepts' into DFA states, or create a new
3650 state. Above, we make sure that accepts is not empty. */
3651 for (j = 0; j < ndests; ++j)
3653 bitset_t intersec; /* Intersection sets, see below. */
3654 bitset_t remains;
3655 /* Flags, see below. */
3656 bitset_word_t has_intersec, not_subset, not_consumed;
3658 /* Optimization, skip if this state doesn't accept the character. */
3659 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3660 continue;
3662 /* Enumerate the intersection set of this state and `accepts'. */
3663 has_intersec = 0;
3664 for (k = 0; k < BITSET_WORDS; ++k)
3665 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3666 /* And skip if the intersection set is empty. */
3667 if (!has_intersec)
3668 continue;
3670 /* Then check if this state is a subset of `accepts'. */
3671 not_subset = not_consumed = 0;
3672 for (k = 0; k < BITSET_WORDS; ++k)
3674 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3675 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3678 /* If this state isn't a subset of `accepts', create a
3679 new group state, which has the `remains'. */
3680 if (not_subset)
3682 bitset_copy (dests_ch[ndests], remains);
3683 bitset_copy (dests_ch[j], intersec);
3684 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3685 if (BE (err != REG_NOERROR, 0))
3686 goto error_return;
3687 ++ndests;
3690 /* Put the position in the current group. */
3691 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3692 if (BE (result < 0, 0))
3693 goto error_return;
3695 /* If all characters are consumed, go to next node. */
3696 if (!not_consumed)
3697 break;
3699 /* Some characters remain, create a new group. */
3700 if (j == ndests)
3702 bitset_copy (dests_ch[ndests], accepts);
3703 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3704 if (BE (err != REG_NOERROR, 0))
3705 goto error_return;
3706 ++ndests;
3707 bitset_empty (accepts);
3710 return ndests;
3711 error_return:
3712 for (j = 0; j < ndests; ++j)
3713 re_node_set_free (dests_node + j);
3714 return -1;
3717 #ifdef RE_ENABLE_I18N
3718 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3719 Return the number of the bytes the node accepts.
3720 STR_IDX is the current index of the input string.
3722 This function handles the nodes which can accept one character, or
3723 one collating element like '.', '[a-z]', opposite to the other nodes
3724 can only accept one byte. */
3726 # ifdef _LIBC
3727 # include <locale/weight.h>
3728 # endif
3730 static int
3731 internal_function
3732 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3733 const re_string_t *input, int str_idx)
3735 const re_token_t *node = dfa->nodes + node_idx;
3736 int char_len, elem_len;
3737 int i;
3739 if (BE (node->type == OP_UTF8_PERIOD, 0))
3741 unsigned char c = re_string_byte_at (input, str_idx), d;
3742 if (BE (c < 0xc2, 1))
3743 return 0;
3745 if (str_idx + 2 > input->len)
3746 return 0;
3748 d = re_string_byte_at (input, str_idx + 1);
3749 if (c < 0xe0)
3750 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3751 else if (c < 0xf0)
3753 char_len = 3;
3754 if (c == 0xe0 && d < 0xa0)
3755 return 0;
3757 else if (c < 0xf8)
3759 char_len = 4;
3760 if (c == 0xf0 && d < 0x90)
3761 return 0;
3763 else if (c < 0xfc)
3765 char_len = 5;
3766 if (c == 0xf8 && d < 0x88)
3767 return 0;
3769 else if (c < 0xfe)
3771 char_len = 6;
3772 if (c == 0xfc && d < 0x84)
3773 return 0;
3775 else
3776 return 0;
3778 if (str_idx + char_len > input->len)
3779 return 0;
3781 for (i = 1; i < char_len; ++i)
3783 d = re_string_byte_at (input, str_idx + i);
3784 if (d < 0x80 || d > 0xbf)
3785 return 0;
3787 return char_len;
3790 char_len = re_string_char_size_at (input, str_idx);
3791 if (node->type == OP_PERIOD)
3793 if (char_len <= 1)
3794 return 0;
3795 /* FIXME: I don't think this if is needed, as both '\n'
3796 and '\0' are char_len == 1. */
3797 /* '.' accepts any one character except the following two cases. */
3798 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3799 re_string_byte_at (input, str_idx) == '\n') ||
3800 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3801 re_string_byte_at (input, str_idx) == '\0'))
3802 return 0;
3803 return char_len;
3806 elem_len = re_string_elem_size_at (input, str_idx);
3807 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3808 return 0;
3810 if (node->type == COMPLEX_BRACKET)
3812 const re_charset_t *cset = node->opr.mbcset;
3813 # ifdef _LIBC
3814 const unsigned char *pin
3815 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3816 int j;
3817 uint32_t nrules;
3818 # endif /* _LIBC */
3819 int match_len = 0;
3820 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3821 ? re_string_wchar_at (input, str_idx) : 0);
3823 /* match with multibyte character? */
3824 for (i = 0; i < cset->nmbchars; ++i)
3825 if (wc == cset->mbchars[i])
3827 match_len = char_len;
3828 goto check_node_accept_bytes_match;
3830 /* match with character_class? */
3831 for (i = 0; i < cset->nchar_classes; ++i)
3833 wctype_t wt = cset->char_classes[i];
3834 if (__iswctype (wc, wt))
3836 match_len = char_len;
3837 goto check_node_accept_bytes_match;
3841 # ifdef _LIBC
3842 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3843 if (nrules != 0)
3845 unsigned int in_collseq = 0;
3846 const int32_t *table, *indirect;
3847 const unsigned char *weights, *extra;
3848 const char *collseqwc;
3850 /* match with collating_symbol? */
3851 if (cset->ncoll_syms)
3852 extra = (const unsigned char *)
3853 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3854 for (i = 0; i < cset->ncoll_syms; ++i)
3856 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3857 /* Compare the length of input collating element and
3858 the length of current collating element. */
3859 if (*coll_sym != elem_len)
3860 continue;
3861 /* Compare each bytes. */
3862 for (j = 0; j < *coll_sym; j++)
3863 if (pin[j] != coll_sym[1 + j])
3864 break;
3865 if (j == *coll_sym)
3867 /* Match if every bytes is equal. */
3868 match_len = j;
3869 goto check_node_accept_bytes_match;
3873 if (cset->nranges)
3875 if (elem_len <= char_len)
3877 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3878 in_collseq = __collseq_table_lookup (collseqwc, wc);
3880 else
3881 in_collseq = find_collation_sequence_value (pin, elem_len);
3883 /* match with range expression? */
3884 for (i = 0; i < cset->nranges; ++i)
3885 if (cset->range_starts[i] <= in_collseq
3886 && in_collseq <= cset->range_ends[i])
3888 match_len = elem_len;
3889 goto check_node_accept_bytes_match;
3892 /* match with equivalence_class? */
3893 if (cset->nequiv_classes)
3895 const unsigned char *cp = pin;
3896 table = (const int32_t *)
3897 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3898 weights = (const unsigned char *)
3899 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3900 extra = (const unsigned char *)
3901 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3902 indirect = (const int32_t *)
3903 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3904 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3905 if (idx > 0)
3906 for (i = 0; i < cset->nequiv_classes; ++i)
3908 int32_t equiv_class_idx = cset->equiv_classes[i];
3909 size_t weight_len = weights[idx & 0xffffff];
3910 if (weight_len == weights[equiv_class_idx & 0xffffff]
3911 && (idx >> 24) == (equiv_class_idx >> 24))
3913 int cnt = 0;
3915 idx &= 0xffffff;
3916 equiv_class_idx &= 0xffffff;
3918 while (cnt <= weight_len
3919 && (weights[equiv_class_idx + 1 + cnt]
3920 == weights[idx + 1 + cnt]))
3921 ++cnt;
3922 if (cnt > weight_len)
3924 match_len = elem_len;
3925 goto check_node_accept_bytes_match;
3931 else
3932 # endif /* _LIBC */
3934 /* match with range expression? */
3935 #if __GNUC__ >= 2
3936 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3937 #else
3938 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3939 cmp_buf[2] = wc;
3940 #endif
3941 for (i = 0; i < cset->nranges; ++i)
3943 cmp_buf[0] = cset->range_starts[i];
3944 cmp_buf[4] = cset->range_ends[i];
3945 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
3946 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3948 match_len = char_len;
3949 goto check_node_accept_bytes_match;
3953 check_node_accept_bytes_match:
3954 if (!cset->non_match)
3955 return match_len;
3956 else
3958 if (match_len > 0)
3959 return 0;
3960 else
3961 return (elem_len > char_len) ? elem_len : char_len;
3964 return 0;
3967 # ifdef _LIBC
3968 static unsigned int
3969 internal_function
3970 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3972 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3973 if (nrules == 0)
3975 if (mbs_len == 1)
3977 /* No valid character. Match it as a single byte character. */
3978 const unsigned char *collseq = (const unsigned char *)
3979 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3980 return collseq[mbs[0]];
3982 return UINT_MAX;
3984 else
3986 int32_t idx;
3987 const unsigned char *extra = (const unsigned char *)
3988 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3989 int32_t extrasize = (const unsigned char *)
3990 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3992 for (idx = 0; idx < extrasize;)
3994 int mbs_cnt, found = 0;
3995 int32_t elem_mbs_len;
3996 /* Skip the name of collating element name. */
3997 idx = idx + extra[idx] + 1;
3998 elem_mbs_len = extra[idx++];
3999 if (mbs_len == elem_mbs_len)
4001 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4002 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4003 break;
4004 if (mbs_cnt == elem_mbs_len)
4005 /* Found the entry. */
4006 found = 1;
4008 /* Skip the byte sequence of the collating element. */
4009 idx += elem_mbs_len;
4010 /* Adjust for the alignment. */
4011 idx = (idx + 3) & ~3;
4012 /* Skip the collation sequence value. */
4013 idx += sizeof (uint32_t);
4014 /* Skip the wide char sequence of the collating element. */
4015 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4016 /* If we found the entry, return the sequence value. */
4017 if (found)
4018 return *(uint32_t *) (extra + idx);
4019 /* Skip the collation sequence value. */
4020 idx += sizeof (uint32_t);
4022 return UINT_MAX;
4025 # endif /* _LIBC */
4026 #endif /* RE_ENABLE_I18N */
4028 /* Check whether the node accepts the byte which is IDX-th
4029 byte of the INPUT. */
4031 static int
4032 internal_function
4033 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4034 int idx)
4036 unsigned char ch;
4037 ch = re_string_byte_at (&mctx->input, idx);
4038 switch (node->type)
4040 case CHARACTER:
4041 if (node->opr.c != ch)
4042 return 0;
4043 break;
4045 case SIMPLE_BRACKET:
4046 if (!bitset_contain (node->opr.sbcset, ch))
4047 return 0;
4048 break;
4050 #ifdef RE_ENABLE_I18N
4051 case OP_UTF8_PERIOD:
4052 if (ch >= 0x80)
4053 return 0;
4054 /* FALLTHROUGH */
4055 #endif
4056 case OP_PERIOD:
4057 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4058 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4059 return 0;
4060 break;
4062 default:
4063 return 0;
4066 if (node->constraint)
4068 /* The node has constraints. Check whether the current context
4069 satisfies the constraints. */
4070 unsigned int context = re_string_context_at (&mctx->input, idx,
4071 mctx->eflags);
4072 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4073 return 0;
4076 return 1;
4079 /* Extend the buffers, if the buffers have run out. */
4081 static reg_errcode_t
4082 internal_function __attribute_warn_unused_result__
4083 extend_buffers (re_match_context_t *mctx, int min_len)
4085 reg_errcode_t ret;
4086 re_string_t *pstr = &mctx->input;
4088 /* Avoid overflow. */
4089 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4090 return REG_ESPACE;
4092 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4093 ret = re_string_realloc_buffers (pstr,
4094 MAX (min_len,
4095 MIN (pstr->len, pstr->bufs_len * 2)));
4096 if (BE (ret != REG_NOERROR, 0))
4097 return ret;
4099 if (mctx->state_log != NULL)
4101 /* And double the length of state_log. */
4102 /* XXX We have no indication of the size of this buffer. If this
4103 allocation fail we have no indication that the state_log array
4104 does not have the right size. */
4105 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4106 pstr->bufs_len + 1);
4107 if (BE (new_array == NULL, 0))
4108 return REG_ESPACE;
4109 mctx->state_log = new_array;
4112 /* Then reconstruct the buffers. */
4113 if (pstr->icase)
4115 #ifdef RE_ENABLE_I18N
4116 if (pstr->mb_cur_max > 1)
4118 ret = build_wcs_upper_buffer (pstr);
4119 if (BE (ret != REG_NOERROR, 0))
4120 return ret;
4122 else
4123 #endif /* RE_ENABLE_I18N */
4124 build_upper_buffer (pstr);
4126 else
4128 #ifdef RE_ENABLE_I18N
4129 if (pstr->mb_cur_max > 1)
4130 build_wcs_buffer (pstr);
4131 else
4132 #endif /* RE_ENABLE_I18N */
4134 if (pstr->trans != NULL)
4135 re_string_translate_buffer (pstr);
4138 return REG_NOERROR;
4142 /* Functions for matching context. */
4144 /* Initialize MCTX. */
4146 static reg_errcode_t
4147 internal_function __attribute_warn_unused_result__
4148 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4150 mctx->eflags = eflags;
4151 mctx->match_last = -1;
4152 if (n > 0)
4154 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4155 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4156 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4157 return REG_ESPACE;
4159 /* Already zero-ed by the caller.
4160 else
4161 mctx->bkref_ents = NULL;
4162 mctx->nbkref_ents = 0;
4163 mctx->nsub_tops = 0; */
4164 mctx->abkref_ents = n;
4165 mctx->max_mb_elem_len = 1;
4166 mctx->asub_tops = n;
4167 return REG_NOERROR;
4170 /* Clean the entries which depend on the current input in MCTX.
4171 This function must be invoked when the matcher changes the start index
4172 of the input, or changes the input string. */
4174 static void
4175 internal_function
4176 match_ctx_clean (re_match_context_t *mctx)
4178 int st_idx;
4179 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4181 int sl_idx;
4182 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4183 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4185 re_sub_match_last_t *last = top->lasts[sl_idx];
4186 re_free (last->path.array);
4187 re_free (last);
4189 re_free (top->lasts);
4190 if (top->path)
4192 re_free (top->path->array);
4193 re_free (top->path);
4195 free (top);
4198 mctx->nsub_tops = 0;
4199 mctx->nbkref_ents = 0;
4202 /* Free all the memory associated with MCTX. */
4204 static void
4205 internal_function
4206 match_ctx_free (re_match_context_t *mctx)
4208 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4209 match_ctx_clean (mctx);
4210 re_free (mctx->sub_tops);
4211 re_free (mctx->bkref_ents);
4214 /* Add a new backreference entry to MCTX.
4215 Note that we assume that caller never call this function with duplicate
4216 entry, and call with STR_IDX which isn't smaller than any existing entry.
4219 static reg_errcode_t
4220 internal_function __attribute_warn_unused_result__
4221 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4222 int to)
4224 if (mctx->nbkref_ents >= mctx->abkref_ents)
4226 struct re_backref_cache_entry* new_entry;
4227 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4228 mctx->abkref_ents * 2);
4229 if (BE (new_entry == NULL, 0))
4231 re_free (mctx->bkref_ents);
4232 return REG_ESPACE;
4234 mctx->bkref_ents = new_entry;
4235 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4236 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4237 mctx->abkref_ents *= 2;
4239 if (mctx->nbkref_ents > 0
4240 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4241 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4243 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4244 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4245 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4246 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4248 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4249 If bit N is clear, means that this entry won't epsilon-transition to
4250 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4251 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4252 such node.
4254 A backreference does not epsilon-transition unless it is empty, so set
4255 to all zeros if FROM != TO. */
4256 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4257 = (from == to ? ~0 : 0);
4259 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4260 if (mctx->max_mb_elem_len < to - from)
4261 mctx->max_mb_elem_len = to - from;
4262 return REG_NOERROR;
4265 /* Search for the first entry which has the same str_idx, or -1 if none is
4266 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4268 static int
4269 internal_function
4270 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4272 int left, right, mid, last;
4273 last = right = mctx->nbkref_ents;
4274 for (left = 0; left < right;)
4276 mid = (left + right) / 2;
4277 if (mctx->bkref_ents[mid].str_idx < str_idx)
4278 left = mid + 1;
4279 else
4280 right = mid;
4282 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4283 return left;
4284 else
4285 return -1;
4288 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4289 at STR_IDX. */
4291 static reg_errcode_t
4292 internal_function __attribute_warn_unused_result__
4293 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4295 #ifdef DEBUG
4296 assert (mctx->sub_tops != NULL);
4297 assert (mctx->asub_tops > 0);
4298 #endif
4299 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4301 int new_asub_tops = mctx->asub_tops * 2;
4302 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4303 re_sub_match_top_t *,
4304 new_asub_tops);
4305 if (BE (new_array == NULL, 0))
4306 return REG_ESPACE;
4307 mctx->sub_tops = new_array;
4308 mctx->asub_tops = new_asub_tops;
4310 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4311 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4312 return REG_ESPACE;
4313 mctx->sub_tops[mctx->nsub_tops]->node = node;
4314 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4315 return REG_NOERROR;
4318 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4319 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4321 static re_sub_match_last_t *
4322 internal_function
4323 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4325 re_sub_match_last_t *new_entry;
4326 if (BE (subtop->nlasts == subtop->alasts, 0))
4328 int new_alasts = 2 * subtop->alasts + 1;
4329 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4330 re_sub_match_last_t *,
4331 new_alasts);
4332 if (BE (new_array == NULL, 0))
4333 return NULL;
4334 subtop->lasts = new_array;
4335 subtop->alasts = new_alasts;
4337 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4338 if (BE (new_entry != NULL, 1))
4340 subtop->lasts[subtop->nlasts] = new_entry;
4341 new_entry->node = node;
4342 new_entry->str_idx = str_idx;
4343 ++subtop->nlasts;
4345 return new_entry;
4348 static void
4349 internal_function
4350 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4351 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4353 sctx->sifted_states = sifted_sts;
4354 sctx->limited_states = limited_sts;
4355 sctx->last_node = last_node;
4356 sctx->last_str_idx = last_str_idx;
4357 re_node_set_init_empty (&sctx->limits);