strerror_r-posix: Fixes for MSVC 14.
[gnulib.git] / lib / regexec.c
blobef52b243ad74710c3db350eaefbf20d8dd485a2e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
26 internal_function;
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
28 internal_function;
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
33 internal_function;
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
36 Idx last_str_idx)
37 internal_function;
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, bool ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 bool ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
59 Idx *p_match_first) internal_function;
60 static Idx check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, Idx idx)
62 internal_function;
63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, Idx cur_node,
65 Idx cur_idx, Idx nmatch) internal_function;
66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 Idx str_idx, Idx dest_node, Idx nregs,
68 regmatch_t *regs,
69 re_node_set *eps_via_nodes)
70 internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 bool fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76 internal_function;
78 #ifdef RE_ENABLE_I18N
79 static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 Idx node_idx, Idx str_idx, Idx max_str_idx)
82 internal_function;
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
86 internal_function;
87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, Idx str_idx,
89 re_node_set *cur_dest)
90 internal_function;
91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 Idx str_idx,
94 re_node_set *dest_nodes)
95 internal_function;
96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
99 internal_function;
100 static bool check_dst_limits (const re_match_context_t *mctx,
101 const re_node_set *limits,
102 Idx dst_node, Idx dst_idx, Idx src_node,
103 Idx src_idx) internal_function;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, Idx subexp_idx,
106 Idx from_node, Idx bkref_idx)
107 internal_function;
108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 Idx limit, Idx subexp_idx,
110 Idx node, Idx str_idx,
111 Idx bkref_idx) internal_function;
112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
115 re_node_set *limits,
116 struct re_backref_cache_entry *bkref_ents,
117 Idx str_idx) internal_function;
118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 Idx str_idx, const re_node_set *candidates)
121 internal_function;
122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **dst,
124 re_dfastate_t **src, Idx num)
125 internal_function;
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
134 internal_function;
135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 Idx str_idx) internal_function;
138 #if 0
139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
142 internal_function;
143 #endif
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
147 internal_function;
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
151 internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 Idx bkref_node, Idx bkref_str_idx)
154 internal_function;
155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 Idx bkref_node, Idx bkref_str)
159 internal_function;
160 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 Idx subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, Idx top_node,
164 Idx top_str, Idx last_node, Idx last_str,
165 int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 Idx str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
170 internal_function;
171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 Idx ex_subexp, int type)
174 internal_function;
175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 Idx target, Idx ex_subexp,
178 int type) internal_function;
179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, Idx cur_str,
181 Idx subexp_num, int type)
182 internal_function;
183 static bool build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
187 const re_string_t *input, Idx idx)
188 internal_function;
189 # ifdef _LIBC
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 size_t name_len)
192 internal_function;
193 # endif /* _LIBC */
194 #endif /* RE_ENABLE_I18N */
195 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199 static bool check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, Idx idx)
201 internal_function;
202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203 internal_function;
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies "execution flags" which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
223 size_t nmatch, regmatch_t pmatch[], int eflags)
225 reg_errcode_t err;
226 Idx start, length;
227 re_dfa_t *dfa = preg->buffer;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
237 else
239 start = 0;
240 length = strlen (string);
243 lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length,
249 length, nmatch, pmatch, eflags);
250 lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
254 #ifdef _LIBC
255 # include <shlib-compat.h>
256 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec) __compat_regexec;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t *_Restrict_ preg,
264 const char *_Restrict_ string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
270 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271 # endif
272 #endif
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stored in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
303 regoff_t
304 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
305 Idx start, struct re_registers *regs)
307 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
309 #ifdef _LIBC
310 weak_alias (__re_match, re_match)
311 #endif
313 regoff_t
314 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
315 Idx start, regoff_t range, struct re_registers *regs)
317 return re_search_stub (bufp, string, length, start, range, length, regs,
318 false);
320 #ifdef _LIBC
321 weak_alias (__re_search, re_search)
322 #endif
324 regoff_t
325 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
326 const char *string2, Idx length2, Idx start,
327 struct re_registers *regs, Idx stop)
329 return re_search_2_stub (bufp, string1, length1, string2, length2,
330 start, 0, regs, stop, true);
332 #ifdef _LIBC
333 weak_alias (__re_match_2, re_match_2)
334 #endif
336 regoff_t
337 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
338 const char *string2, Idx length2, Idx start, regoff_t range,
339 struct re_registers *regs, Idx stop)
341 return re_search_2_stub (bufp, string1, length1, string2, length2,
342 start, range, regs, stop, false);
344 #ifdef _LIBC
345 weak_alias (__re_search_2, re_search_2)
346 #endif
348 static regoff_t
349 internal_function
350 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
351 Idx length1, const char *string2, Idx length2, Idx start,
352 regoff_t range, struct re_registers *regs,
353 Idx stop, bool ret_len)
355 const char *str;
356 regoff_t rval;
357 Idx len;
358 char *s = NULL;
360 if (BE ((length1 < 0 || length2 < 0 || stop < 0
361 || INT_ADD_WRAPV (length1, length2, &len)),
363 return -2;
365 /* Concatenate the strings. */
366 if (length2 > 0)
367 if (length1 > 0)
369 s = re_malloc (char, len);
371 if (BE (s == NULL, 0))
372 return -2;
373 #ifdef _LIBC
374 memcpy (__mempcpy (s, string1, length1), string2, length2);
375 #else
376 memcpy (s, string1, length1);
377 memcpy (s + length1, string2, length2);
378 #endif
379 str = s;
381 else
382 str = string2;
383 else
384 str = string1;
386 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
387 ret_len);
388 re_free (s);
389 return rval;
392 /* The parameters have the same meaning as those of re_search.
393 Additional parameters:
394 If RET_LEN is true the length of the match is returned (re_match style);
395 otherwise the position of the match is returned. */
397 static regoff_t
398 internal_function
399 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
400 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
401 bool ret_len)
403 reg_errcode_t result;
404 regmatch_t *pmatch;
405 Idx nregs;
406 regoff_t rval;
407 int eflags = 0;
408 re_dfa_t *dfa = bufp->buffer;
409 Idx last_start = start + range;
411 /* Check for out-of-range. */
412 if (BE (start < 0 || start > length, 0))
413 return -1;
414 if (BE (length < last_start || (0 <= range && last_start < start), 0))
415 last_start = length;
416 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
417 last_start = 0;
419 lock_lock (dfa->lock);
421 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
422 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
424 /* Compile fastmap if we haven't yet. */
425 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
426 re_compile_fastmap (bufp);
428 if (BE (bufp->no_sub, 0))
429 regs = NULL;
431 /* We need at least 1 register. */
432 if (regs == NULL)
433 nregs = 1;
434 else if (BE (bufp->regs_allocated == REGS_FIXED
435 && regs->num_regs <= bufp->re_nsub, 0))
437 nregs = regs->num_regs;
438 if (BE (nregs < 1, 0))
440 /* Nothing can be copied to regs. */
441 regs = NULL;
442 nregs = 1;
445 else
446 nregs = bufp->re_nsub + 1;
447 pmatch = re_malloc (regmatch_t, nregs);
448 if (BE (pmatch == NULL, 0))
450 rval = -2;
451 goto out;
454 result = re_search_internal (bufp, string, length, start, last_start, stop,
455 nregs, pmatch, eflags);
457 rval = 0;
459 /* I hope we needn't fill their regs with -1's when no match was found. */
460 if (result != REG_NOERROR)
461 rval = result == REG_NOMATCH ? -1 : -2;
462 else if (regs != NULL)
464 /* If caller wants register contents data back, copy them. */
465 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
466 bufp->regs_allocated);
467 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
468 rval = -2;
471 if (BE (rval == 0, 1))
473 if (ret_len)
475 assert (pmatch[0].rm_so == start);
476 rval = pmatch[0].rm_eo - start;
478 else
479 rval = pmatch[0].rm_so;
481 re_free (pmatch);
482 out:
483 lock_unlock (dfa->lock);
484 return rval;
487 static unsigned
488 internal_function
489 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
490 int regs_allocated)
492 int rval = REGS_REALLOCATE;
493 Idx i;
494 Idx need_regs = nregs + 1;
495 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
496 uses. */
498 /* Have the register data arrays been allocated? */
499 if (regs_allocated == REGS_UNALLOCATED)
500 { /* No. So allocate them with malloc. */
501 regs->start = re_malloc (regoff_t, need_regs);
502 if (BE (regs->start == NULL, 0))
503 return REGS_UNALLOCATED;
504 regs->end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->end == NULL, 0))
507 re_free (regs->start);
508 return REGS_UNALLOCATED;
510 regs->num_regs = need_regs;
512 else if (regs_allocated == REGS_REALLOCATE)
513 { /* Yes. If we need more elements than were already
514 allocated, reallocate them. If we need fewer, just
515 leave it alone. */
516 if (BE (need_regs > regs->num_regs, 0))
518 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
519 regoff_t *new_end;
520 if (BE (new_start == NULL, 0))
521 return REGS_UNALLOCATED;
522 new_end = re_realloc (regs->end, regoff_t, need_regs);
523 if (BE (new_end == NULL, 0))
525 re_free (new_start);
526 return REGS_UNALLOCATED;
528 regs->start = new_start;
529 regs->end = new_end;
530 regs->num_regs = need_regs;
533 else
535 assert (regs_allocated == REGS_FIXED);
536 /* This function may not be called with REGS_FIXED and nregs too big. */
537 assert (regs->num_regs >= nregs);
538 rval = REGS_FIXED;
541 /* Copy the regs. */
542 for (i = 0; i < nregs; ++i)
544 regs->start[i] = pmatch[i].rm_so;
545 regs->end[i] = pmatch[i].rm_eo;
547 for ( ; i < regs->num_regs; ++i)
548 regs->start[i] = regs->end[i] = -1;
550 return rval;
553 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
554 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
555 this memory for recording register information. STARTS and ENDS
556 must be allocated using the malloc library routine, and must each
557 be at least NUM_REGS * sizeof (regoff_t) bytes long.
559 If NUM_REGS == 0, then subsequent matches should allocate their own
560 register data.
562 Unless this function is called, the first search or match using
563 PATTERN_BUFFER will allocate its own register data, without
564 freeing the old data. */
566 void
567 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
568 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
570 if (num_regs)
572 bufp->regs_allocated = REGS_REALLOCATE;
573 regs->num_regs = num_regs;
574 regs->start = starts;
575 regs->end = ends;
577 else
579 bufp->regs_allocated = REGS_UNALLOCATED;
580 regs->num_regs = 0;
581 regs->start = regs->end = NULL;
584 #ifdef _LIBC
585 weak_alias (__re_set_registers, re_set_registers)
586 #endif
588 /* Entry points compatible with 4.2 BSD regex library. We don't define
589 them unless specifically requested. */
591 #if defined _REGEX_RE_COMP || defined _LIBC
593 # ifdef _LIBC
594 weak_function
595 # endif
596 re_exec (const char *s)
598 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
600 #endif /* _REGEX_RE_COMP */
602 /* Internal entry point. */
604 /* Searches for a compiled pattern PREG in the string STRING, whose
605 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
606 meaning as with regexec. LAST_START is START + RANGE, where
607 START and RANGE have the same meaning as with re_search.
608 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
609 otherwise return the error code.
610 Note: We assume front end functions already check ranges.
611 (0 <= LAST_START && LAST_START <= LENGTH) */
613 static reg_errcode_t
614 __attribute_warn_unused_result__ internal_function
615 re_search_internal (const regex_t *preg, const char *string, Idx length,
616 Idx start, Idx last_start, Idx stop, size_t nmatch,
617 regmatch_t pmatch[], int eflags)
619 reg_errcode_t err;
620 const re_dfa_t *dfa = preg->buffer;
621 Idx left_lim, right_lim;
622 int incr;
623 bool fl_longest_match;
624 int match_kind;
625 Idx match_first;
626 Idx match_last = -1;
627 Idx extra_nmatch;
628 bool sb;
629 int ch;
630 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
631 re_match_context_t mctx = { .dfa = dfa };
632 #else
633 re_match_context_t mctx;
634 #endif
635 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
636 && start != last_start && !preg->can_be_null)
637 ? preg->fastmap : NULL);
638 RE_TRANSLATE_TYPE t = preg->translate;
640 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
641 memset (&mctx, '\0', sizeof (re_match_context_t));
642 mctx.dfa = dfa;
643 #endif
645 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
646 nmatch -= extra_nmatch;
648 /* Check if the DFA haven't been compiled. */
649 if (BE (preg->used == 0 || dfa->init_state == NULL
650 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
651 || dfa->init_state_begbuf == NULL, 0))
652 return REG_NOMATCH;
654 #ifdef DEBUG
655 /* We assume front-end functions already check them. */
656 assert (0 <= last_start && last_start <= length);
657 #endif
659 /* If initial states with non-begbuf contexts have no elements,
660 the regex must be anchored. If preg->newline_anchor is set,
661 we'll never use init_state_nl, so do not check it. */
662 if (dfa->init_state->nodes.nelem == 0
663 && dfa->init_state_word->nodes.nelem == 0
664 && (dfa->init_state_nl->nodes.nelem == 0
665 || !preg->newline_anchor))
667 if (start != 0 && last_start != 0)
668 return REG_NOMATCH;
669 start = last_start = 0;
672 /* We must check the longest matching, if nmatch > 0. */
673 fl_longest_match = (nmatch != 0 || dfa->nbackref);
675 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
676 preg->translate, (preg->syntax & RE_ICASE) != 0,
677 dfa);
678 if (BE (err != REG_NOERROR, 0))
679 goto free_return;
680 mctx.input.stop = stop;
681 mctx.input.raw_stop = stop;
682 mctx.input.newline_anchor = preg->newline_anchor;
684 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
685 if (BE (err != REG_NOERROR, 0))
686 goto free_return;
688 /* We will log all the DFA states through which the dfa pass,
689 if nmatch > 1, or this dfa has "multibyte node", which is a
690 back-reference or a node which can accept multibyte character or
691 multi character collating element. */
692 if (nmatch > 1 || dfa->has_mb_node)
694 /* Avoid overflow. */
695 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
696 <= mctx.input.bufs_len), 0))
698 err = REG_ESPACE;
699 goto free_return;
702 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
703 if (BE (mctx.state_log == NULL, 0))
705 err = REG_ESPACE;
706 goto free_return;
709 else
710 mctx.state_log = NULL;
712 match_first = start;
713 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
714 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
716 /* Check incrementally whether the input string matches. */
717 incr = (last_start < start) ? -1 : 1;
718 left_lim = (last_start < start) ? last_start : start;
719 right_lim = (last_start < start) ? start : last_start;
720 sb = dfa->mb_cur_max == 1;
721 match_kind =
722 (fastmap
723 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
724 | (start <= last_start ? 2 : 0)
725 | (t != NULL ? 1 : 0))
726 : 8);
728 for (;; match_first += incr)
730 err = REG_NOMATCH;
731 if (match_first < left_lim || right_lim < match_first)
732 goto free_return;
734 /* Advance as rapidly as possible through the string, until we
735 find a plausible place to start matching. This may be done
736 with varying efficiency, so there are various possibilities:
737 only the most common of them are specialized, in order to
738 save on code size. We use a switch statement for speed. */
739 switch (match_kind)
741 case 8:
742 /* No fastmap. */
743 break;
745 case 7:
746 /* Fastmap with single-byte translation, match forward. */
747 while (BE (match_first < right_lim, 1)
748 && !fastmap[t[(unsigned char) string[match_first]]])
749 ++match_first;
750 goto forward_match_found_start_or_reached_end;
752 case 6:
753 /* Fastmap without translation, match forward. */
754 while (BE (match_first < right_lim, 1)
755 && !fastmap[(unsigned char) string[match_first]])
756 ++match_first;
758 forward_match_found_start_or_reached_end:
759 if (BE (match_first == right_lim, 0))
761 ch = match_first >= length
762 ? 0 : (unsigned char) string[match_first];
763 if (!fastmap[t ? t[ch] : ch])
764 goto free_return;
766 break;
768 case 4:
769 case 5:
770 /* Fastmap without multi-byte translation, match backwards. */
771 while (match_first >= left_lim)
773 ch = match_first >= length
774 ? 0 : (unsigned char) string[match_first];
775 if (fastmap[t ? t[ch] : ch])
776 break;
777 --match_first;
779 if (match_first < left_lim)
780 goto free_return;
781 break;
783 default:
784 /* In this case, we can't determine easily the current byte,
785 since it might be a component byte of a multibyte
786 character. Then we use the constructed buffer instead. */
787 for (;;)
789 /* If MATCH_FIRST is out of the valid range, reconstruct the
790 buffers. */
791 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
792 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
794 err = re_string_reconstruct (&mctx.input, match_first,
795 eflags);
796 if (BE (err != REG_NOERROR, 0))
797 goto free_return;
799 offset = match_first - mctx.input.raw_mbs_idx;
801 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
802 Note that MATCH_FIRST must not be smaller than 0. */
803 ch = (match_first >= length
804 ? 0 : re_string_byte_at (&mctx.input, offset));
805 if (fastmap[ch])
806 break;
807 match_first += incr;
808 if (match_first < left_lim || match_first > right_lim)
810 err = REG_NOMATCH;
811 goto free_return;
814 break;
817 /* Reconstruct the buffers so that the matcher can assume that
818 the matching starts from the beginning of the buffer. */
819 err = re_string_reconstruct (&mctx.input, match_first, eflags);
820 if (BE (err != REG_NOERROR, 0))
821 goto free_return;
823 #ifdef RE_ENABLE_I18N
824 /* Don't consider this char as a possible match start if it part,
825 yet isn't the head, of a multibyte character. */
826 if (!sb && !re_string_first_byte (&mctx.input, 0))
827 continue;
828 #endif
830 /* It seems to be appropriate one, then use the matcher. */
831 /* We assume that the matching starts from 0. */
832 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
833 match_last = check_matching (&mctx, fl_longest_match,
834 start <= last_start ? &match_first : NULL);
835 if (match_last != -1)
837 if (BE (match_last == -2, 0))
839 err = REG_ESPACE;
840 goto free_return;
842 else
844 mctx.match_last = match_last;
845 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
847 re_dfastate_t *pstate = mctx.state_log[match_last];
848 mctx.last_node = check_halt_state_context (&mctx, pstate,
849 match_last);
851 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
852 || dfa->nbackref)
854 err = prune_impossible_nodes (&mctx);
855 if (err == REG_NOERROR)
856 break;
857 if (BE (err != REG_NOMATCH, 0))
858 goto free_return;
859 match_last = -1;
861 else
862 break; /* We found a match. */
866 match_ctx_clean (&mctx);
869 #ifdef DEBUG
870 assert (match_last != -1);
871 assert (err == REG_NOERROR);
872 #endif
874 /* Set pmatch[] if we need. */
875 if (nmatch > 0)
877 Idx reg_idx;
879 /* Initialize registers. */
880 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
881 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
883 /* Set the points where matching start/end. */
884 pmatch[0].rm_so = 0;
885 pmatch[0].rm_eo = mctx.match_last;
886 /* FIXME: This function should fail if mctx.match_last exceeds
887 the maximum possible regoff_t value. We need a new error
888 code REG_OVERFLOW. */
890 if (!preg->no_sub && nmatch > 1)
892 err = set_regs (preg, &mctx, nmatch, pmatch,
893 dfa->has_plural_match && dfa->nbackref > 0);
894 if (BE (err != REG_NOERROR, 0))
895 goto free_return;
898 /* At last, add the offset to each register, since we slid
899 the buffers so that we could assume that the matching starts
900 from 0. */
901 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
902 if (pmatch[reg_idx].rm_so != -1)
904 #ifdef RE_ENABLE_I18N
905 if (BE (mctx.input.offsets_needed != 0, 0))
907 pmatch[reg_idx].rm_so =
908 (pmatch[reg_idx].rm_so == mctx.input.valid_len
909 ? mctx.input.valid_raw_len
910 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
911 pmatch[reg_idx].rm_eo =
912 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
913 ? mctx.input.valid_raw_len
914 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
916 #else
917 assert (mctx.input.offsets_needed == 0);
918 #endif
919 pmatch[reg_idx].rm_so += match_first;
920 pmatch[reg_idx].rm_eo += match_first;
922 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
924 pmatch[nmatch + reg_idx].rm_so = -1;
925 pmatch[nmatch + reg_idx].rm_eo = -1;
928 if (dfa->subexp_map)
929 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
930 if (dfa->subexp_map[reg_idx] != reg_idx)
932 pmatch[reg_idx + 1].rm_so
933 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
934 pmatch[reg_idx + 1].rm_eo
935 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
939 free_return:
940 re_free (mctx.state_log);
941 if (dfa->nbackref)
942 match_ctx_free (&mctx);
943 re_string_destruct (&mctx.input);
944 return err;
947 static reg_errcode_t
948 internal_function __attribute_warn_unused_result__
949 prune_impossible_nodes (re_match_context_t *mctx)
951 const re_dfa_t *const dfa = mctx->dfa;
952 Idx halt_node, match_last;
953 reg_errcode_t ret;
954 re_dfastate_t **sifted_states;
955 re_dfastate_t **lim_states = NULL;
956 re_sift_context_t sctx;
957 #ifdef DEBUG
958 assert (mctx->state_log != NULL);
959 #endif
960 match_last = mctx->match_last;
961 halt_node = mctx->last_node;
963 /* Avoid overflow. */
964 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
965 return REG_ESPACE;
967 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
968 if (BE (sifted_states == NULL, 0))
970 ret = REG_ESPACE;
971 goto free_return;
973 if (dfa->nbackref)
975 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
976 if (BE (lim_states == NULL, 0))
978 ret = REG_ESPACE;
979 goto free_return;
981 while (1)
983 memset (lim_states, '\0',
984 sizeof (re_dfastate_t *) * (match_last + 1));
985 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
986 match_last);
987 ret = sift_states_backward (mctx, &sctx);
988 re_node_set_free (&sctx.limits);
989 if (BE (ret != REG_NOERROR, 0))
990 goto free_return;
991 if (sifted_states[0] != NULL || lim_states[0] != NULL)
992 break;
995 --match_last;
996 if (match_last < 0)
998 ret = REG_NOMATCH;
999 goto free_return;
1001 } while (mctx->state_log[match_last] == NULL
1002 || !mctx->state_log[match_last]->halt);
1003 halt_node = check_halt_state_context (mctx,
1004 mctx->state_log[match_last],
1005 match_last);
1007 ret = merge_state_array (dfa, sifted_states, lim_states,
1008 match_last + 1);
1009 re_free (lim_states);
1010 lim_states = NULL;
1011 if (BE (ret != REG_NOERROR, 0))
1012 goto free_return;
1014 else
1016 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1017 ret = sift_states_backward (mctx, &sctx);
1018 re_node_set_free (&sctx.limits);
1019 if (BE (ret != REG_NOERROR, 0))
1020 goto free_return;
1021 if (sifted_states[0] == NULL)
1023 ret = REG_NOMATCH;
1024 goto free_return;
1027 re_free (mctx->state_log);
1028 mctx->state_log = sifted_states;
1029 sifted_states = NULL;
1030 mctx->last_node = halt_node;
1031 mctx->match_last = match_last;
1032 ret = REG_NOERROR;
1033 free_return:
1034 re_free (sifted_states);
1035 re_free (lim_states);
1036 return ret;
1039 /* Acquire an initial state and return it.
1040 We must select appropriate initial state depending on the context,
1041 since initial states may have constraints like "\<", "^", etc.. */
1043 static inline re_dfastate_t *
1044 __attribute__ ((always_inline)) internal_function
1045 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1046 Idx idx)
1048 const re_dfa_t *const dfa = mctx->dfa;
1049 if (dfa->init_state->has_constraint)
1051 unsigned int context;
1052 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1053 if (IS_WORD_CONTEXT (context))
1054 return dfa->init_state_word;
1055 else if (IS_ORDINARY_CONTEXT (context))
1056 return dfa->init_state;
1057 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1058 return dfa->init_state_begbuf;
1059 else if (IS_NEWLINE_CONTEXT (context))
1060 return dfa->init_state_nl;
1061 else if (IS_BEGBUF_CONTEXT (context))
1063 /* It is relatively rare case, then calculate on demand. */
1064 return re_acquire_state_context (err, dfa,
1065 dfa->init_state->entrance_nodes,
1066 context);
1068 else
1069 /* Must not happen? */
1070 return dfa->init_state;
1072 else
1073 return dfa->init_state;
1076 /* Check whether the regular expression match input string INPUT or not,
1077 and return the index where the matching end. Return -1 if
1078 there is no match, and return -2 in case of an error.
1079 FL_LONGEST_MATCH means we want the POSIX longest matching.
1080 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1081 next place where we may want to try matching.
1082 Note that the matcher assumes that the matching starts from the current
1083 index of the buffer. */
1085 static Idx
1086 internal_function __attribute_warn_unused_result__
1087 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1088 Idx *p_match_first)
1090 const re_dfa_t *const dfa = mctx->dfa;
1091 reg_errcode_t err;
1092 Idx match = 0;
1093 Idx match_last = -1;
1094 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1095 re_dfastate_t *cur_state;
1096 bool at_init_state = p_match_first != NULL;
1097 Idx next_start_idx = cur_str_idx;
1099 err = REG_NOERROR;
1100 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1101 /* An initial state must not be NULL (invalid). */
1102 if (BE (cur_state == NULL, 0))
1104 assert (err == REG_ESPACE);
1105 return -2;
1108 if (mctx->state_log != NULL)
1110 mctx->state_log[cur_str_idx] = cur_state;
1112 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1113 later. E.g. Processing back references. */
1114 if (BE (dfa->nbackref, 0))
1116 at_init_state = false;
1117 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1118 if (BE (err != REG_NOERROR, 0))
1119 return err;
1121 if (cur_state->has_backref)
1123 err = transit_state_bkref (mctx, &cur_state->nodes);
1124 if (BE (err != REG_NOERROR, 0))
1125 return err;
1130 /* If the RE accepts NULL string. */
1131 if (BE (cur_state->halt, 0))
1133 if (!cur_state->has_constraint
1134 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1136 if (!fl_longest_match)
1137 return cur_str_idx;
1138 else
1140 match_last = cur_str_idx;
1141 match = 1;
1146 while (!re_string_eoi (&mctx->input))
1148 re_dfastate_t *old_state = cur_state;
1149 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1151 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1152 && mctx->input.bufs_len < mctx->input.len)
1153 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1154 && mctx->input.valid_len < mctx->input.len))
1156 err = extend_buffers (mctx, next_char_idx + 1);
1157 if (BE (err != REG_NOERROR, 0))
1159 assert (err == REG_ESPACE);
1160 return -2;
1164 cur_state = transit_state (&err, mctx, cur_state);
1165 if (mctx->state_log != NULL)
1166 cur_state = merge_state_with_log (&err, mctx, cur_state);
1168 if (cur_state == NULL)
1170 /* Reached the invalid state or an error. Try to recover a valid
1171 state using the state log, if available and if we have not
1172 already found a valid (even if not the longest) match. */
1173 if (BE (err != REG_NOERROR, 0))
1174 return -2;
1176 if (mctx->state_log == NULL
1177 || (match && !fl_longest_match)
1178 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1179 break;
1182 if (BE (at_init_state, 0))
1184 if (old_state == cur_state)
1185 next_start_idx = next_char_idx;
1186 else
1187 at_init_state = false;
1190 if (cur_state->halt)
1192 /* Reached a halt state.
1193 Check the halt state can satisfy the current context. */
1194 if (!cur_state->has_constraint
1195 || check_halt_state_context (mctx, cur_state,
1196 re_string_cur_idx (&mctx->input)))
1198 /* We found an appropriate halt state. */
1199 match_last = re_string_cur_idx (&mctx->input);
1200 match = 1;
1202 /* We found a match, do not modify match_first below. */
1203 p_match_first = NULL;
1204 if (!fl_longest_match)
1205 break;
1210 if (p_match_first)
1211 *p_match_first += next_start_idx;
1213 return match_last;
1216 /* Check NODE match the current context. */
1218 static bool
1219 internal_function
1220 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1222 re_token_type_t type = dfa->nodes[node].type;
1223 unsigned int constraint = dfa->nodes[node].constraint;
1224 if (type != END_OF_RE)
1225 return false;
1226 if (!constraint)
1227 return true;
1228 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1229 return false;
1230 return true;
1233 /* Check the halt state STATE match the current context.
1234 Return 0 if not match, if the node, STATE has, is a halt node and
1235 match the context, return the node. */
1237 static Idx
1238 internal_function
1239 check_halt_state_context (const re_match_context_t *mctx,
1240 const re_dfastate_t *state, Idx idx)
1242 Idx i;
1243 unsigned int context;
1244 #ifdef DEBUG
1245 assert (state->halt);
1246 #endif
1247 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1248 for (i = 0; i < state->nodes.nelem; ++i)
1249 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1250 return state->nodes.elems[i];
1251 return 0;
1254 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1255 corresponding to the DFA).
1256 Return the destination node, and update EPS_VIA_NODES;
1257 return -1 in case of errors. */
1259 static Idx
1260 internal_function
1261 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1262 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1263 struct re_fail_stack_t *fs)
1265 const re_dfa_t *const dfa = mctx->dfa;
1266 Idx i;
1267 bool ok;
1268 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1270 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271 re_node_set *edests = &dfa->edests[node];
1272 Idx dest_node;
1273 ok = re_node_set_insert (eps_via_nodes, node);
1274 if (BE (! ok, 0))
1275 return -2;
1276 /* Pick up a valid destination, or return -1 if none
1277 is found. */
1278 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1280 Idx candidate = edests->elems[i];
1281 if (!re_node_set_contains (cur_nodes, candidate))
1282 continue;
1283 if (dest_node == -1)
1284 dest_node = candidate;
1286 else
1288 /* In order to avoid infinite loop like "(a*)*", return the second
1289 epsilon-transition if the first was already considered. */
1290 if (re_node_set_contains (eps_via_nodes, dest_node))
1291 return candidate;
1293 /* Otherwise, push the second epsilon-transition on the fail stack. */
1294 else if (fs != NULL
1295 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1296 eps_via_nodes))
1297 return -2;
1299 /* We know we are going to exit. */
1300 break;
1303 return dest_node;
1305 else
1307 Idx naccepted = 0;
1308 re_token_type_t type = dfa->nodes[node].type;
1310 #ifdef RE_ENABLE_I18N
1311 if (dfa->nodes[node].accept_mb)
1312 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1313 else
1314 #endif /* RE_ENABLE_I18N */
1315 if (type == OP_BACK_REF)
1317 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1318 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1319 if (fs != NULL)
1321 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1322 return -1;
1323 else if (naccepted)
1325 char *buf = (char *) re_string_get_buffer (&mctx->input);
1326 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1327 naccepted) != 0)
1328 return -1;
1332 if (naccepted == 0)
1334 Idx dest_node;
1335 ok = re_node_set_insert (eps_via_nodes, node);
1336 if (BE (! ok, 0))
1337 return -2;
1338 dest_node = dfa->edests[node].elems[0];
1339 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1340 dest_node))
1341 return dest_node;
1345 if (naccepted != 0
1346 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1348 Idx dest_node = dfa->nexts[node];
1349 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1350 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1351 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1352 dest_node)))
1353 return -1;
1354 re_node_set_empty (eps_via_nodes);
1355 return dest_node;
1358 return -1;
1361 static reg_errcode_t
1362 internal_function __attribute_warn_unused_result__
1363 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1364 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1366 reg_errcode_t err;
1367 Idx num = fs->num++;
1368 if (fs->num == fs->alloc)
1370 struct re_fail_stack_ent_t *new_array;
1371 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1372 * fs->alloc * 2));
1373 if (new_array == NULL)
1374 return REG_ESPACE;
1375 fs->alloc *= 2;
1376 fs->stack = new_array;
1378 fs->stack[num].idx = str_idx;
1379 fs->stack[num].node = dest_node;
1380 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1381 if (fs->stack[num].regs == NULL)
1382 return REG_ESPACE;
1383 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1384 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1385 return err;
1388 static Idx
1389 internal_function
1390 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1391 regmatch_t *regs, re_node_set *eps_via_nodes)
1393 Idx num = --fs->num;
1394 assert (num >= 0);
1395 *pidx = fs->stack[num].idx;
1396 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1397 re_node_set_free (eps_via_nodes);
1398 re_free (fs->stack[num].regs);
1399 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1400 return fs->stack[num].node;
1403 /* Set the positions where the subexpressions are starts/ends to registers
1404 PMATCH.
1405 Note: We assume that pmatch[0] is already set, and
1406 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1408 static reg_errcode_t
1409 internal_function __attribute_warn_unused_result__
1410 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1411 regmatch_t *pmatch, bool fl_backtrack)
1413 const re_dfa_t *dfa = preg->buffer;
1414 Idx idx, cur_node;
1415 re_node_set eps_via_nodes;
1416 struct re_fail_stack_t *fs;
1417 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1418 regmatch_t *prev_idx_match;
1419 bool prev_idx_match_malloced = false;
1421 #ifdef DEBUG
1422 assert (nmatch > 1);
1423 assert (mctx->state_log != NULL);
1424 #endif
1425 if (fl_backtrack)
1427 fs = &fs_body;
1428 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1429 if (fs->stack == NULL)
1430 return REG_ESPACE;
1432 else
1433 fs = NULL;
1435 cur_node = dfa->init_node;
1436 re_node_set_init_empty (&eps_via_nodes);
1438 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440 else
1442 prev_idx_match = re_malloc (regmatch_t, nmatch);
1443 if (prev_idx_match == NULL)
1445 free_fail_stack_return (fs);
1446 return REG_ESPACE;
1448 prev_idx_match_malloced = true;
1450 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458 Idx reg_idx;
1459 if (fs)
1461 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1462 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1463 break;
1464 if (reg_idx == nmatch)
1466 re_node_set_free (&eps_via_nodes);
1467 if (prev_idx_match_malloced)
1468 re_free (prev_idx_match);
1469 return free_fail_stack_return (fs);
1471 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1472 &eps_via_nodes);
1474 else
1476 re_node_set_free (&eps_via_nodes);
1477 if (prev_idx_match_malloced)
1478 re_free (prev_idx_match);
1479 return REG_NOERROR;
1483 /* Proceed to next node. */
1484 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1485 &eps_via_nodes, fs);
1487 if (BE (cur_node < 0, 0))
1489 if (BE (cur_node == -2, 0))
1491 re_node_set_free (&eps_via_nodes);
1492 if (prev_idx_match_malloced)
1493 re_free (prev_idx_match);
1494 free_fail_stack_return (fs);
1495 return REG_ESPACE;
1497 if (fs)
1498 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1499 &eps_via_nodes);
1500 else
1502 re_node_set_free (&eps_via_nodes);
1503 if (prev_idx_match_malloced)
1504 re_free (prev_idx_match);
1505 return REG_NOMATCH;
1509 re_node_set_free (&eps_via_nodes);
1510 if (prev_idx_match_malloced)
1511 re_free (prev_idx_match);
1512 return free_fail_stack_return (fs);
1515 static reg_errcode_t
1516 internal_function
1517 free_fail_stack_return (struct re_fail_stack_t *fs)
1519 if (fs)
1521 Idx fs_idx;
1522 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1525 re_free (fs->stack[fs_idx].regs);
1527 re_free (fs->stack);
1529 return REG_NOERROR;
1532 static void
1533 internal_function
1534 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1535 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1537 int type = dfa->nodes[cur_node].type;
1538 if (type == OP_OPEN_SUBEXP)
1540 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542 /* We are at the first node of this sub expression. */
1543 if (reg_num < nmatch)
1545 pmatch[reg_num].rm_so = cur_idx;
1546 pmatch[reg_num].rm_eo = -1;
1549 else if (type == OP_CLOSE_SUBEXP)
1551 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1552 if (reg_num < nmatch)
1554 /* We are at the last node of this sub expression. */
1555 if (pmatch[reg_num].rm_so < cur_idx)
1557 pmatch[reg_num].rm_eo = cur_idx;
1558 /* This is a non-empty match or we are not inside an optional
1559 subexpression. Accept this right away. */
1560 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562 else
1564 if (dfa->nodes[cur_node].opt_subexp
1565 && prev_idx_match[reg_num].rm_so != -1)
1566 /* We transited through an empty match for an optional
1567 subexpression, like (a?)*, and this is not the subexp's
1568 first match. Copy back the old content of the registers
1569 so that matches of an inner subexpression are undone as
1570 well, like in ((a?))*. */
1571 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1572 else
1573 /* We completed a subexpression, but it may be part of
1574 an optional one, so do not update PREV_IDX_MATCH. */
1575 pmatch[reg_num].rm_eo = cur_idx;
1581 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1582 and sift the nodes in each states according to the following rules.
1583 Updated state_log will be wrote to STATE_LOG.
1585 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1586 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1587 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1588 the LAST_NODE, we throw away the node 'a'.
1589 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1590 string 's' and transit to 'b':
1591 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1592 away the node 'a'.
1593 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1594 thrown away, we throw away the node 'a'.
1595 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1596 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1597 node 'a'.
1598 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1599 we throw away the node 'a'. */
1601 #define STATE_NODE_CONTAINS(state,node) \
1602 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604 static reg_errcode_t
1605 internal_function
1606 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608 reg_errcode_t err;
1609 int null_cnt = 0;
1610 Idx str_idx = sctx->last_str_idx;
1611 re_node_set cur_dest;
1613 #ifdef DEBUG
1614 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1615 #endif
1617 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1618 transit to the last_node and the last_node itself. */
1619 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1620 if (BE (err != REG_NOERROR, 0))
1621 return err;
1622 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1623 if (BE (err != REG_NOERROR, 0))
1624 goto free_return;
1626 /* Then check each states in the state_log. */
1627 while (str_idx > 0)
1629 /* Update counters. */
1630 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1631 if (null_cnt > mctx->max_mb_elem_len)
1633 memset (sctx->sifted_states, '\0',
1634 sizeof (re_dfastate_t *) * str_idx);
1635 re_node_set_free (&cur_dest);
1636 return REG_NOERROR;
1638 re_node_set_empty (&cur_dest);
1639 --str_idx;
1641 if (mctx->state_log[str_idx])
1643 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1644 if (BE (err != REG_NOERROR, 0))
1645 goto free_return;
1648 /* Add all the nodes which satisfy the following conditions:
1649 - It can epsilon transit to a node in CUR_DEST.
1650 - It is in CUR_SRC.
1651 And update state_log. */
1652 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1653 if (BE (err != REG_NOERROR, 0))
1654 goto free_return;
1656 err = REG_NOERROR;
1657 free_return:
1658 re_node_set_free (&cur_dest);
1659 return err;
1662 static reg_errcode_t
1663 internal_function __attribute_warn_unused_result__
1664 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1665 Idx str_idx, re_node_set *cur_dest)
1667 const re_dfa_t *const dfa = mctx->dfa;
1668 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1669 Idx i;
1671 /* Then build the next sifted state.
1672 We build the next sifted state on 'cur_dest', and update
1673 'sifted_states[str_idx]' with 'cur_dest'.
1674 Note:
1675 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1676 'cur_src' points the node_set of the old 'state_log[str_idx]'
1677 (with the epsilon nodes pre-filtered out). */
1678 for (i = 0; i < cur_src->nelem; i++)
1680 Idx prev_node = cur_src->elems[i];
1681 int naccepted = 0;
1682 bool ok;
1684 #ifdef DEBUG
1685 re_token_type_t type = dfa->nodes[prev_node].type;
1686 assert (!IS_EPSILON_NODE (type));
1687 #endif
1688 #ifdef RE_ENABLE_I18N
1689 /* If the node may accept "multi byte". */
1690 if (dfa->nodes[prev_node].accept_mb)
1691 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1692 str_idx, sctx->last_str_idx);
1693 #endif /* RE_ENABLE_I18N */
1695 /* We don't check backreferences here.
1696 See update_cur_sifted_state(). */
1697 if (!naccepted
1698 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1699 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1700 dfa->nexts[prev_node]))
1701 naccepted = 1;
1703 if (naccepted == 0)
1704 continue;
1706 if (sctx->limits.nelem)
1708 Idx to_idx = str_idx + naccepted;
1709 if (check_dst_limits (mctx, &sctx->limits,
1710 dfa->nexts[prev_node], to_idx,
1711 prev_node, str_idx))
1712 continue;
1714 ok = re_node_set_insert (cur_dest, prev_node);
1715 if (BE (! ok, 0))
1716 return REG_ESPACE;
1719 return REG_NOERROR;
1722 /* Helper functions. */
1724 static reg_errcode_t
1725 internal_function
1726 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1728 Idx top = mctx->state_log_top;
1730 if ((next_state_log_idx >= mctx->input.bufs_len
1731 && mctx->input.bufs_len < mctx->input.len)
1732 || (next_state_log_idx >= mctx->input.valid_len
1733 && mctx->input.valid_len < mctx->input.len))
1735 reg_errcode_t err;
1736 err = extend_buffers (mctx, next_state_log_idx + 1);
1737 if (BE (err != REG_NOERROR, 0))
1738 return err;
1741 if (top < next_state_log_idx)
1743 memset (mctx->state_log + top + 1, '\0',
1744 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745 mctx->state_log_top = next_state_log_idx;
1747 return REG_NOERROR;
1750 static reg_errcode_t
1751 internal_function
1752 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753 re_dfastate_t **src, Idx num)
1755 Idx st_idx;
1756 reg_errcode_t err;
1757 for (st_idx = 0; st_idx < num; ++st_idx)
1759 if (dst[st_idx] == NULL)
1760 dst[st_idx] = src[st_idx];
1761 else if (src[st_idx] != NULL)
1763 re_node_set merged_set;
1764 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765 &src[st_idx]->nodes);
1766 if (BE (err != REG_NOERROR, 0))
1767 return err;
1768 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769 re_node_set_free (&merged_set);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1774 return REG_NOERROR;
1777 static reg_errcode_t
1778 internal_function
1779 update_cur_sifted_state (const re_match_context_t *mctx,
1780 re_sift_context_t *sctx, Idx str_idx,
1781 re_node_set *dest_nodes)
1783 const re_dfa_t *const dfa = mctx->dfa;
1784 reg_errcode_t err = REG_NOERROR;
1785 const re_node_set *candidates;
1786 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787 : &mctx->state_log[str_idx]->nodes);
1789 if (dest_nodes->nelem == 0)
1790 sctx->sifted_states[str_idx] = NULL;
1791 else
1793 if (candidates)
1795 /* At first, add the nodes which can epsilon transit to a node in
1796 DEST_NODE. */
1797 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798 if (BE (err != REG_NOERROR, 0))
1799 return err;
1801 /* Then, check the limitations in the current sift_context. */
1802 if (sctx->limits.nelem)
1804 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805 mctx->bkref_ents, str_idx);
1806 if (BE (err != REG_NOERROR, 0))
1807 return err;
1811 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812 if (BE (err != REG_NOERROR, 0))
1813 return err;
1816 if (candidates && mctx->state_log[str_idx]->has_backref)
1818 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819 if (BE (err != REG_NOERROR, 0))
1820 return err;
1822 return REG_NOERROR;
1825 static reg_errcode_t
1826 internal_function __attribute_warn_unused_result__
1827 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828 const re_node_set *candidates)
1830 reg_errcode_t err = REG_NOERROR;
1831 Idx i;
1833 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834 if (BE (err != REG_NOERROR, 0))
1835 return err;
1837 if (!state->inveclosure.alloc)
1839 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840 if (BE (err != REG_NOERROR, 0))
1841 return REG_ESPACE;
1842 for (i = 0; i < dest_nodes->nelem; i++)
1844 err = re_node_set_merge (&state->inveclosure,
1845 dfa->inveclosures + dest_nodes->elems[i]);
1846 if (BE (err != REG_NOERROR, 0))
1847 return REG_ESPACE;
1850 return re_node_set_add_intersect (dest_nodes, candidates,
1851 &state->inveclosure);
1854 static reg_errcode_t
1855 internal_function
1856 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1857 const re_node_set *candidates)
1859 Idx ecl_idx;
1860 reg_errcode_t err;
1861 re_node_set *inv_eclosure = dfa->inveclosures + node;
1862 re_node_set except_nodes;
1863 re_node_set_init_empty (&except_nodes);
1864 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1866 Idx cur_node = inv_eclosure->elems[ecl_idx];
1867 if (cur_node == node)
1868 continue;
1869 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1871 Idx edst1 = dfa->edests[cur_node].elems[0];
1872 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1873 ? dfa->edests[cur_node].elems[1] : -1);
1874 if ((!re_node_set_contains (inv_eclosure, edst1)
1875 && re_node_set_contains (dest_nodes, edst1))
1876 || (edst2 > 0
1877 && !re_node_set_contains (inv_eclosure, edst2)
1878 && re_node_set_contains (dest_nodes, edst2)))
1880 err = re_node_set_add_intersect (&except_nodes, candidates,
1881 dfa->inveclosures + cur_node);
1882 if (BE (err != REG_NOERROR, 0))
1884 re_node_set_free (&except_nodes);
1885 return err;
1890 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1892 Idx cur_node = inv_eclosure->elems[ecl_idx];
1893 if (!re_node_set_contains (&except_nodes, cur_node))
1895 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896 re_node_set_remove_at (dest_nodes, idx);
1899 re_node_set_free (&except_nodes);
1900 return REG_NOERROR;
1903 static bool
1904 internal_function
1905 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1906 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1908 const re_dfa_t *const dfa = mctx->dfa;
1909 Idx lim_idx, src_pos, dst_pos;
1911 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1915 Idx subexp_idx;
1916 struct re_backref_cache_entry *ent;
1917 ent = mctx->bkref_ents + limits->elems[lim_idx];
1918 subexp_idx = dfa->nodes[ent->node].opr.idx;
1920 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921 subexp_idx, dst_node, dst_idx,
1922 dst_bkref_idx);
1923 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924 subexp_idx, src_node, src_idx,
1925 src_bkref_idx);
1927 /* In case of:
1928 <src> <dst> ( <subexp> )
1929 ( <subexp> ) <src> <dst>
1930 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1931 if (src_pos == dst_pos)
1932 continue; /* This is unrelated limitation. */
1933 else
1934 return true;
1936 return false;
1939 static int
1940 internal_function
1941 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1944 const re_dfa_t *const dfa = mctx->dfa;
1945 const re_node_set *eclosures = dfa->eclosures + from_node;
1946 Idx node_idx;
1948 /* Else, we are on the boundary: examine the nodes on the epsilon
1949 closure. */
1950 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1952 Idx node = eclosures->elems[node_idx];
1953 switch (dfa->nodes[node].type)
1955 case OP_BACK_REF:
1956 if (bkref_idx != -1)
1958 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1961 Idx dst;
1962 int cpos;
1964 if (ent->node != node)
1965 continue;
1967 if (subexp_idx < BITSET_WORD_BITS
1968 && !(ent->eps_reachable_subexps_map
1969 & ((bitset_word_t) 1 << subexp_idx)))
1970 continue;
1972 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1973 OP_CLOSE_SUBEXP cases below. But, if the
1974 destination node is the same node as the source
1975 node, don't recurse because it would cause an
1976 infinite loop: a regex that exhibits this behavior
1977 is ()\1*\1* */
1978 dst = dfa->edests[node].elems[0];
1979 if (dst == from_node)
1981 if (boundaries & 1)
1982 return -1;
1983 else /* if (boundaries & 2) */
1984 return 0;
1987 cpos =
1988 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1989 dst, bkref_idx);
1990 if (cpos == -1 /* && (boundaries & 1) */)
1991 return -1;
1992 if (cpos == 0 && (boundaries & 2))
1993 return 0;
1995 if (subexp_idx < BITSET_WORD_BITS)
1996 ent->eps_reachable_subexps_map
1997 &= ~((bitset_word_t) 1 << subexp_idx);
1999 while (ent++->more);
2001 break;
2003 case OP_OPEN_SUBEXP:
2004 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2005 return -1;
2006 break;
2008 case OP_CLOSE_SUBEXP:
2009 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2010 return 0;
2011 break;
2013 default:
2014 break;
2018 return (boundaries & 2) ? 1 : 0;
2021 static int
2022 internal_function
2023 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2024 Idx subexp_idx, Idx from_node, Idx str_idx,
2025 Idx bkref_idx)
2027 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2028 int boundaries;
2030 /* If we are outside the range of the subexpression, return -1 or 1. */
2031 if (str_idx < lim->subexp_from)
2032 return -1;
2034 if (lim->subexp_to < str_idx)
2035 return 1;
2037 /* If we are within the subexpression, return 0. */
2038 boundaries = (str_idx == lim->subexp_from);
2039 boundaries |= (str_idx == lim->subexp_to) << 1;
2040 if (boundaries == 0)
2041 return 0;
2043 /* Else, examine epsilon closure. */
2044 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2045 from_node, bkref_idx);
2048 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2049 which are against limitations from DEST_NODES. */
2051 static reg_errcode_t
2052 internal_function
2053 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2054 const re_node_set *candidates, re_node_set *limits,
2055 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2057 reg_errcode_t err;
2058 Idx node_idx, lim_idx;
2060 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2062 Idx subexp_idx;
2063 struct re_backref_cache_entry *ent;
2064 ent = bkref_ents + limits->elems[lim_idx];
2066 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2067 continue; /* This is unrelated limitation. */
2069 subexp_idx = dfa->nodes[ent->node].opr.idx;
2070 if (ent->subexp_to == str_idx)
2072 Idx ops_node = -1;
2073 Idx cls_node = -1;
2074 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2076 Idx node = dest_nodes->elems[node_idx];
2077 re_token_type_t type = dfa->nodes[node].type;
2078 if (type == OP_OPEN_SUBEXP
2079 && subexp_idx == dfa->nodes[node].opr.idx)
2080 ops_node = node;
2081 else if (type == OP_CLOSE_SUBEXP
2082 && subexp_idx == dfa->nodes[node].opr.idx)
2083 cls_node = node;
2086 /* Check the limitation of the open subexpression. */
2087 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2088 if (ops_node >= 0)
2090 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2091 candidates);
2092 if (BE (err != REG_NOERROR, 0))
2093 return err;
2096 /* Check the limitation of the close subexpression. */
2097 if (cls_node >= 0)
2098 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2100 Idx node = dest_nodes->elems[node_idx];
2101 if (!re_node_set_contains (dfa->inveclosures + node,
2102 cls_node)
2103 && !re_node_set_contains (dfa->eclosures + node,
2104 cls_node))
2106 /* It is against this limitation.
2107 Remove it form the current sifted state. */
2108 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2109 candidates);
2110 if (BE (err != REG_NOERROR, 0))
2111 return err;
2112 --node_idx;
2116 else /* (ent->subexp_to != str_idx) */
2118 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2120 Idx node = dest_nodes->elems[node_idx];
2121 re_token_type_t type = dfa->nodes[node].type;
2122 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2124 if (subexp_idx != dfa->nodes[node].opr.idx)
2125 continue;
2126 /* It is against this limitation.
2127 Remove it form the current sifted state. */
2128 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2129 candidates);
2130 if (BE (err != REG_NOERROR, 0))
2131 return err;
2136 return REG_NOERROR;
2139 static reg_errcode_t
2140 internal_function __attribute_warn_unused_result__
2141 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2142 Idx str_idx, const re_node_set *candidates)
2144 const re_dfa_t *const dfa = mctx->dfa;
2145 reg_errcode_t err;
2146 Idx node_idx, node;
2147 re_sift_context_t local_sctx;
2148 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2150 if (first_idx == -1)
2151 return REG_NOERROR;
2153 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2155 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2157 Idx enabled_idx;
2158 re_token_type_t type;
2159 struct re_backref_cache_entry *entry;
2160 node = candidates->elems[node_idx];
2161 type = dfa->nodes[node].type;
2162 /* Avoid infinite loop for the REs like "()\1+". */
2163 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2164 continue;
2165 if (type != OP_BACK_REF)
2166 continue;
2168 entry = mctx->bkref_ents + first_idx;
2169 enabled_idx = first_idx;
2172 Idx subexp_len;
2173 Idx to_idx;
2174 Idx dst_node;
2175 bool ok;
2176 re_dfastate_t *cur_state;
2178 if (entry->node != node)
2179 continue;
2180 subexp_len = entry->subexp_to - entry->subexp_from;
2181 to_idx = str_idx + subexp_len;
2182 dst_node = (subexp_len ? dfa->nexts[node]
2183 : dfa->edests[node].elems[0]);
2185 if (to_idx > sctx->last_str_idx
2186 || sctx->sifted_states[to_idx] == NULL
2187 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2188 || check_dst_limits (mctx, &sctx->limits, node,
2189 str_idx, dst_node, to_idx))
2190 continue;
2192 if (local_sctx.sifted_states == NULL)
2194 local_sctx = *sctx;
2195 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2196 if (BE (err != REG_NOERROR, 0))
2197 goto free_return;
2199 local_sctx.last_node = node;
2200 local_sctx.last_str_idx = str_idx;
2201 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2202 if (BE (! ok, 0))
2204 err = REG_ESPACE;
2205 goto free_return;
2207 cur_state = local_sctx.sifted_states[str_idx];
2208 err = sift_states_backward (mctx, &local_sctx);
2209 if (BE (err != REG_NOERROR, 0))
2210 goto free_return;
2211 if (sctx->limited_states != NULL)
2213 err = merge_state_array (dfa, sctx->limited_states,
2214 local_sctx.sifted_states,
2215 str_idx + 1);
2216 if (BE (err != REG_NOERROR, 0))
2217 goto free_return;
2219 local_sctx.sifted_states[str_idx] = cur_state;
2220 re_node_set_remove (&local_sctx.limits, enabled_idx);
2222 /* mctx->bkref_ents may have changed, reload the pointer. */
2223 entry = mctx->bkref_ents + enabled_idx;
2225 while (enabled_idx++, entry++->more);
2227 err = REG_NOERROR;
2228 free_return:
2229 if (local_sctx.sifted_states != NULL)
2231 re_node_set_free (&local_sctx.limits);
2234 return err;
2238 #ifdef RE_ENABLE_I18N
2239 static int
2240 internal_function
2241 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2242 Idx node_idx, Idx str_idx, Idx max_str_idx)
2244 const re_dfa_t *const dfa = mctx->dfa;
2245 int naccepted;
2246 /* Check the node can accept "multi byte". */
2247 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2248 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2249 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2250 dfa->nexts[node_idx]))
2251 /* The node can't accept the "multi byte", or the
2252 destination was already thrown away, then the node
2253 could't accept the current input "multi byte". */
2254 naccepted = 0;
2255 /* Otherwise, it is sure that the node could accept
2256 'naccepted' bytes input. */
2257 return naccepted;
2259 #endif /* RE_ENABLE_I18N */
2262 /* Functions for state transition. */
2264 /* Return the next state to which the current state STATE will transit by
2265 accepting the current input byte, and update STATE_LOG if necessary.
2266 If STATE can accept a multibyte char/collating element/back reference
2267 update the destination of STATE_LOG. */
2269 static re_dfastate_t *
2270 internal_function __attribute_warn_unused_result__
2271 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2272 re_dfastate_t *state)
2274 re_dfastate_t **trtable;
2275 unsigned char ch;
2277 #ifdef RE_ENABLE_I18N
2278 /* If the current state can accept multibyte. */
2279 if (BE (state->accept_mb, 0))
2281 *err = transit_state_mb (mctx, state);
2282 if (BE (*err != REG_NOERROR, 0))
2283 return NULL;
2285 #endif /* RE_ENABLE_I18N */
2287 /* Then decide the next state with the single byte. */
2288 #if 0
2289 if (0)
2290 /* don't use transition table */
2291 return transit_state_sb (err, mctx, state);
2292 #endif
2294 /* Use transition table */
2295 ch = re_string_fetch_byte (&mctx->input);
2296 for (;;)
2298 trtable = state->trtable;
2299 if (BE (trtable != NULL, 1))
2300 return trtable[ch];
2302 trtable = state->word_trtable;
2303 if (BE (trtable != NULL, 1))
2305 unsigned int context;
2306 context
2307 = re_string_context_at (&mctx->input,
2308 re_string_cur_idx (&mctx->input) - 1,
2309 mctx->eflags);
2310 if (IS_WORD_CONTEXT (context))
2311 return trtable[ch + SBC_MAX];
2312 else
2313 return trtable[ch];
2316 if (!build_trtable (mctx->dfa, state))
2318 *err = REG_ESPACE;
2319 return NULL;
2322 /* Retry, we now have a transition table. */
2326 /* Update the state_log if we need */
2327 static re_dfastate_t *
2328 internal_function
2329 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2330 re_dfastate_t *next_state)
2332 const re_dfa_t *const dfa = mctx->dfa;
2333 Idx cur_idx = re_string_cur_idx (&mctx->input);
2335 if (cur_idx > mctx->state_log_top)
2337 mctx->state_log[cur_idx] = next_state;
2338 mctx->state_log_top = cur_idx;
2340 else if (mctx->state_log[cur_idx] == 0)
2342 mctx->state_log[cur_idx] = next_state;
2344 else
2346 re_dfastate_t *pstate;
2347 unsigned int context;
2348 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2349 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2350 the destination of a multibyte char/collating element/
2351 back reference. Then the next state is the union set of
2352 these destinations and the results of the transition table. */
2353 pstate = mctx->state_log[cur_idx];
2354 log_nodes = pstate->entrance_nodes;
2355 if (next_state != NULL)
2357 table_nodes = next_state->entrance_nodes;
2358 *err = re_node_set_init_union (&next_nodes, table_nodes,
2359 log_nodes);
2360 if (BE (*err != REG_NOERROR, 0))
2361 return NULL;
2363 else
2364 next_nodes = *log_nodes;
2365 /* Note: We already add the nodes of the initial state,
2366 then we don't need to add them here. */
2368 context = re_string_context_at (&mctx->input,
2369 re_string_cur_idx (&mctx->input) - 1,
2370 mctx->eflags);
2371 next_state = mctx->state_log[cur_idx]
2372 = re_acquire_state_context (err, dfa, &next_nodes, context);
2373 /* We don't need to check errors here, since the return value of
2374 this function is next_state and ERR is already set. */
2376 if (table_nodes != NULL)
2377 re_node_set_free (&next_nodes);
2380 if (BE (dfa->nbackref, 0) && next_state != NULL)
2382 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2383 later. We must check them here, since the back references in the
2384 next state might use them. */
2385 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2386 cur_idx);
2387 if (BE (*err != REG_NOERROR, 0))
2388 return NULL;
2390 /* If the next state has back references. */
2391 if (next_state->has_backref)
2393 *err = transit_state_bkref (mctx, &next_state->nodes);
2394 if (BE (*err != REG_NOERROR, 0))
2395 return NULL;
2396 next_state = mctx->state_log[cur_idx];
2400 return next_state;
2403 /* Skip bytes in the input that correspond to part of a
2404 multi-byte match, then look in the log for a state
2405 from which to restart matching. */
2406 static re_dfastate_t *
2407 internal_function
2408 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2410 re_dfastate_t *cur_state;
2413 Idx max = mctx->state_log_top;
2414 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2418 if (++cur_str_idx > max)
2419 return NULL;
2420 re_string_skip_bytes (&mctx->input, 1);
2422 while (mctx->state_log[cur_str_idx] == NULL);
2424 cur_state = merge_state_with_log (err, mctx, NULL);
2426 while (*err == REG_NOERROR && cur_state == NULL);
2427 return cur_state;
2430 /* Helper functions for transit_state. */
2432 /* From the node set CUR_NODES, pick up the nodes whose types are
2433 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2434 expression. And register them to use them later for evaluating the
2435 corresponding back references. */
2437 static reg_errcode_t
2438 internal_function
2439 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2440 Idx str_idx)
2442 const re_dfa_t *const dfa = mctx->dfa;
2443 Idx node_idx;
2444 reg_errcode_t err;
2446 /* TODO: This isn't efficient.
2447 Because there might be more than one nodes whose types are
2448 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2449 nodes.
2450 E.g. RE: (a){2} */
2451 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2453 Idx node = cur_nodes->elems[node_idx];
2454 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2455 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2456 && (dfa->used_bkref_map
2457 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2459 err = match_ctx_add_subtop (mctx, node, str_idx);
2460 if (BE (err != REG_NOERROR, 0))
2461 return err;
2464 return REG_NOERROR;
2467 #if 0
2468 /* Return the next state to which the current state STATE will transit by
2469 accepting the current input byte. */
2471 static re_dfastate_t *
2472 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2473 re_dfastate_t *state)
2475 const re_dfa_t *const dfa = mctx->dfa;
2476 re_node_set next_nodes;
2477 re_dfastate_t *next_state;
2478 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2479 unsigned int context;
2481 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2482 if (BE (*err != REG_NOERROR, 0))
2483 return NULL;
2484 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2486 Idx cur_node = state->nodes.elems[node_cnt];
2487 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2489 *err = re_node_set_merge (&next_nodes,
2490 dfa->eclosures + dfa->nexts[cur_node]);
2491 if (BE (*err != REG_NOERROR, 0))
2493 re_node_set_free (&next_nodes);
2494 return NULL;
2498 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2499 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2500 /* We don't need to check errors here, since the return value of
2501 this function is next_state and ERR is already set. */
2503 re_node_set_free (&next_nodes);
2504 re_string_skip_bytes (&mctx->input, 1);
2505 return next_state;
2507 #endif
2509 #ifdef RE_ENABLE_I18N
2510 static reg_errcode_t
2511 internal_function
2512 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2514 const re_dfa_t *const dfa = mctx->dfa;
2515 reg_errcode_t err;
2516 Idx i;
2518 for (i = 0; i < pstate->nodes.nelem; ++i)
2520 re_node_set dest_nodes, *new_nodes;
2521 Idx cur_node_idx = pstate->nodes.elems[i];
2522 int naccepted;
2523 Idx dest_idx;
2524 unsigned int context;
2525 re_dfastate_t *dest_state;
2527 if (!dfa->nodes[cur_node_idx].accept_mb)
2528 continue;
2530 if (dfa->nodes[cur_node_idx].constraint)
2532 context = re_string_context_at (&mctx->input,
2533 re_string_cur_idx (&mctx->input),
2534 mctx->eflags);
2535 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2536 context))
2537 continue;
2540 /* How many bytes the node can accept? */
2541 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2542 re_string_cur_idx (&mctx->input));
2543 if (naccepted == 0)
2544 continue;
2546 /* The node can accepts 'naccepted' bytes. */
2547 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2548 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2549 : mctx->max_mb_elem_len);
2550 err = clean_state_log_if_needed (mctx, dest_idx);
2551 if (BE (err != REG_NOERROR, 0))
2552 return err;
2553 #ifdef DEBUG
2554 assert (dfa->nexts[cur_node_idx] != -1);
2555 #endif
2556 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2558 dest_state = mctx->state_log[dest_idx];
2559 if (dest_state == NULL)
2560 dest_nodes = *new_nodes;
2561 else
2563 err = re_node_set_init_union (&dest_nodes,
2564 dest_state->entrance_nodes, new_nodes);
2565 if (BE (err != REG_NOERROR, 0))
2566 return err;
2568 context = re_string_context_at (&mctx->input, dest_idx - 1,
2569 mctx->eflags);
2570 mctx->state_log[dest_idx]
2571 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2572 if (dest_state != NULL)
2573 re_node_set_free (&dest_nodes);
2574 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2575 return err;
2577 return REG_NOERROR;
2579 #endif /* RE_ENABLE_I18N */
2581 static reg_errcode_t
2582 internal_function
2583 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2585 const re_dfa_t *const dfa = mctx->dfa;
2586 reg_errcode_t err;
2587 Idx i;
2588 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2590 for (i = 0; i < nodes->nelem; ++i)
2592 Idx dest_str_idx, prev_nelem, bkc_idx;
2593 Idx node_idx = nodes->elems[i];
2594 unsigned int context;
2595 const re_token_t *node = dfa->nodes + node_idx;
2596 re_node_set *new_dest_nodes;
2598 /* Check whether 'node' is a backreference or not. */
2599 if (node->type != OP_BACK_REF)
2600 continue;
2602 if (node->constraint)
2604 context = re_string_context_at (&mctx->input, cur_str_idx,
2605 mctx->eflags);
2606 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2607 continue;
2610 /* 'node' is a backreference.
2611 Check the substring which the substring matched. */
2612 bkc_idx = mctx->nbkref_ents;
2613 err = get_subexp (mctx, node_idx, cur_str_idx);
2614 if (BE (err != REG_NOERROR, 0))
2615 goto free_return;
2617 /* And add the epsilon closures (which is 'new_dest_nodes') of
2618 the backreference to appropriate state_log. */
2619 #ifdef DEBUG
2620 assert (dfa->nexts[node_idx] != -1);
2621 #endif
2622 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2624 Idx subexp_len;
2625 re_dfastate_t *dest_state;
2626 struct re_backref_cache_entry *bkref_ent;
2627 bkref_ent = mctx->bkref_ents + bkc_idx;
2628 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2629 continue;
2630 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2631 new_dest_nodes = (subexp_len == 0
2632 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2633 : dfa->eclosures + dfa->nexts[node_idx]);
2634 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2635 - bkref_ent->subexp_from);
2636 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2637 mctx->eflags);
2638 dest_state = mctx->state_log[dest_str_idx];
2639 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2640 : mctx->state_log[cur_str_idx]->nodes.nelem);
2641 /* Add 'new_dest_node' to state_log. */
2642 if (dest_state == NULL)
2644 mctx->state_log[dest_str_idx]
2645 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2646 context);
2647 if (BE (mctx->state_log[dest_str_idx] == NULL
2648 && err != REG_NOERROR, 0))
2649 goto free_return;
2651 else
2653 re_node_set dest_nodes;
2654 err = re_node_set_init_union (&dest_nodes,
2655 dest_state->entrance_nodes,
2656 new_dest_nodes);
2657 if (BE (err != REG_NOERROR, 0))
2659 re_node_set_free (&dest_nodes);
2660 goto free_return;
2662 mctx->state_log[dest_str_idx]
2663 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2664 re_node_set_free (&dest_nodes);
2665 if (BE (mctx->state_log[dest_str_idx] == NULL
2666 && err != REG_NOERROR, 0))
2667 goto free_return;
2669 /* We need to check recursively if the backreference can epsilon
2670 transit. */
2671 if (subexp_len == 0
2672 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2674 err = check_subexp_matching_top (mctx, new_dest_nodes,
2675 cur_str_idx);
2676 if (BE (err != REG_NOERROR, 0))
2677 goto free_return;
2678 err = transit_state_bkref (mctx, new_dest_nodes);
2679 if (BE (err != REG_NOERROR, 0))
2680 goto free_return;
2684 err = REG_NOERROR;
2685 free_return:
2686 return err;
2689 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2690 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2691 Note that we might collect inappropriate candidates here.
2692 However, the cost of checking them strictly here is too high, then we
2693 delay these checking for prune_impossible_nodes(). */
2695 static reg_errcode_t
2696 internal_function __attribute_warn_unused_result__
2697 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2699 const re_dfa_t *const dfa = mctx->dfa;
2700 Idx subexp_num, sub_top_idx;
2701 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2702 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2703 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2704 if (cache_idx != -1)
2706 const struct re_backref_cache_entry *entry
2707 = mctx->bkref_ents + cache_idx;
2709 if (entry->node == bkref_node)
2710 return REG_NOERROR; /* We already checked it. */
2711 while (entry++->more);
2714 subexp_num = dfa->nodes[bkref_node].opr.idx;
2716 /* For each sub expression */
2717 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2719 reg_errcode_t err;
2720 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2721 re_sub_match_last_t *sub_last;
2722 Idx sub_last_idx, sl_str, bkref_str_off;
2724 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2725 continue; /* It isn't related. */
2727 sl_str = sub_top->str_idx;
2728 bkref_str_off = bkref_str_idx;
2729 /* At first, check the last node of sub expressions we already
2730 evaluated. */
2731 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2733 regoff_t sl_str_diff;
2734 sub_last = sub_top->lasts[sub_last_idx];
2735 sl_str_diff = sub_last->str_idx - sl_str;
2736 /* The matched string by the sub expression match with the substring
2737 at the back reference? */
2738 if (sl_str_diff > 0)
2740 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2742 /* Not enough chars for a successful match. */
2743 if (bkref_str_off + sl_str_diff > mctx->input.len)
2744 break;
2746 err = clean_state_log_if_needed (mctx,
2747 bkref_str_off
2748 + sl_str_diff);
2749 if (BE (err != REG_NOERROR, 0))
2750 return err;
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2753 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2754 /* We don't need to search this sub expression any more. */
2755 break;
2757 bkref_str_off += sl_str_diff;
2758 sl_str += sl_str_diff;
2759 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2760 bkref_str_idx);
2762 /* Reload buf, since the preceding call might have reallocated
2763 the buffer. */
2764 buf = (const char *) re_string_get_buffer (&mctx->input);
2766 if (err == REG_NOMATCH)
2767 continue;
2768 if (BE (err != REG_NOERROR, 0))
2769 return err;
2772 if (sub_last_idx < sub_top->nlasts)
2773 continue;
2774 if (sub_last_idx > 0)
2775 ++sl_str;
2776 /* Then, search for the other last nodes of the sub expression. */
2777 for (; sl_str <= bkref_str_idx; ++sl_str)
2779 Idx cls_node;
2780 regoff_t sl_str_off;
2781 const re_node_set *nodes;
2782 sl_str_off = sl_str - sub_top->str_idx;
2783 /* The matched string by the sub expression match with the substring
2784 at the back reference? */
2785 if (sl_str_off > 0)
2787 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2789 /* If we are at the end of the input, we cannot match. */
2790 if (bkref_str_off >= mctx->input.len)
2791 break;
2793 err = extend_buffers (mctx, bkref_str_off + 1);
2794 if (BE (err != REG_NOERROR, 0))
2795 return err;
2797 buf = (const char *) re_string_get_buffer (&mctx->input);
2799 if (buf [bkref_str_off++] != buf[sl_str - 1])
2800 break; /* We don't need to search this sub expression
2801 any more. */
2803 if (mctx->state_log[sl_str] == NULL)
2804 continue;
2805 /* Does this state have a ')' of the sub expression? */
2806 nodes = &mctx->state_log[sl_str]->nodes;
2807 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2808 OP_CLOSE_SUBEXP);
2809 if (cls_node == -1)
2810 continue; /* No. */
2811 if (sub_top->path == NULL)
2813 sub_top->path = calloc (sizeof (state_array_t),
2814 sl_str - sub_top->str_idx + 1);
2815 if (sub_top->path == NULL)
2816 return REG_ESPACE;
2818 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2819 in the current context? */
2820 err = check_arrival (mctx, sub_top->path, sub_top->node,
2821 sub_top->str_idx, cls_node, sl_str,
2822 OP_CLOSE_SUBEXP);
2823 if (err == REG_NOMATCH)
2824 continue;
2825 if (BE (err != REG_NOERROR, 0))
2826 return err;
2827 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2828 if (BE (sub_last == NULL, 0))
2829 return REG_ESPACE;
2830 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2831 bkref_str_idx);
2832 if (err == REG_NOMATCH)
2833 continue;
2836 return REG_NOERROR;
2839 /* Helper functions for get_subexp(). */
2841 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2842 If it can arrive, register the sub expression expressed with SUB_TOP
2843 and SUB_LAST. */
2845 static reg_errcode_t
2846 internal_function
2847 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2848 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2850 reg_errcode_t err;
2851 Idx to_idx;
2852 /* Can the subexpression arrive the back reference? */
2853 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2854 sub_last->str_idx, bkref_node, bkref_str,
2855 OP_OPEN_SUBEXP);
2856 if (err != REG_NOERROR)
2857 return err;
2858 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2859 sub_last->str_idx);
2860 if (BE (err != REG_NOERROR, 0))
2861 return err;
2862 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2863 return clean_state_log_if_needed (mctx, to_idx);
2866 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2867 Search '(' if FL_OPEN, or search ')' otherwise.
2868 TODO: This function isn't efficient...
2869 Because there might be more than one nodes whose types are
2870 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2871 nodes.
2872 E.g. RE: (a){2} */
2874 static Idx
2875 internal_function
2876 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2877 Idx subexp_idx, int type)
2879 Idx cls_idx;
2880 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2882 Idx cls_node = nodes->elems[cls_idx];
2883 const re_token_t *node = dfa->nodes + cls_node;
2884 if (node->type == type
2885 && node->opr.idx == subexp_idx)
2886 return cls_node;
2888 return -1;
2891 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2892 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2893 heavily reused.
2894 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2896 static reg_errcode_t
2897 internal_function __attribute_warn_unused_result__
2898 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2899 Idx top_str, Idx last_node, Idx last_str, int type)
2901 const re_dfa_t *const dfa = mctx->dfa;
2902 reg_errcode_t err = REG_NOERROR;
2903 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2904 re_dfastate_t *cur_state = NULL;
2905 re_node_set *cur_nodes, next_nodes;
2906 re_dfastate_t **backup_state_log;
2907 unsigned int context;
2909 subexp_num = dfa->nodes[top_node].opr.idx;
2910 /* Extend the buffer if we need. */
2911 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2913 re_dfastate_t **new_array;
2914 Idx old_alloc = path->alloc;
2915 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2916 Idx new_alloc;
2917 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2918 return REG_ESPACE;
2919 new_alloc = old_alloc + incr_alloc;
2920 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2921 return REG_ESPACE;
2922 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2923 if (BE (new_array == NULL, 0))
2924 return REG_ESPACE;
2925 path->array = new_array;
2926 path->alloc = new_alloc;
2927 memset (new_array + old_alloc, '\0',
2928 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2931 str_idx = path->next_idx ? path->next_idx : top_str;
2933 /* Temporary modify MCTX. */
2934 backup_state_log = mctx->state_log;
2935 backup_cur_idx = mctx->input.cur_idx;
2936 mctx->state_log = path->array;
2937 mctx->input.cur_idx = str_idx;
2939 /* Setup initial node set. */
2940 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941 if (str_idx == top_str)
2943 err = re_node_set_init_1 (&next_nodes, top_node);
2944 if (BE (err != REG_NOERROR, 0))
2945 return err;
2946 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947 if (BE (err != REG_NOERROR, 0))
2949 re_node_set_free (&next_nodes);
2950 return err;
2953 else
2955 cur_state = mctx->state_log[str_idx];
2956 if (cur_state && cur_state->has_backref)
2958 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2959 if (BE (err != REG_NOERROR, 0))
2960 return err;
2962 else
2963 re_node_set_init_empty (&next_nodes);
2965 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2967 if (next_nodes.nelem)
2969 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970 subexp_num, type);
2971 if (BE (err != REG_NOERROR, 0))
2973 re_node_set_free (&next_nodes);
2974 return err;
2977 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2978 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2980 re_node_set_free (&next_nodes);
2981 return err;
2983 mctx->state_log[str_idx] = cur_state;
2986 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2988 re_node_set_empty (&next_nodes);
2989 if (mctx->state_log[str_idx + 1])
2991 err = re_node_set_merge (&next_nodes,
2992 &mctx->state_log[str_idx + 1]->nodes);
2993 if (BE (err != REG_NOERROR, 0))
2995 re_node_set_free (&next_nodes);
2996 return err;
2999 if (cur_state)
3001 err = check_arrival_add_next_nodes (mctx, str_idx,
3002 &cur_state->non_eps_nodes,
3003 &next_nodes);
3004 if (BE (err != REG_NOERROR, 0))
3006 re_node_set_free (&next_nodes);
3007 return err;
3010 ++str_idx;
3011 if (next_nodes.nelem)
3013 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3014 if (BE (err != REG_NOERROR, 0))
3016 re_node_set_free (&next_nodes);
3017 return err;
3019 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3020 subexp_num, type);
3021 if (BE (err != REG_NOERROR, 0))
3023 re_node_set_free (&next_nodes);
3024 return err;
3027 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3028 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3029 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3031 re_node_set_free (&next_nodes);
3032 return err;
3034 mctx->state_log[str_idx] = cur_state;
3035 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3037 re_node_set_free (&next_nodes);
3038 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3039 : &mctx->state_log[last_str]->nodes);
3040 path->next_idx = str_idx;
3042 /* Fix MCTX. */
3043 mctx->state_log = backup_state_log;
3044 mctx->input.cur_idx = backup_cur_idx;
3046 /* Then check the current node set has the node LAST_NODE. */
3047 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3048 return REG_NOERROR;
3050 return REG_NOMATCH;
3053 /* Helper functions for check_arrival. */
3055 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3056 to NEXT_NODES.
3057 TODO: This function is similar to the functions transit_state*(),
3058 however this function has many additional works.
3059 Can't we unify them? */
3061 static reg_errcode_t
3062 internal_function __attribute_warn_unused_result__
3063 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3064 re_node_set *cur_nodes, re_node_set *next_nodes)
3066 const re_dfa_t *const dfa = mctx->dfa;
3067 bool ok;
3068 Idx cur_idx;
3069 #ifdef RE_ENABLE_I18N
3070 reg_errcode_t err = REG_NOERROR;
3071 #endif
3072 re_node_set union_set;
3073 re_node_set_init_empty (&union_set);
3074 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3076 int naccepted = 0;
3077 Idx cur_node = cur_nodes->elems[cur_idx];
3078 #ifdef DEBUG
3079 re_token_type_t type = dfa->nodes[cur_node].type;
3080 assert (!IS_EPSILON_NODE (type));
3081 #endif
3082 #ifdef RE_ENABLE_I18N
3083 /* If the node may accept "multi byte". */
3084 if (dfa->nodes[cur_node].accept_mb)
3086 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3087 str_idx);
3088 if (naccepted > 1)
3090 re_dfastate_t *dest_state;
3091 Idx next_node = dfa->nexts[cur_node];
3092 Idx next_idx = str_idx + naccepted;
3093 dest_state = mctx->state_log[next_idx];
3094 re_node_set_empty (&union_set);
3095 if (dest_state)
3097 err = re_node_set_merge (&union_set, &dest_state->nodes);
3098 if (BE (err != REG_NOERROR, 0))
3100 re_node_set_free (&union_set);
3101 return err;
3104 ok = re_node_set_insert (&union_set, next_node);
3105 if (BE (! ok, 0))
3107 re_node_set_free (&union_set);
3108 return REG_ESPACE;
3110 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3111 &union_set);
3112 if (BE (mctx->state_log[next_idx] == NULL
3113 && err != REG_NOERROR, 0))
3115 re_node_set_free (&union_set);
3116 return err;
3120 #endif /* RE_ENABLE_I18N */
3121 if (naccepted
3122 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3124 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3125 if (BE (! ok, 0))
3127 re_node_set_free (&union_set);
3128 return REG_ESPACE;
3132 re_node_set_free (&union_set);
3133 return REG_NOERROR;
3136 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3137 CUR_NODES, however exclude the nodes which are:
3138 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3139 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3142 static reg_errcode_t
3143 internal_function
3144 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3145 Idx ex_subexp, int type)
3147 reg_errcode_t err;
3148 Idx idx, outside_node;
3149 re_node_set new_nodes;
3150 #ifdef DEBUG
3151 assert (cur_nodes->nelem);
3152 #endif
3153 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3154 if (BE (err != REG_NOERROR, 0))
3155 return err;
3156 /* Create a new node set NEW_NODES with the nodes which are epsilon
3157 closures of the node in CUR_NODES. */
3159 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3161 Idx cur_node = cur_nodes->elems[idx];
3162 const re_node_set *eclosure = dfa->eclosures + cur_node;
3163 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3164 if (outside_node == -1)
3166 /* There are no problematic nodes, just merge them. */
3167 err = re_node_set_merge (&new_nodes, eclosure);
3168 if (BE (err != REG_NOERROR, 0))
3170 re_node_set_free (&new_nodes);
3171 return err;
3174 else
3176 /* There are problematic nodes, re-calculate incrementally. */
3177 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3178 ex_subexp, type);
3179 if (BE (err != REG_NOERROR, 0))
3181 re_node_set_free (&new_nodes);
3182 return err;
3186 re_node_set_free (cur_nodes);
3187 *cur_nodes = new_nodes;
3188 return REG_NOERROR;
3191 /* Helper function for check_arrival_expand_ecl.
3192 Check incrementally the epsilon closure of TARGET, and if it isn't
3193 problematic append it to DST_NODES. */
3195 static reg_errcode_t
3196 internal_function __attribute_warn_unused_result__
3197 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3198 Idx target, Idx ex_subexp, int type)
3200 Idx cur_node;
3201 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3203 bool ok;
3205 if (dfa->nodes[cur_node].type == type
3206 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3208 if (type == OP_CLOSE_SUBEXP)
3210 ok = re_node_set_insert (dst_nodes, cur_node);
3211 if (BE (! ok, 0))
3212 return REG_ESPACE;
3214 break;
3216 ok = re_node_set_insert (dst_nodes, cur_node);
3217 if (BE (! ok, 0))
3218 return REG_ESPACE;
3219 if (dfa->edests[cur_node].nelem == 0)
3220 break;
3221 if (dfa->edests[cur_node].nelem == 2)
3223 reg_errcode_t err;
3224 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3225 dfa->edests[cur_node].elems[1],
3226 ex_subexp, type);
3227 if (BE (err != REG_NOERROR, 0))
3228 return err;
3230 cur_node = dfa->edests[cur_node].elems[0];
3232 return REG_NOERROR;
3236 /* For all the back references in the current state, calculate the
3237 destination of the back references by the appropriate entry
3238 in MCTX->BKREF_ENTS. */
3240 static reg_errcode_t
3241 internal_function __attribute_warn_unused_result__
3242 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3243 Idx cur_str, Idx subexp_num, int type)
3245 const re_dfa_t *const dfa = mctx->dfa;
3246 reg_errcode_t err;
3247 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3248 struct re_backref_cache_entry *ent;
3250 if (cache_idx_start == -1)
3251 return REG_NOERROR;
3253 restart:
3254 ent = mctx->bkref_ents + cache_idx_start;
3257 Idx to_idx, next_node;
3259 /* Is this entry ENT is appropriate? */
3260 if (!re_node_set_contains (cur_nodes, ent->node))
3261 continue; /* No. */
3263 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3264 /* Calculate the destination of the back reference, and append it
3265 to MCTX->STATE_LOG. */
3266 if (to_idx == cur_str)
3268 /* The backreference did epsilon transit, we must re-check all the
3269 node in the current state. */
3270 re_node_set new_dests;
3271 reg_errcode_t err2, err3;
3272 next_node = dfa->edests[ent->node].elems[0];
3273 if (re_node_set_contains (cur_nodes, next_node))
3274 continue;
3275 err = re_node_set_init_1 (&new_dests, next_node);
3276 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3277 err3 = re_node_set_merge (cur_nodes, &new_dests);
3278 re_node_set_free (&new_dests);
3279 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3280 || err3 != REG_NOERROR, 0))
3282 err = (err != REG_NOERROR ? err
3283 : (err2 != REG_NOERROR ? err2 : err3));
3284 return err;
3286 /* TODO: It is still inefficient... */
3287 goto restart;
3289 else
3291 re_node_set union_set;
3292 next_node = dfa->nexts[ent->node];
3293 if (mctx->state_log[to_idx])
3295 bool ok;
3296 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3297 next_node))
3298 continue;
3299 err = re_node_set_init_copy (&union_set,
3300 &mctx->state_log[to_idx]->nodes);
3301 ok = re_node_set_insert (&union_set, next_node);
3302 if (BE (err != REG_NOERROR || ! ok, 0))
3304 re_node_set_free (&union_set);
3305 err = err != REG_NOERROR ? err : REG_ESPACE;
3306 return err;
3309 else
3311 err = re_node_set_init_1 (&union_set, next_node);
3312 if (BE (err != REG_NOERROR, 0))
3313 return err;
3315 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3316 re_node_set_free (&union_set);
3317 if (BE (mctx->state_log[to_idx] == NULL
3318 && err != REG_NOERROR, 0))
3319 return err;
3322 while (ent++->more);
3323 return REG_NOERROR;
3326 /* Build transition table for the state.
3327 Return true if successful. */
3329 static bool
3330 internal_function
3331 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3333 reg_errcode_t err;
3334 Idx i, j;
3335 int ch;
3336 bool need_word_trtable = false;
3337 bitset_word_t elem, mask;
3338 bool dests_node_malloced = false;
3339 bool dest_states_malloced = false;
3340 Idx ndests; /* Number of the destination states from 'state'. */
3341 re_dfastate_t **trtable;
3342 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3343 re_node_set follows, *dests_node;
3344 bitset_t *dests_ch;
3345 bitset_t acceptable;
3347 struct dests_alloc
3349 re_node_set dests_node[SBC_MAX];
3350 bitset_t dests_ch[SBC_MAX];
3351 } *dests_alloc;
3353 /* We build DFA states which corresponds to the destination nodes
3354 from 'state'. 'dests_node[i]' represents the nodes which i-th
3355 destination state contains, and 'dests_ch[i]' represents the
3356 characters which i-th destination state accepts. */
3357 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3358 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3359 else
3361 dests_alloc = re_malloc (struct dests_alloc, 1);
3362 if (BE (dests_alloc == NULL, 0))
3363 return false;
3364 dests_node_malloced = true;
3366 dests_node = dests_alloc->dests_node;
3367 dests_ch = dests_alloc->dests_ch;
3369 /* Initialize transition table. */
3370 state->word_trtable = state->trtable = NULL;
3372 /* At first, group all nodes belonging to 'state' into several
3373 destinations. */
3374 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3375 if (BE (ndests <= 0, 0))
3377 if (dests_node_malloced)
3378 free (dests_alloc);
3379 /* Return false in case of an error, true otherwise. */
3380 if (ndests == 0)
3382 state->trtable = (re_dfastate_t **)
3383 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3384 if (BE (state->trtable == NULL, 0))
3385 return false;
3386 return true;
3388 return false;
3391 err = re_node_set_alloc (&follows, ndests + 1);
3392 if (BE (err != REG_NOERROR, 0))
3393 goto out_free;
3395 /* Avoid arithmetic overflow in size calculation. */
3396 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3397 / (3 * sizeof (re_dfastate_t *)))
3398 < ndests),
3400 goto out_free;
3402 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3403 + ndests * 3 * sizeof (re_dfastate_t *)))
3404 dest_states = (re_dfastate_t **)
3405 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3406 else
3408 dest_states = (re_dfastate_t **)
3409 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3410 if (BE (dest_states == NULL, 0))
3412 out_free:
3413 if (dest_states_malloced)
3414 free (dest_states);
3415 re_node_set_free (&follows);
3416 for (i = 0; i < ndests; ++i)
3417 re_node_set_free (dests_node + i);
3418 if (dests_node_malloced)
3419 free (dests_alloc);
3420 return false;
3422 dest_states_malloced = true;
3424 dest_states_word = dest_states + ndests;
3425 dest_states_nl = dest_states_word + ndests;
3426 bitset_empty (acceptable);
3428 /* Then build the states for all destinations. */
3429 for (i = 0; i < ndests; ++i)
3431 Idx next_node;
3432 re_node_set_empty (&follows);
3433 /* Merge the follows of this destination states. */
3434 for (j = 0; j < dests_node[i].nelem; ++j)
3436 next_node = dfa->nexts[dests_node[i].elems[j]];
3437 if (next_node != -1)
3439 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3440 if (BE (err != REG_NOERROR, 0))
3441 goto out_free;
3444 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3445 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3446 goto out_free;
3447 /* If the new state has context constraint,
3448 build appropriate states for these contexts. */
3449 if (dest_states[i]->has_constraint)
3451 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3452 CONTEXT_WORD);
3453 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3454 goto out_free;
3456 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3457 need_word_trtable = true;
3459 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3460 CONTEXT_NEWLINE);
3461 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3462 goto out_free;
3464 else
3466 dest_states_word[i] = dest_states[i];
3467 dest_states_nl[i] = dest_states[i];
3469 bitset_merge (acceptable, dests_ch[i]);
3472 if (!BE (need_word_trtable, 0))
3474 /* We don't care about whether the following character is a word
3475 character, or we are in a single-byte character set so we can
3476 discern by looking at the character code: allocate a
3477 256-entry transition table. */
3478 trtable = state->trtable =
3479 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3480 if (BE (trtable == NULL, 0))
3481 goto out_free;
3483 /* For all characters ch...: */
3484 for (i = 0; i < BITSET_WORDS; ++i)
3485 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3486 elem;
3487 mask <<= 1, elem >>= 1, ++ch)
3488 if (BE (elem & 1, 0))
3490 /* There must be exactly one destination which accepts
3491 character ch. See group_nodes_into_DFAstates. */
3492 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3495 /* j-th destination accepts the word character ch. */
3496 if (dfa->word_char[i] & mask)
3497 trtable[ch] = dest_states_word[j];
3498 else
3499 trtable[ch] = dest_states[j];
3502 else
3504 /* We care about whether the following character is a word
3505 character, and we are in a multi-byte character set: discern
3506 by looking at the character code: build two 256-entry
3507 transition tables, one starting at trtable[0] and one
3508 starting at trtable[SBC_MAX]. */
3509 trtable = state->word_trtable =
3510 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3511 if (BE (trtable == NULL, 0))
3512 goto out_free;
3514 /* For all characters ch...: */
3515 for (i = 0; i < BITSET_WORDS; ++i)
3516 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3517 elem;
3518 mask <<= 1, elem >>= 1, ++ch)
3519 if (BE (elem & 1, 0))
3521 /* There must be exactly one destination which accepts
3522 character ch. See group_nodes_into_DFAstates. */
3523 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3526 /* j-th destination accepts the word character ch. */
3527 trtable[ch] = dest_states[j];
3528 trtable[ch + SBC_MAX] = dest_states_word[j];
3532 /* new line */
3533 if (bitset_contain (acceptable, NEWLINE_CHAR))
3535 /* The current state accepts newline character. */
3536 for (j = 0; j < ndests; ++j)
3537 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3539 /* k-th destination accepts newline character. */
3540 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3541 if (need_word_trtable)
3542 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3543 /* There must be only one destination which accepts
3544 newline. See group_nodes_into_DFAstates. */
3545 break;
3549 if (dest_states_malloced)
3550 free (dest_states);
3552 re_node_set_free (&follows);
3553 for (i = 0; i < ndests; ++i)
3554 re_node_set_free (dests_node + i);
3556 if (dests_node_malloced)
3557 free (dests_alloc);
3559 return true;
3562 /* Group all nodes belonging to STATE into several destinations.
3563 Then for all destinations, set the nodes belonging to the destination
3564 to DESTS_NODE[i] and set the characters accepted by the destination
3565 to DEST_CH[i]. This function return the number of destinations. */
3567 static Idx
3568 internal_function
3569 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3570 re_node_set *dests_node, bitset_t *dests_ch)
3572 reg_errcode_t err;
3573 bool ok;
3574 Idx i, j, k;
3575 Idx ndests; /* Number of the destinations from 'state'. */
3576 bitset_t accepts; /* Characters a node can accept. */
3577 const re_node_set *cur_nodes = &state->nodes;
3578 bitset_empty (accepts);
3579 ndests = 0;
3581 /* For all the nodes belonging to 'state', */
3582 for (i = 0; i < cur_nodes->nelem; ++i)
3584 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3585 re_token_type_t type = node->type;
3586 unsigned int constraint = node->constraint;
3588 /* Enumerate all single byte character this node can accept. */
3589 if (type == CHARACTER)
3590 bitset_set (accepts, node->opr.c);
3591 else if (type == SIMPLE_BRACKET)
3593 bitset_merge (accepts, node->opr.sbcset);
3595 else if (type == OP_PERIOD)
3597 #ifdef RE_ENABLE_I18N
3598 if (dfa->mb_cur_max > 1)
3599 bitset_merge (accepts, dfa->sb_char);
3600 else
3601 #endif
3602 bitset_set_all (accepts);
3603 if (!(dfa->syntax & RE_DOT_NEWLINE))
3604 bitset_clear (accepts, '\n');
3605 if (dfa->syntax & RE_DOT_NOT_NULL)
3606 bitset_clear (accepts, '\0');
3608 #ifdef RE_ENABLE_I18N
3609 else if (type == OP_UTF8_PERIOD)
3611 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3612 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3613 else
3614 bitset_merge (accepts, utf8_sb_map);
3615 if (!(dfa->syntax & RE_DOT_NEWLINE))
3616 bitset_clear (accepts, '\n');
3617 if (dfa->syntax & RE_DOT_NOT_NULL)
3618 bitset_clear (accepts, '\0');
3620 #endif
3621 else
3622 continue;
3624 /* Check the 'accepts' and sift the characters which are not
3625 match it the context. */
3626 if (constraint)
3628 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3630 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3631 bitset_empty (accepts);
3632 if (accepts_newline)
3633 bitset_set (accepts, NEWLINE_CHAR);
3634 else
3635 continue;
3637 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3639 bitset_empty (accepts);
3640 continue;
3643 if (constraint & NEXT_WORD_CONSTRAINT)
3645 bitset_word_t any_set = 0;
3646 if (type == CHARACTER && !node->word_char)
3648 bitset_empty (accepts);
3649 continue;
3651 #ifdef RE_ENABLE_I18N
3652 if (dfa->mb_cur_max > 1)
3653 for (j = 0; j < BITSET_WORDS; ++j)
3654 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3655 else
3656 #endif
3657 for (j = 0; j < BITSET_WORDS; ++j)
3658 any_set |= (accepts[j] &= dfa->word_char[j]);
3659 if (!any_set)
3660 continue;
3662 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3664 bitset_word_t any_set = 0;
3665 if (type == CHARACTER && node->word_char)
3667 bitset_empty (accepts);
3668 continue;
3670 #ifdef RE_ENABLE_I18N
3671 if (dfa->mb_cur_max > 1)
3672 for (j = 0; j < BITSET_WORDS; ++j)
3673 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3674 else
3675 #endif
3676 for (j = 0; j < BITSET_WORDS; ++j)
3677 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3678 if (!any_set)
3679 continue;
3683 /* Then divide 'accepts' into DFA states, or create a new
3684 state. Above, we make sure that accepts is not empty. */
3685 for (j = 0; j < ndests; ++j)
3687 bitset_t intersec; /* Intersection sets, see below. */
3688 bitset_t remains;
3689 /* Flags, see below. */
3690 bitset_word_t has_intersec, not_subset, not_consumed;
3692 /* Optimization, skip if this state doesn't accept the character. */
3693 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3694 continue;
3696 /* Enumerate the intersection set of this state and 'accepts'. */
3697 has_intersec = 0;
3698 for (k = 0; k < BITSET_WORDS; ++k)
3699 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3700 /* And skip if the intersection set is empty. */
3701 if (!has_intersec)
3702 continue;
3704 /* Then check if this state is a subset of 'accepts'. */
3705 not_subset = not_consumed = 0;
3706 for (k = 0; k < BITSET_WORDS; ++k)
3708 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3709 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3712 /* If this state isn't a subset of 'accepts', create a
3713 new group state, which has the 'remains'. */
3714 if (not_subset)
3716 bitset_copy (dests_ch[ndests], remains);
3717 bitset_copy (dests_ch[j], intersec);
3718 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3719 if (BE (err != REG_NOERROR, 0))
3720 goto error_return;
3721 ++ndests;
3724 /* Put the position in the current group. */
3725 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3726 if (BE (! ok, 0))
3727 goto error_return;
3729 /* If all characters are consumed, go to next node. */
3730 if (!not_consumed)
3731 break;
3733 /* Some characters remain, create a new group. */
3734 if (j == ndests)
3736 bitset_copy (dests_ch[ndests], accepts);
3737 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3738 if (BE (err != REG_NOERROR, 0))
3739 goto error_return;
3740 ++ndests;
3741 bitset_empty (accepts);
3744 return ndests;
3745 error_return:
3746 for (j = 0; j < ndests; ++j)
3747 re_node_set_free (dests_node + j);
3748 return -1;
3751 #ifdef RE_ENABLE_I18N
3752 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3753 Return the number of the bytes the node accepts.
3754 STR_IDX is the current index of the input string.
3756 This function handles the nodes which can accept one character, or
3757 one collating element like '.', '[a-z]', opposite to the other nodes
3758 can only accept one byte. */
3760 # ifdef _LIBC
3761 # include <locale/weight.h>
3762 # endif
3764 static int
3765 internal_function
3766 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3767 const re_string_t *input, Idx str_idx)
3769 const re_token_t *node = dfa->nodes + node_idx;
3770 int char_len, elem_len;
3771 Idx i;
3773 if (BE (node->type == OP_UTF8_PERIOD, 0))
3775 unsigned char c = re_string_byte_at (input, str_idx), d;
3776 if (BE (c < 0xc2, 1))
3777 return 0;
3779 if (str_idx + 2 > input->len)
3780 return 0;
3782 d = re_string_byte_at (input, str_idx + 1);
3783 if (c < 0xe0)
3784 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3785 else if (c < 0xf0)
3787 char_len = 3;
3788 if (c == 0xe0 && d < 0xa0)
3789 return 0;
3791 else if (c < 0xf8)
3793 char_len = 4;
3794 if (c == 0xf0 && d < 0x90)
3795 return 0;
3797 else if (c < 0xfc)
3799 char_len = 5;
3800 if (c == 0xf8 && d < 0x88)
3801 return 0;
3803 else if (c < 0xfe)
3805 char_len = 6;
3806 if (c == 0xfc && d < 0x84)
3807 return 0;
3809 else
3810 return 0;
3812 if (str_idx + char_len > input->len)
3813 return 0;
3815 for (i = 1; i < char_len; ++i)
3817 d = re_string_byte_at (input, str_idx + i);
3818 if (d < 0x80 || d > 0xbf)
3819 return 0;
3821 return char_len;
3824 char_len = re_string_char_size_at (input, str_idx);
3825 if (node->type == OP_PERIOD)
3827 if (char_len <= 1)
3828 return 0;
3829 /* FIXME: I don't think this if is needed, as both '\n'
3830 and '\0' are char_len == 1. */
3831 /* '.' accepts any one character except the following two cases. */
3832 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3833 re_string_byte_at (input, str_idx) == '\n') ||
3834 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3835 re_string_byte_at (input, str_idx) == '\0'))
3836 return 0;
3837 return char_len;
3840 elem_len = re_string_elem_size_at (input, str_idx);
3841 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3842 return 0;
3844 if (node->type == COMPLEX_BRACKET)
3846 const re_charset_t *cset = node->opr.mbcset;
3847 # ifdef _LIBC
3848 const unsigned char *pin
3849 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3850 Idx j;
3851 uint32_t nrules;
3852 # endif /* _LIBC */
3853 int match_len = 0;
3854 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3855 ? re_string_wchar_at (input, str_idx) : 0);
3857 /* match with multibyte character? */
3858 for (i = 0; i < cset->nmbchars; ++i)
3859 if (wc == cset->mbchars[i])
3861 match_len = char_len;
3862 goto check_node_accept_bytes_match;
3864 /* match with character_class? */
3865 for (i = 0; i < cset->nchar_classes; ++i)
3867 wctype_t wt = cset->char_classes[i];
3868 if (__iswctype (wc, wt))
3870 match_len = char_len;
3871 goto check_node_accept_bytes_match;
3875 # ifdef _LIBC
3876 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3877 if (nrules != 0)
3879 unsigned int in_collseq = 0;
3880 const int32_t *table, *indirect;
3881 const unsigned char *weights, *extra;
3882 const char *collseqwc;
3884 /* match with collating_symbol? */
3885 if (cset->ncoll_syms)
3886 extra = (const unsigned char *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3888 for (i = 0; i < cset->ncoll_syms; ++i)
3890 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3891 /* Compare the length of input collating element and
3892 the length of current collating element. */
3893 if (*coll_sym != elem_len)
3894 continue;
3895 /* Compare each bytes. */
3896 for (j = 0; j < *coll_sym; j++)
3897 if (pin[j] != coll_sym[1 + j])
3898 break;
3899 if (j == *coll_sym)
3901 /* Match if every bytes is equal. */
3902 match_len = j;
3903 goto check_node_accept_bytes_match;
3907 if (cset->nranges)
3909 if (elem_len <= char_len)
3911 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3912 in_collseq = __collseq_table_lookup (collseqwc, wc);
3914 else
3915 in_collseq = find_collation_sequence_value (pin, elem_len);
3917 /* match with range expression? */
3918 /* FIXME: Implement rational ranges here, too. */
3919 for (i = 0; i < cset->nranges; ++i)
3920 if (cset->range_starts[i] <= in_collseq
3921 && in_collseq <= cset->range_ends[i])
3923 match_len = elem_len;
3924 goto check_node_accept_bytes_match;
3927 /* match with equivalence_class? */
3928 if (cset->nequiv_classes)
3930 const unsigned char *cp = pin;
3931 table = (const int32_t *)
3932 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3933 weights = (const unsigned char *)
3934 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3935 extra = (const unsigned char *)
3936 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3937 indirect = (const int32_t *)
3938 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3939 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3940 if (idx > 0)
3941 for (i = 0; i < cset->nequiv_classes; ++i)
3943 int32_t equiv_class_idx = cset->equiv_classes[i];
3944 size_t weight_len = weights[idx & 0xffffff];
3945 if (weight_len == weights[equiv_class_idx & 0xffffff]
3946 && (idx >> 24) == (equiv_class_idx >> 24))
3948 Idx cnt = 0;
3950 idx &= 0xffffff;
3951 equiv_class_idx &= 0xffffff;
3953 while (cnt <= weight_len
3954 && (weights[equiv_class_idx + 1 + cnt]
3955 == weights[idx + 1 + cnt]))
3956 ++cnt;
3957 if (cnt > weight_len)
3959 match_len = elem_len;
3960 goto check_node_accept_bytes_match;
3966 else
3967 # endif /* _LIBC */
3969 /* match with range expression? */
3970 for (i = 0; i < cset->nranges; ++i)
3972 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3974 match_len = char_len;
3975 goto check_node_accept_bytes_match;
3979 check_node_accept_bytes_match:
3980 if (!cset->non_match)
3981 return match_len;
3982 else
3984 if (match_len > 0)
3985 return 0;
3986 else
3987 return (elem_len > char_len) ? elem_len : char_len;
3990 return 0;
3993 # ifdef _LIBC
3994 static unsigned int
3995 internal_function
3996 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3998 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3999 if (nrules == 0)
4001 if (mbs_len == 1)
4003 /* No valid character. Match it as a single byte character. */
4004 const unsigned char *collseq = (const unsigned char *)
4005 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4006 return collseq[mbs[0]];
4008 return UINT_MAX;
4010 else
4012 int32_t idx;
4013 const unsigned char *extra = (const unsigned char *)
4014 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4015 int32_t extrasize = (const unsigned char *)
4016 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4018 for (idx = 0; idx < extrasize;)
4020 int mbs_cnt;
4021 bool found = false;
4022 int32_t elem_mbs_len;
4023 /* Skip the name of collating element name. */
4024 idx = idx + extra[idx] + 1;
4025 elem_mbs_len = extra[idx++];
4026 if (mbs_len == elem_mbs_len)
4028 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4029 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4030 break;
4031 if (mbs_cnt == elem_mbs_len)
4032 /* Found the entry. */
4033 found = true;
4035 /* Skip the byte sequence of the collating element. */
4036 idx += elem_mbs_len;
4037 /* Adjust for the alignment. */
4038 idx = (idx + 3) & ~3;
4039 /* Skip the collation sequence value. */
4040 idx += sizeof (uint32_t);
4041 /* Skip the wide char sequence of the collating element. */
4042 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4043 /* If we found the entry, return the sequence value. */
4044 if (found)
4045 return *(uint32_t *) (extra + idx);
4046 /* Skip the collation sequence value. */
4047 idx += sizeof (uint32_t);
4049 return UINT_MAX;
4052 # endif /* _LIBC */
4053 #endif /* RE_ENABLE_I18N */
4055 /* Check whether the node accepts the byte which is IDX-th
4056 byte of the INPUT. */
4058 static bool
4059 internal_function
4060 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4061 Idx idx)
4063 unsigned char ch;
4064 ch = re_string_byte_at (&mctx->input, idx);
4065 switch (node->type)
4067 case CHARACTER:
4068 if (node->opr.c != ch)
4069 return false;
4070 break;
4072 case SIMPLE_BRACKET:
4073 if (!bitset_contain (node->opr.sbcset, ch))
4074 return false;
4075 break;
4077 #ifdef RE_ENABLE_I18N
4078 case OP_UTF8_PERIOD:
4079 if (ch >= ASCII_CHARS)
4080 return false;
4081 /* FALLTHROUGH */
4082 #endif
4083 case OP_PERIOD:
4084 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4085 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4086 return false;
4087 break;
4089 default:
4090 return false;
4093 if (node->constraint)
4095 /* The node has constraints. Check whether the current context
4096 satisfies the constraints. */
4097 unsigned int context = re_string_context_at (&mctx->input, idx,
4098 mctx->eflags);
4099 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4100 return false;
4103 return true;
4106 /* Extend the buffers, if the buffers have run out. */
4108 static reg_errcode_t
4109 internal_function __attribute_warn_unused_result__
4110 extend_buffers (re_match_context_t *mctx, int min_len)
4112 reg_errcode_t ret;
4113 re_string_t *pstr = &mctx->input;
4115 /* Avoid overflow. */
4116 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4117 <= pstr->bufs_len, 0))
4118 return REG_ESPACE;
4120 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4121 ret = re_string_realloc_buffers (pstr,
4122 MAX (min_len,
4123 MIN (pstr->len, pstr->bufs_len * 2)));
4124 if (BE (ret != REG_NOERROR, 0))
4125 return ret;
4127 if (mctx->state_log != NULL)
4129 /* And double the length of state_log. */
4130 /* XXX We have no indication of the size of this buffer. If this
4131 allocation fail we have no indication that the state_log array
4132 does not have the right size. */
4133 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4134 pstr->bufs_len + 1);
4135 if (BE (new_array == NULL, 0))
4136 return REG_ESPACE;
4137 mctx->state_log = new_array;
4140 /* Then reconstruct the buffers. */
4141 if (pstr->icase)
4143 #ifdef RE_ENABLE_I18N
4144 if (pstr->mb_cur_max > 1)
4146 ret = build_wcs_upper_buffer (pstr);
4147 if (BE (ret != REG_NOERROR, 0))
4148 return ret;
4150 else
4151 #endif /* RE_ENABLE_I18N */
4152 build_upper_buffer (pstr);
4154 else
4156 #ifdef RE_ENABLE_I18N
4157 if (pstr->mb_cur_max > 1)
4158 build_wcs_buffer (pstr);
4159 else
4160 #endif /* RE_ENABLE_I18N */
4162 if (pstr->trans != NULL)
4163 re_string_translate_buffer (pstr);
4166 return REG_NOERROR;
4170 /* Functions for matching context. */
4172 /* Initialize MCTX. */
4174 static reg_errcode_t
4175 internal_function __attribute_warn_unused_result__
4176 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4178 mctx->eflags = eflags;
4179 mctx->match_last = -1;
4180 if (n > 0)
4182 /* Avoid overflow. */
4183 size_t max_object_size =
4184 MAX (sizeof (struct re_backref_cache_entry),
4185 sizeof (re_sub_match_top_t *));
4186 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4187 return REG_ESPACE;
4189 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4190 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4191 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4192 return REG_ESPACE;
4194 /* Already zero-ed by the caller.
4195 else
4196 mctx->bkref_ents = NULL;
4197 mctx->nbkref_ents = 0;
4198 mctx->nsub_tops = 0; */
4199 mctx->abkref_ents = n;
4200 mctx->max_mb_elem_len = 1;
4201 mctx->asub_tops = n;
4202 return REG_NOERROR;
4205 /* Clean the entries which depend on the current input in MCTX.
4206 This function must be invoked when the matcher changes the start index
4207 of the input, or changes the input string. */
4209 static void
4210 internal_function
4211 match_ctx_clean (re_match_context_t *mctx)
4213 Idx st_idx;
4214 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4216 Idx sl_idx;
4217 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4218 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4220 re_sub_match_last_t *last = top->lasts[sl_idx];
4221 re_free (last->path.array);
4222 re_free (last);
4224 re_free (top->lasts);
4225 if (top->path)
4227 re_free (top->path->array);
4228 re_free (top->path);
4230 free (top);
4233 mctx->nsub_tops = 0;
4234 mctx->nbkref_ents = 0;
4237 /* Free all the memory associated with MCTX. */
4239 static void
4240 internal_function
4241 match_ctx_free (re_match_context_t *mctx)
4243 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4244 match_ctx_clean (mctx);
4245 re_free (mctx->sub_tops);
4246 re_free (mctx->bkref_ents);
4249 /* Add a new backreference entry to MCTX.
4250 Note that we assume that caller never call this function with duplicate
4251 entry, and call with STR_IDX which isn't smaller than any existing entry.
4254 static reg_errcode_t
4255 internal_function __attribute_warn_unused_result__
4256 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4257 Idx to)
4259 if (mctx->nbkref_ents >= mctx->abkref_ents)
4261 struct re_backref_cache_entry* new_entry;
4262 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4263 mctx->abkref_ents * 2);
4264 if (BE (new_entry == NULL, 0))
4266 re_free (mctx->bkref_ents);
4267 return REG_ESPACE;
4269 mctx->bkref_ents = new_entry;
4270 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4271 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4272 mctx->abkref_ents *= 2;
4274 if (mctx->nbkref_ents > 0
4275 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4276 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4278 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4279 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4280 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4281 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4283 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4284 If bit N is clear, means that this entry won't epsilon-transition to
4285 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4286 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4287 such node.
4289 A backreference does not epsilon-transition unless it is empty, so set
4290 to all zeros if FROM != TO. */
4291 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4292 = (from == to ? -1 : 0);
4294 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4295 if (mctx->max_mb_elem_len < to - from)
4296 mctx->max_mb_elem_len = to - from;
4297 return REG_NOERROR;
4300 /* Return the first entry with the same str_idx, or -1 if none is
4301 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4303 static Idx
4304 internal_function
4305 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4307 Idx left, right, mid, last;
4308 last = right = mctx->nbkref_ents;
4309 for (left = 0; left < right;)
4311 mid = (left + right) / 2;
4312 if (mctx->bkref_ents[mid].str_idx < str_idx)
4313 left = mid + 1;
4314 else
4315 right = mid;
4317 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4318 return left;
4319 else
4320 return -1;
4323 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4324 at STR_IDX. */
4326 static reg_errcode_t
4327 internal_function __attribute_warn_unused_result__
4328 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4330 #ifdef DEBUG
4331 assert (mctx->sub_tops != NULL);
4332 assert (mctx->asub_tops > 0);
4333 #endif
4334 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4336 Idx new_asub_tops = mctx->asub_tops * 2;
4337 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4338 re_sub_match_top_t *,
4339 new_asub_tops);
4340 if (BE (new_array == NULL, 0))
4341 return REG_ESPACE;
4342 mctx->sub_tops = new_array;
4343 mctx->asub_tops = new_asub_tops;
4345 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4346 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4347 return REG_ESPACE;
4348 mctx->sub_tops[mctx->nsub_tops]->node = node;
4349 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4350 return REG_NOERROR;
4353 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4354 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4356 static re_sub_match_last_t *
4357 internal_function
4358 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4360 re_sub_match_last_t *new_entry;
4361 if (BE (subtop->nlasts == subtop->alasts, 0))
4363 Idx new_alasts = 2 * subtop->alasts + 1;
4364 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4365 re_sub_match_last_t *,
4366 new_alasts);
4367 if (BE (new_array == NULL, 0))
4368 return NULL;
4369 subtop->lasts = new_array;
4370 subtop->alasts = new_alasts;
4372 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4373 if (BE (new_entry != NULL, 1))
4375 subtop->lasts[subtop->nlasts] = new_entry;
4376 new_entry->node = node;
4377 new_entry->str_idx = str_idx;
4378 ++subtop->nlasts;
4380 return new_entry;
4383 static void
4384 internal_function
4385 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4386 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4388 sctx->sifted_states = sifted_sts;
4389 sctx->limited_states = limited_sts;
4390 sctx->last_node = last_node;
4391 sctx->last_str_idx = last_str_idx;
4392 re_node_set_init_empty (&sctx->limits);