autoupdate
[gnulib.git] / lib / regexec.c
blobc3e6a5b8cb2ae13f6ff9263e46d39b9dad7a264a
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
175 /* Entry point for POSIX code. */
177 /* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
189 We return 0 if we find a match and REG_NOMATCH if not. */
192 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
195 reg_errcode_t err;
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
202 if (eflags & REG_STARTEND)
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
207 else
209 start = 0;
210 length = strlen (string);
213 lock_lock (dfa->lock);
214 if (preg->no_sub)
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
217 else
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
221 return err != REG_NOERROR;
224 #ifdef _LIBC
225 libc_hidden_def (__regexec)
227 # include <shlib-compat.h>
228 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231 __typeof__ (__regexec) __compat_regexec;
234 attribute_compat_text_section
235 __compat_regexec (const regex_t *_Restrict_ preg,
236 const char *_Restrict_ string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
242 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243 # endif
244 #endif
246 /* Entry points for GNU code. */
248 /* re_match, re_search, re_match_2, re_search_2
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267 and all groups is stored in REGS. (For the "_2" variants, the offsets are
268 computed relative to the concatenation, not relative to the individual
269 strings.)
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
275 regoff_t
276 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 #ifdef _LIBC
282 weak_alias (__re_match, re_match)
283 #endif
285 regoff_t
286 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
292 #ifdef _LIBC
293 weak_alias (__re_search, re_search)
294 #endif
296 regoff_t
297 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
302 start, 0, regs, stop, true);
304 #ifdef _LIBC
305 weak_alias (__re_match_2, re_match_2)
306 #endif
308 regoff_t
309 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, range, regs, stop, false);
316 #ifdef _LIBC
317 weak_alias (__re_search_2, re_search_2)
318 #endif
320 static regoff_t
321 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
326 const char *str;
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
331 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len))))
333 return -2;
335 /* Concatenate the strings. */
336 if (length2 > 0)
337 if (length1 > 0)
339 s = re_malloc (char, len);
341 if (__glibc_unlikely (s == NULL))
342 return -2;
343 #ifdef _LIBC
344 memcpy (__mempcpy (s, string1, length1), string2, length2);
345 #else
346 memcpy (s, string1, length1);
347 memcpy (s + length1, string2, length2);
348 #endif
349 str = s;
351 else
352 str = string2;
353 else
354 str = string1;
356 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
357 ret_len);
358 re_free (s);
359 return rval;
362 /* The parameters have the same meaning as those of re_search.
363 Additional parameters:
364 If RET_LEN is true the length of the match is returned (re_match style);
365 otherwise the position of the match is returned. */
367 static regoff_t
368 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
369 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
370 bool ret_len)
372 reg_errcode_t result;
373 regmatch_t *pmatch;
374 Idx nregs;
375 regoff_t rval;
376 int eflags = 0;
377 re_dfa_t *dfa = bufp->buffer;
378 Idx last_start = start + range;
380 /* Check for out-of-range. */
381 if (__glibc_unlikely (start < 0 || start > length))
382 return -1;
383 if (__glibc_unlikely (length < last_start
384 || (0 <= range && last_start < start)))
385 last_start = length;
386 else if (__glibc_unlikely (last_start < 0
387 || (range < 0 && start <= last_start)))
388 last_start = 0;
390 lock_lock (dfa->lock);
392 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
393 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395 /* Compile fastmap if we haven't yet. */
396 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
397 re_compile_fastmap (bufp);
399 if (__glibc_unlikely (bufp->no_sub))
400 regs = NULL;
402 /* We need at least 1 register. */
403 if (regs == NULL)
404 nregs = 1;
405 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
406 && regs->num_regs <= bufp->re_nsub))
408 nregs = regs->num_regs;
409 if (__glibc_unlikely (nregs < 1))
411 /* Nothing can be copied to regs. */
412 regs = NULL;
413 nregs = 1;
416 else
417 nregs = bufp->re_nsub + 1;
418 pmatch = re_malloc (regmatch_t, nregs);
419 if (__glibc_unlikely (pmatch == NULL))
421 rval = -2;
422 goto out;
425 result = re_search_internal (bufp, string, length, start, last_start, stop,
426 nregs, pmatch, eflags);
428 rval = 0;
430 /* I hope we needn't fill their regs with -1's when no match was found. */
431 if (result != REG_NOERROR)
432 rval = result == REG_NOMATCH ? -1 : -2;
433 else if (regs != NULL)
435 /* If caller wants register contents data back, copy them. */
436 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
437 bufp->regs_allocated);
438 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
439 rval = -2;
442 if (__glibc_likely (rval == 0))
444 if (ret_len)
446 assert (pmatch[0].rm_so == start);
447 rval = pmatch[0].rm_eo - start;
449 else
450 rval = pmatch[0].rm_so;
452 re_free (pmatch);
453 out:
454 lock_unlock (dfa->lock);
455 return rval;
458 static unsigned
459 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
460 int regs_allocated)
462 int rval = REGS_REALLOCATE;
463 Idx i;
464 Idx need_regs = nregs + 1;
465 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
466 uses. */
468 /* Have the register data arrays been allocated? */
469 if (regs_allocated == REGS_UNALLOCATED)
470 { /* No. So allocate them with malloc. */
471 regs->start = re_malloc (regoff_t, need_regs);
472 if (__glibc_unlikely (regs->start == NULL))
473 return REGS_UNALLOCATED;
474 regs->end = re_malloc (regoff_t, need_regs);
475 if (__glibc_unlikely (regs->end == NULL))
477 re_free (regs->start);
478 return REGS_UNALLOCATED;
480 regs->num_regs = need_regs;
482 else if (regs_allocated == REGS_REALLOCATE)
483 { /* Yes. If we need more elements than were already
484 allocated, reallocate them. If we need fewer, just
485 leave it alone. */
486 if (__glibc_unlikely (need_regs > regs->num_regs))
488 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
489 regoff_t *new_end;
490 if (__glibc_unlikely (new_start == NULL))
491 return REGS_UNALLOCATED;
492 new_end = re_realloc (regs->end, regoff_t, need_regs);
493 if (__glibc_unlikely (new_end == NULL))
495 re_free (new_start);
496 return REGS_UNALLOCATED;
498 regs->start = new_start;
499 regs->end = new_end;
500 regs->num_regs = need_regs;
503 else
505 assert (regs_allocated == REGS_FIXED);
506 /* This function may not be called with REGS_FIXED and nregs too big. */
507 assert (regs->num_regs >= nregs);
508 rval = REGS_FIXED;
511 /* Copy the regs. */
512 for (i = 0; i < nregs; ++i)
514 regs->start[i] = pmatch[i].rm_so;
515 regs->end[i] = pmatch[i].rm_eo;
517 for ( ; i < regs->num_regs; ++i)
518 regs->start[i] = regs->end[i] = -1;
520 return rval;
523 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
524 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
525 this memory for recording register information. STARTS and ENDS
526 must be allocated using the malloc library routine, and must each
527 be at least NUM_REGS * sizeof (regoff_t) bytes long.
529 If NUM_REGS == 0, then subsequent matches should allocate their own
530 register data.
532 Unless this function is called, the first search or match using
533 PATTERN_BUFFER will allocate its own register data, without
534 freeing the old data. */
536 void
537 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
538 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 if (num_regs)
542 bufp->regs_allocated = REGS_REALLOCATE;
543 regs->num_regs = num_regs;
544 regs->start = starts;
545 regs->end = ends;
547 else
549 bufp->regs_allocated = REGS_UNALLOCATED;
550 regs->num_regs = 0;
551 regs->start = regs->end = NULL;
554 #ifdef _LIBC
555 weak_alias (__re_set_registers, re_set_registers)
556 #endif
558 /* Entry points compatible with 4.2 BSD regex library. We don't define
559 them unless specifically requested. */
561 #if defined _REGEX_RE_COMP || defined _LIBC
563 # ifdef _LIBC
564 weak_function
565 # endif
566 re_exec (const char *s)
568 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 #endif /* _REGEX_RE_COMP */
572 /* Internal entry point. */
574 /* Searches for a compiled pattern PREG in the string STRING, whose
575 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
576 meaning as with regexec. LAST_START is START + RANGE, where
577 START and RANGE have the same meaning as with re_search.
578 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
579 otherwise return the error code.
580 Note: We assume front end functions already check ranges.
581 (0 <= LAST_START && LAST_START <= LENGTH) */
583 static reg_errcode_t
584 __attribute_warn_unused_result__
585 re_search_internal (const regex_t *preg, const char *string, Idx length,
586 Idx start, Idx last_start, Idx stop, size_t nmatch,
587 regmatch_t pmatch[], int eflags)
589 reg_errcode_t err;
590 const re_dfa_t *dfa = preg->buffer;
591 Idx left_lim, right_lim;
592 int incr;
593 bool fl_longest_match;
594 int match_kind;
595 Idx match_first;
596 Idx match_last = -1;
597 Idx extra_nmatch;
598 bool sb;
599 int ch;
600 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
601 re_match_context_t mctx = { .dfa = dfa };
602 #else
603 re_match_context_t mctx;
604 #endif
605 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
606 && start != last_start && !preg->can_be_null)
607 ? preg->fastmap : NULL);
608 RE_TRANSLATE_TYPE t = preg->translate;
610 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
611 memset (&mctx, '\0', sizeof (re_match_context_t));
612 mctx.dfa = dfa;
613 #endif
615 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
616 nmatch -= extra_nmatch;
618 /* Check if the DFA haven't been compiled. */
619 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
620 || dfa->init_state_word == NULL
621 || dfa->init_state_nl == NULL
622 || dfa->init_state_begbuf == NULL))
623 return REG_NOMATCH;
625 #ifdef DEBUG
626 /* We assume front-end functions already check them. */
627 assert (0 <= last_start && last_start <= length);
628 #endif
630 /* If initial states with non-begbuf contexts have no elements,
631 the regex must be anchored. If preg->newline_anchor is set,
632 we'll never use init_state_nl, so do not check it. */
633 if (dfa->init_state->nodes.nelem == 0
634 && dfa->init_state_word->nodes.nelem == 0
635 && (dfa->init_state_nl->nodes.nelem == 0
636 || !preg->newline_anchor))
638 if (start != 0 && last_start != 0)
639 return REG_NOMATCH;
640 start = last_start = 0;
643 /* We must check the longest matching, if nmatch > 0. */
644 fl_longest_match = (nmatch != 0 || dfa->nbackref);
646 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
647 preg->translate, (preg->syntax & RE_ICASE) != 0,
648 dfa);
649 if (__glibc_unlikely (err != REG_NOERROR))
650 goto free_return;
651 mctx.input.stop = stop;
652 mctx.input.raw_stop = stop;
653 mctx.input.newline_anchor = preg->newline_anchor;
655 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
656 if (__glibc_unlikely (err != REG_NOERROR))
657 goto free_return;
659 /* We will log all the DFA states through which the dfa pass,
660 if nmatch > 1, or this dfa has "multibyte node", which is a
661 back-reference or a node which can accept multibyte character or
662 multi character collating element. */
663 if (nmatch > 1 || dfa->has_mb_node)
665 /* Avoid overflow. */
666 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
667 <= mctx.input.bufs_len)))
669 err = REG_ESPACE;
670 goto free_return;
673 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
674 if (__glibc_unlikely (mctx.state_log == NULL))
676 err = REG_ESPACE;
677 goto free_return;
680 else
681 mctx.state_log = NULL;
683 match_first = start;
684 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
685 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
687 /* Check incrementally whether the input string matches. */
688 incr = (last_start < start) ? -1 : 1;
689 left_lim = (last_start < start) ? last_start : start;
690 right_lim = (last_start < start) ? start : last_start;
691 sb = dfa->mb_cur_max == 1;
692 match_kind =
693 (fastmap
694 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
695 | (start <= last_start ? 2 : 0)
696 | (t != NULL ? 1 : 0))
697 : 8);
699 for (;; match_first += incr)
701 err = REG_NOMATCH;
702 if (match_first < left_lim || right_lim < match_first)
703 goto free_return;
705 /* Advance as rapidly as possible through the string, until we
706 find a plausible place to start matching. This may be done
707 with varying efficiency, so there are various possibilities:
708 only the most common of them are specialized, in order to
709 save on code size. We use a switch statement for speed. */
710 switch (match_kind)
712 case 8:
713 /* No fastmap. */
714 break;
716 case 7:
717 /* Fastmap with single-byte translation, match forward. */
718 while (__glibc_likely (match_first < right_lim)
719 && !fastmap[t[(unsigned char) string[match_first]]])
720 ++match_first;
721 goto forward_match_found_start_or_reached_end;
723 case 6:
724 /* Fastmap without translation, match forward. */
725 while (__glibc_likely (match_first < right_lim)
726 && !fastmap[(unsigned char) string[match_first]])
727 ++match_first;
729 forward_match_found_start_or_reached_end:
730 if (__glibc_unlikely (match_first == right_lim))
732 ch = match_first >= length
733 ? 0 : (unsigned char) string[match_first];
734 if (!fastmap[t ? t[ch] : ch])
735 goto free_return;
737 break;
739 case 4:
740 case 5:
741 /* Fastmap without multi-byte translation, match backwards. */
742 while (match_first >= left_lim)
744 ch = match_first >= length
745 ? 0 : (unsigned char) string[match_first];
746 if (fastmap[t ? t[ch] : ch])
747 break;
748 --match_first;
750 if (match_first < left_lim)
751 goto free_return;
752 break;
754 default:
755 /* In this case, we can't determine easily the current byte,
756 since it might be a component byte of a multibyte
757 character. Then we use the constructed buffer instead. */
758 for (;;)
760 /* If MATCH_FIRST is out of the valid range, reconstruct the
761 buffers. */
762 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
763 if (__glibc_unlikely (offset
764 >= (__re_size_t) mctx.input.valid_raw_len))
766 err = re_string_reconstruct (&mctx.input, match_first,
767 eflags);
768 if (__glibc_unlikely (err != REG_NOERROR))
769 goto free_return;
771 offset = match_first - mctx.input.raw_mbs_idx;
773 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
774 Note that MATCH_FIRST must not be smaller than 0. */
775 ch = (match_first >= length
776 ? 0 : re_string_byte_at (&mctx.input, offset));
777 if (fastmap[ch])
778 break;
779 match_first += incr;
780 if (match_first < left_lim || match_first > right_lim)
782 err = REG_NOMATCH;
783 goto free_return;
786 break;
789 /* Reconstruct the buffers so that the matcher can assume that
790 the matching starts from the beginning of the buffer. */
791 err = re_string_reconstruct (&mctx.input, match_first, eflags);
792 if (__glibc_unlikely (err != REG_NOERROR))
793 goto free_return;
795 #ifdef RE_ENABLE_I18N
796 /* Don't consider this char as a possible match start if it part,
797 yet isn't the head, of a multibyte character. */
798 if (!sb && !re_string_first_byte (&mctx.input, 0))
799 continue;
800 #endif
802 /* It seems to be appropriate one, then use the matcher. */
803 /* We assume that the matching starts from 0. */
804 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
805 match_last = check_matching (&mctx, fl_longest_match,
806 start <= last_start ? &match_first : NULL);
807 if (match_last != -1)
809 if (__glibc_unlikely (match_last == -2))
811 err = REG_ESPACE;
812 goto free_return;
814 else
816 mctx.match_last = match_last;
817 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
819 re_dfastate_t *pstate = mctx.state_log[match_last];
820 mctx.last_node = check_halt_state_context (&mctx, pstate,
821 match_last);
823 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
824 || dfa->nbackref)
826 err = prune_impossible_nodes (&mctx);
827 if (err == REG_NOERROR)
828 break;
829 if (__glibc_unlikely (err != REG_NOMATCH))
830 goto free_return;
831 match_last = -1;
833 else
834 break; /* We found a match. */
838 match_ctx_clean (&mctx);
841 #ifdef DEBUG
842 assert (match_last != -1);
843 assert (err == REG_NOERROR);
844 #endif
846 /* Set pmatch[] if we need. */
847 if (nmatch > 0)
849 Idx reg_idx;
851 /* Initialize registers. */
852 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
853 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
855 /* Set the points where matching start/end. */
856 pmatch[0].rm_so = 0;
857 pmatch[0].rm_eo = mctx.match_last;
858 /* FIXME: This function should fail if mctx.match_last exceeds
859 the maximum possible regoff_t value. We need a new error
860 code REG_OVERFLOW. */
862 if (!preg->no_sub && nmatch > 1)
864 err = set_regs (preg, &mctx, nmatch, pmatch,
865 dfa->has_plural_match && dfa->nbackref > 0);
866 if (__glibc_unlikely (err != REG_NOERROR))
867 goto free_return;
870 /* At last, add the offset to each register, since we slid
871 the buffers so that we could assume that the matching starts
872 from 0. */
873 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
874 if (pmatch[reg_idx].rm_so != -1)
876 #ifdef RE_ENABLE_I18N
877 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
879 pmatch[reg_idx].rm_so =
880 (pmatch[reg_idx].rm_so == mctx.input.valid_len
881 ? mctx.input.valid_raw_len
882 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
883 pmatch[reg_idx].rm_eo =
884 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
885 ? mctx.input.valid_raw_len
886 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
888 #else
889 assert (mctx.input.offsets_needed == 0);
890 #endif
891 pmatch[reg_idx].rm_so += match_first;
892 pmatch[reg_idx].rm_eo += match_first;
894 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
896 pmatch[nmatch + reg_idx].rm_so = -1;
897 pmatch[nmatch + reg_idx].rm_eo = -1;
900 if (dfa->subexp_map)
901 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
902 if (dfa->subexp_map[reg_idx] != reg_idx)
904 pmatch[reg_idx + 1].rm_so
905 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
906 pmatch[reg_idx + 1].rm_eo
907 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
911 free_return:
912 re_free (mctx.state_log);
913 if (dfa->nbackref)
914 match_ctx_free (&mctx);
915 re_string_destruct (&mctx.input);
916 return err;
919 static reg_errcode_t
920 __attribute_warn_unused_result__
921 prune_impossible_nodes (re_match_context_t *mctx)
923 const re_dfa_t *const dfa = mctx->dfa;
924 Idx halt_node, match_last;
925 reg_errcode_t ret;
926 re_dfastate_t **sifted_states;
927 re_dfastate_t **lim_states = NULL;
928 re_sift_context_t sctx;
929 #ifdef DEBUG
930 assert (mctx->state_log != NULL);
931 #endif
932 match_last = mctx->match_last;
933 halt_node = mctx->last_node;
935 /* Avoid overflow. */
936 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
937 <= match_last))
938 return REG_ESPACE;
940 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
941 if (__glibc_unlikely (sifted_states == NULL))
943 ret = REG_ESPACE;
944 goto free_return;
946 if (dfa->nbackref)
948 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
949 if (__glibc_unlikely (lim_states == NULL))
951 ret = REG_ESPACE;
952 goto free_return;
954 while (1)
956 memset (lim_states, '\0',
957 sizeof (re_dfastate_t *) * (match_last + 1));
958 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
959 match_last);
960 ret = sift_states_backward (mctx, &sctx);
961 re_node_set_free (&sctx.limits);
962 if (__glibc_unlikely (ret != REG_NOERROR))
963 goto free_return;
964 if (sifted_states[0] != NULL || lim_states[0] != NULL)
965 break;
968 --match_last;
969 if (match_last < 0)
971 ret = REG_NOMATCH;
972 goto free_return;
974 } while (mctx->state_log[match_last] == NULL
975 || !mctx->state_log[match_last]->halt);
976 halt_node = check_halt_state_context (mctx,
977 mctx->state_log[match_last],
978 match_last);
980 ret = merge_state_array (dfa, sifted_states, lim_states,
981 match_last + 1);
982 re_free (lim_states);
983 lim_states = NULL;
984 if (__glibc_unlikely (ret != REG_NOERROR))
985 goto free_return;
987 else
989 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
990 ret = sift_states_backward (mctx, &sctx);
991 re_node_set_free (&sctx.limits);
992 if (__glibc_unlikely (ret != REG_NOERROR))
993 goto free_return;
994 if (sifted_states[0] == NULL)
996 ret = REG_NOMATCH;
997 goto free_return;
1000 re_free (mctx->state_log);
1001 mctx->state_log = sifted_states;
1002 sifted_states = NULL;
1003 mctx->last_node = halt_node;
1004 mctx->match_last = match_last;
1005 ret = REG_NOERROR;
1006 free_return:
1007 re_free (sifted_states);
1008 re_free (lim_states);
1009 return ret;
1012 /* Acquire an initial state and return it.
1013 We must select appropriate initial state depending on the context,
1014 since initial states may have constraints like "\<", "^", etc.. */
1016 static inline re_dfastate_t *
1017 __attribute__ ((always_inline))
1018 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1019 Idx idx)
1021 const re_dfa_t *const dfa = mctx->dfa;
1022 if (dfa->init_state->has_constraint)
1024 unsigned int context;
1025 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1026 if (IS_WORD_CONTEXT (context))
1027 return dfa->init_state_word;
1028 else if (IS_ORDINARY_CONTEXT (context))
1029 return dfa->init_state;
1030 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1031 return dfa->init_state_begbuf;
1032 else if (IS_NEWLINE_CONTEXT (context))
1033 return dfa->init_state_nl;
1034 else if (IS_BEGBUF_CONTEXT (context))
1036 /* It is relatively rare case, then calculate on demand. */
1037 return re_acquire_state_context (err, dfa,
1038 dfa->init_state->entrance_nodes,
1039 context);
1041 else
1042 /* Must not happen? */
1043 return dfa->init_state;
1045 else
1046 return dfa->init_state;
1049 /* Check whether the regular expression match input string INPUT or not,
1050 and return the index where the matching end. Return -1 if
1051 there is no match, and return -2 in case of an error.
1052 FL_LONGEST_MATCH means we want the POSIX longest matching.
1053 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1054 next place where we may want to try matching.
1055 Note that the matcher assumes that the matching starts from the current
1056 index of the buffer. */
1058 static Idx
1059 __attribute_warn_unused_result__
1060 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1061 Idx *p_match_first)
1063 const re_dfa_t *const dfa = mctx->dfa;
1064 reg_errcode_t err;
1065 Idx match = 0;
1066 Idx match_last = -1;
1067 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1068 re_dfastate_t *cur_state;
1069 bool at_init_state = p_match_first != NULL;
1070 Idx next_start_idx = cur_str_idx;
1072 err = REG_NOERROR;
1073 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1074 /* An initial state must not be NULL (invalid). */
1075 if (__glibc_unlikely (cur_state == NULL))
1077 assert (err == REG_ESPACE);
1078 return -2;
1081 if (mctx->state_log != NULL)
1083 mctx->state_log[cur_str_idx] = cur_state;
1085 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1086 later. E.g. Processing back references. */
1087 if (__glibc_unlikely (dfa->nbackref))
1089 at_init_state = false;
1090 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1091 if (__glibc_unlikely (err != REG_NOERROR))
1092 return err;
1094 if (cur_state->has_backref)
1096 err = transit_state_bkref (mctx, &cur_state->nodes);
1097 if (__glibc_unlikely (err != REG_NOERROR))
1098 return err;
1103 /* If the RE accepts NULL string. */
1104 if (__glibc_unlikely (cur_state->halt))
1106 if (!cur_state->has_constraint
1107 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1109 if (!fl_longest_match)
1110 return cur_str_idx;
1111 else
1113 match_last = cur_str_idx;
1114 match = 1;
1119 while (!re_string_eoi (&mctx->input))
1121 re_dfastate_t *old_state = cur_state;
1122 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1124 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1125 && mctx->input.bufs_len < mctx->input.len)
1126 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1127 && mctx->input.valid_len < mctx->input.len))
1129 err = extend_buffers (mctx, next_char_idx + 1);
1130 if (__glibc_unlikely (err != REG_NOERROR))
1132 assert (err == REG_ESPACE);
1133 return -2;
1137 cur_state = transit_state (&err, mctx, cur_state);
1138 if (mctx->state_log != NULL)
1139 cur_state = merge_state_with_log (&err, mctx, cur_state);
1141 if (cur_state == NULL)
1143 /* Reached the invalid state or an error. Try to recover a valid
1144 state using the state log, if available and if we have not
1145 already found a valid (even if not the longest) match. */
1146 if (__glibc_unlikely (err != REG_NOERROR))
1147 return -2;
1149 if (mctx->state_log == NULL
1150 || (match && !fl_longest_match)
1151 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1152 break;
1155 if (__glibc_unlikely (at_init_state))
1157 if (old_state == cur_state)
1158 next_start_idx = next_char_idx;
1159 else
1160 at_init_state = false;
1163 if (cur_state->halt)
1165 /* Reached a halt state.
1166 Check the halt state can satisfy the current context. */
1167 if (!cur_state->has_constraint
1168 || check_halt_state_context (mctx, cur_state,
1169 re_string_cur_idx (&mctx->input)))
1171 /* We found an appropriate halt state. */
1172 match_last = re_string_cur_idx (&mctx->input);
1173 match = 1;
1175 /* We found a match, do not modify match_first below. */
1176 p_match_first = NULL;
1177 if (!fl_longest_match)
1178 break;
1183 if (p_match_first)
1184 *p_match_first += next_start_idx;
1186 return match_last;
1189 /* Check NODE match the current context. */
1191 static bool
1192 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1194 re_token_type_t type = dfa->nodes[node].type;
1195 unsigned int constraint = dfa->nodes[node].constraint;
1196 if (type != END_OF_RE)
1197 return false;
1198 if (!constraint)
1199 return true;
1200 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1201 return false;
1202 return true;
1205 /* Check the halt state STATE match the current context.
1206 Return 0 if not match, if the node, STATE has, is a halt node and
1207 match the context, return the node. */
1209 static Idx
1210 check_halt_state_context (const re_match_context_t *mctx,
1211 const re_dfastate_t *state, Idx idx)
1213 Idx i;
1214 unsigned int context;
1215 #ifdef DEBUG
1216 assert (state->halt);
1217 #endif
1218 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1219 for (i = 0; i < state->nodes.nelem; ++i)
1220 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1221 return state->nodes.elems[i];
1222 return 0;
1225 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1226 corresponding to the DFA).
1227 Return the destination node, and update EPS_VIA_NODES;
1228 return -1 in case of errors. */
1230 static Idx
1231 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1232 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1233 struct re_fail_stack_t *fs)
1235 const re_dfa_t *const dfa = mctx->dfa;
1236 Idx i;
1237 bool ok;
1238 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1240 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1241 re_node_set *edests = &dfa->edests[node];
1242 Idx dest_node;
1243 ok = re_node_set_insert (eps_via_nodes, node);
1244 if (__glibc_unlikely (! ok))
1245 return -2;
1246 /* Pick up a valid destination, or return -1 if none
1247 is found. */
1248 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1250 Idx candidate = edests->elems[i];
1251 if (!re_node_set_contains (cur_nodes, candidate))
1252 continue;
1253 if (dest_node == -1)
1254 dest_node = candidate;
1256 else
1258 /* In order to avoid infinite loop like "(a*)*", return the second
1259 epsilon-transition if the first was already considered. */
1260 if (re_node_set_contains (eps_via_nodes, dest_node))
1261 return candidate;
1263 /* Otherwise, push the second epsilon-transition on the fail stack. */
1264 else if (fs != NULL
1265 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1266 eps_via_nodes))
1267 return -2;
1269 /* We know we are going to exit. */
1270 break;
1273 return dest_node;
1275 else
1277 Idx naccepted = 0;
1278 re_token_type_t type = dfa->nodes[node].type;
1280 #ifdef RE_ENABLE_I18N
1281 if (dfa->nodes[node].accept_mb)
1282 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1283 else
1284 #endif /* RE_ENABLE_I18N */
1285 if (type == OP_BACK_REF)
1287 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1288 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1289 if (fs != NULL)
1291 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1292 return -1;
1293 else if (naccepted)
1295 char *buf = (char *) re_string_get_buffer (&mctx->input);
1296 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1297 naccepted) != 0)
1298 return -1;
1302 if (naccepted == 0)
1304 Idx dest_node;
1305 ok = re_node_set_insert (eps_via_nodes, node);
1306 if (__glibc_unlikely (! ok))
1307 return -2;
1308 dest_node = dfa->edests[node].elems[0];
1309 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1310 dest_node))
1311 return dest_node;
1315 if (naccepted != 0
1316 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1318 Idx dest_node = dfa->nexts[node];
1319 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1320 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1321 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1322 dest_node)))
1323 return -1;
1324 re_node_set_empty (eps_via_nodes);
1325 return dest_node;
1328 return -1;
1331 static reg_errcode_t
1332 __attribute_warn_unused_result__
1333 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1334 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1336 reg_errcode_t err;
1337 Idx num = fs->num++;
1338 if (fs->num == fs->alloc)
1340 struct re_fail_stack_ent_t *new_array;
1341 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1342 fs->alloc * 2);
1343 if (new_array == NULL)
1344 return REG_ESPACE;
1345 fs->alloc *= 2;
1346 fs->stack = new_array;
1348 fs->stack[num].idx = str_idx;
1349 fs->stack[num].node = dest_node;
1350 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1351 if (fs->stack[num].regs == NULL)
1352 return REG_ESPACE;
1353 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1354 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1355 return err;
1358 static Idx
1359 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1360 regmatch_t *regs, re_node_set *eps_via_nodes)
1362 Idx num = --fs->num;
1363 assert (num >= 0);
1364 *pidx = fs->stack[num].idx;
1365 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1366 re_node_set_free (eps_via_nodes);
1367 re_free (fs->stack[num].regs);
1368 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1369 return fs->stack[num].node;
1372 /* Set the positions where the subexpressions are starts/ends to registers
1373 PMATCH.
1374 Note: We assume that pmatch[0] is already set, and
1375 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1377 static reg_errcode_t
1378 __attribute_warn_unused_result__
1379 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1380 regmatch_t *pmatch, bool fl_backtrack)
1382 const re_dfa_t *dfa = preg->buffer;
1383 Idx idx, cur_node;
1384 re_node_set eps_via_nodes;
1385 struct re_fail_stack_t *fs;
1386 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1387 regmatch_t *prev_idx_match;
1388 bool prev_idx_match_malloced = false;
1390 #ifdef DEBUG
1391 assert (nmatch > 1);
1392 assert (mctx->state_log != NULL);
1393 #endif
1394 if (fl_backtrack)
1396 fs = &fs_body;
1397 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1398 if (fs->stack == NULL)
1399 return REG_ESPACE;
1401 else
1402 fs = NULL;
1404 cur_node = dfa->init_node;
1405 re_node_set_init_empty (&eps_via_nodes);
1407 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1408 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1409 else
1411 prev_idx_match = re_malloc (regmatch_t, nmatch);
1412 if (prev_idx_match == NULL)
1414 free_fail_stack_return (fs);
1415 return REG_ESPACE;
1417 prev_idx_match_malloced = true;
1419 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1421 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1423 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1425 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1427 Idx reg_idx;
1428 if (fs)
1430 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1431 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1432 break;
1433 if (reg_idx == nmatch)
1435 re_node_set_free (&eps_via_nodes);
1436 if (prev_idx_match_malloced)
1437 re_free (prev_idx_match);
1438 return free_fail_stack_return (fs);
1440 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1441 &eps_via_nodes);
1443 else
1445 re_node_set_free (&eps_via_nodes);
1446 if (prev_idx_match_malloced)
1447 re_free (prev_idx_match);
1448 return REG_NOERROR;
1452 /* Proceed to next node. */
1453 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1454 &eps_via_nodes, fs);
1456 if (__glibc_unlikely (cur_node < 0))
1458 if (__glibc_unlikely (cur_node == -2))
1460 re_node_set_free (&eps_via_nodes);
1461 if (prev_idx_match_malloced)
1462 re_free (prev_idx_match);
1463 free_fail_stack_return (fs);
1464 return REG_ESPACE;
1466 if (fs)
1467 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1468 &eps_via_nodes);
1469 else
1471 re_node_set_free (&eps_via_nodes);
1472 if (prev_idx_match_malloced)
1473 re_free (prev_idx_match);
1474 return REG_NOMATCH;
1478 re_node_set_free (&eps_via_nodes);
1479 if (prev_idx_match_malloced)
1480 re_free (prev_idx_match);
1481 return free_fail_stack_return (fs);
1484 static reg_errcode_t
1485 free_fail_stack_return (struct re_fail_stack_t *fs)
1487 if (fs)
1489 Idx fs_idx;
1490 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1492 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1493 re_free (fs->stack[fs_idx].regs);
1495 re_free (fs->stack);
1497 return REG_NOERROR;
1500 static void
1501 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1502 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1504 int type = dfa->nodes[cur_node].type;
1505 if (type == OP_OPEN_SUBEXP)
1507 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1509 /* We are at the first node of this sub expression. */
1510 if (reg_num < nmatch)
1512 pmatch[reg_num].rm_so = cur_idx;
1513 pmatch[reg_num].rm_eo = -1;
1516 else if (type == OP_CLOSE_SUBEXP)
1518 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1519 if (reg_num < nmatch)
1521 /* We are at the last node of this sub expression. */
1522 if (pmatch[reg_num].rm_so < cur_idx)
1524 pmatch[reg_num].rm_eo = cur_idx;
1525 /* This is a non-empty match or we are not inside an optional
1526 subexpression. Accept this right away. */
1527 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1529 else
1531 if (dfa->nodes[cur_node].opt_subexp
1532 && prev_idx_match[reg_num].rm_so != -1)
1533 /* We transited through an empty match for an optional
1534 subexpression, like (a?)*, and this is not the subexp's
1535 first match. Copy back the old content of the registers
1536 so that matches of an inner subexpression are undone as
1537 well, like in ((a?))*. */
1538 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1539 else
1540 /* We completed a subexpression, but it may be part of
1541 an optional one, so do not update PREV_IDX_MATCH. */
1542 pmatch[reg_num].rm_eo = cur_idx;
1548 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1549 and sift the nodes in each states according to the following rules.
1550 Updated state_log will be wrote to STATE_LOG.
1552 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1553 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1554 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1555 the LAST_NODE, we throw away the node 'a'.
1556 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1557 string 's' and transit to 'b':
1558 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1559 away the node 'a'.
1560 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1561 thrown away, we throw away the node 'a'.
1562 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1563 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1564 node 'a'.
1565 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1566 we throw away the node 'a'. */
1568 #define STATE_NODE_CONTAINS(state,node) \
1569 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1571 static reg_errcode_t
1572 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1574 reg_errcode_t err;
1575 int null_cnt = 0;
1576 Idx str_idx = sctx->last_str_idx;
1577 re_node_set cur_dest;
1579 #ifdef DEBUG
1580 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1581 #endif
1583 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1584 transit to the last_node and the last_node itself. */
1585 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1586 if (__glibc_unlikely (err != REG_NOERROR))
1587 return err;
1588 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1589 if (__glibc_unlikely (err != REG_NOERROR))
1590 goto free_return;
1592 /* Then check each states in the state_log. */
1593 while (str_idx > 0)
1595 /* Update counters. */
1596 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1597 if (null_cnt > mctx->max_mb_elem_len)
1599 memset (sctx->sifted_states, '\0',
1600 sizeof (re_dfastate_t *) * str_idx);
1601 re_node_set_free (&cur_dest);
1602 return REG_NOERROR;
1604 re_node_set_empty (&cur_dest);
1605 --str_idx;
1607 if (mctx->state_log[str_idx])
1609 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1610 if (__glibc_unlikely (err != REG_NOERROR))
1611 goto free_return;
1614 /* Add all the nodes which satisfy the following conditions:
1615 - It can epsilon transit to a node in CUR_DEST.
1616 - It is in CUR_SRC.
1617 And update state_log. */
1618 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1619 if (__glibc_unlikely (err != REG_NOERROR))
1620 goto free_return;
1622 err = REG_NOERROR;
1623 free_return:
1624 re_node_set_free (&cur_dest);
1625 return err;
1628 static reg_errcode_t
1629 __attribute_warn_unused_result__
1630 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1631 Idx str_idx, re_node_set *cur_dest)
1633 const re_dfa_t *const dfa = mctx->dfa;
1634 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1635 Idx i;
1637 /* Then build the next sifted state.
1638 We build the next sifted state on 'cur_dest', and update
1639 'sifted_states[str_idx]' with 'cur_dest'.
1640 Note:
1641 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1642 'cur_src' points the node_set of the old 'state_log[str_idx]'
1643 (with the epsilon nodes pre-filtered out). */
1644 for (i = 0; i < cur_src->nelem; i++)
1646 Idx prev_node = cur_src->elems[i];
1647 int naccepted = 0;
1648 bool ok;
1650 #ifdef DEBUG
1651 re_token_type_t type = dfa->nodes[prev_node].type;
1652 assert (!IS_EPSILON_NODE (type));
1653 #endif
1654 #ifdef RE_ENABLE_I18N
1655 /* If the node may accept "multi byte". */
1656 if (dfa->nodes[prev_node].accept_mb)
1657 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1658 str_idx, sctx->last_str_idx);
1659 #endif /* RE_ENABLE_I18N */
1661 /* We don't check backreferences here.
1662 See update_cur_sifted_state(). */
1663 if (!naccepted
1664 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1665 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1666 dfa->nexts[prev_node]))
1667 naccepted = 1;
1669 if (naccepted == 0)
1670 continue;
1672 if (sctx->limits.nelem)
1674 Idx to_idx = str_idx + naccepted;
1675 if (check_dst_limits (mctx, &sctx->limits,
1676 dfa->nexts[prev_node], to_idx,
1677 prev_node, str_idx))
1678 continue;
1680 ok = re_node_set_insert (cur_dest, prev_node);
1681 if (__glibc_unlikely (! ok))
1682 return REG_ESPACE;
1685 return REG_NOERROR;
1688 /* Helper functions. */
1690 static reg_errcode_t
1691 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1693 Idx top = mctx->state_log_top;
1695 if ((next_state_log_idx >= mctx->input.bufs_len
1696 && mctx->input.bufs_len < mctx->input.len)
1697 || (next_state_log_idx >= mctx->input.valid_len
1698 && mctx->input.valid_len < mctx->input.len))
1700 reg_errcode_t err;
1701 err = extend_buffers (mctx, next_state_log_idx + 1);
1702 if (__glibc_unlikely (err != REG_NOERROR))
1703 return err;
1706 if (top < next_state_log_idx)
1708 memset (mctx->state_log + top + 1, '\0',
1709 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1710 mctx->state_log_top = next_state_log_idx;
1712 return REG_NOERROR;
1715 static reg_errcode_t
1716 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1717 re_dfastate_t **src, Idx num)
1719 Idx st_idx;
1720 reg_errcode_t err;
1721 for (st_idx = 0; st_idx < num; ++st_idx)
1723 if (dst[st_idx] == NULL)
1724 dst[st_idx] = src[st_idx];
1725 else if (src[st_idx] != NULL)
1727 re_node_set merged_set;
1728 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1729 &src[st_idx]->nodes);
1730 if (__glibc_unlikely (err != REG_NOERROR))
1731 return err;
1732 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1733 re_node_set_free (&merged_set);
1734 if (__glibc_unlikely (err != REG_NOERROR))
1735 return err;
1738 return REG_NOERROR;
1741 static reg_errcode_t
1742 update_cur_sifted_state (const re_match_context_t *mctx,
1743 re_sift_context_t *sctx, Idx str_idx,
1744 re_node_set *dest_nodes)
1746 const re_dfa_t *const dfa = mctx->dfa;
1747 reg_errcode_t err = REG_NOERROR;
1748 const re_node_set *candidates;
1749 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1750 : &mctx->state_log[str_idx]->nodes);
1752 if (dest_nodes->nelem == 0)
1753 sctx->sifted_states[str_idx] = NULL;
1754 else
1756 if (candidates)
1758 /* At first, add the nodes which can epsilon transit to a node in
1759 DEST_NODE. */
1760 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1761 if (__glibc_unlikely (err != REG_NOERROR))
1762 return err;
1764 /* Then, check the limitations in the current sift_context. */
1765 if (sctx->limits.nelem)
1767 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1768 mctx->bkref_ents, str_idx);
1769 if (__glibc_unlikely (err != REG_NOERROR))
1770 return err;
1774 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1775 if (__glibc_unlikely (err != REG_NOERROR))
1776 return err;
1779 if (candidates && mctx->state_log[str_idx]->has_backref)
1781 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1782 if (__glibc_unlikely (err != REG_NOERROR))
1783 return err;
1785 return REG_NOERROR;
1788 static reg_errcode_t
1789 __attribute_warn_unused_result__
1790 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1791 const re_node_set *candidates)
1793 reg_errcode_t err = REG_NOERROR;
1794 Idx i;
1796 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1797 if (__glibc_unlikely (err != REG_NOERROR))
1798 return err;
1800 if (!state->inveclosure.alloc)
1802 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1803 if (__glibc_unlikely (err != REG_NOERROR))
1804 return REG_ESPACE;
1805 for (i = 0; i < dest_nodes->nelem; i++)
1807 err = re_node_set_merge (&state->inveclosure,
1808 dfa->inveclosures + dest_nodes->elems[i]);
1809 if (__glibc_unlikely (err != REG_NOERROR))
1810 return REG_ESPACE;
1813 return re_node_set_add_intersect (dest_nodes, candidates,
1814 &state->inveclosure);
1817 static reg_errcode_t
1818 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1819 const re_node_set *candidates)
1821 Idx ecl_idx;
1822 reg_errcode_t err;
1823 re_node_set *inv_eclosure = dfa->inveclosures + node;
1824 re_node_set except_nodes;
1825 re_node_set_init_empty (&except_nodes);
1826 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1828 Idx cur_node = inv_eclosure->elems[ecl_idx];
1829 if (cur_node == node)
1830 continue;
1831 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1833 Idx edst1 = dfa->edests[cur_node].elems[0];
1834 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1835 ? dfa->edests[cur_node].elems[1] : -1);
1836 if ((!re_node_set_contains (inv_eclosure, edst1)
1837 && re_node_set_contains (dest_nodes, edst1))
1838 || (edst2 > 0
1839 && !re_node_set_contains (inv_eclosure, edst2)
1840 && re_node_set_contains (dest_nodes, edst2)))
1842 err = re_node_set_add_intersect (&except_nodes, candidates,
1843 dfa->inveclosures + cur_node);
1844 if (__glibc_unlikely (err != REG_NOERROR))
1846 re_node_set_free (&except_nodes);
1847 return err;
1852 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854 Idx cur_node = inv_eclosure->elems[ecl_idx];
1855 if (!re_node_set_contains (&except_nodes, cur_node))
1857 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1858 re_node_set_remove_at (dest_nodes, idx);
1861 re_node_set_free (&except_nodes);
1862 return REG_NOERROR;
1865 static bool
1866 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1867 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1869 const re_dfa_t *const dfa = mctx->dfa;
1870 Idx lim_idx, src_pos, dst_pos;
1872 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1873 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1874 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1876 Idx subexp_idx;
1877 struct re_backref_cache_entry *ent;
1878 ent = mctx->bkref_ents + limits->elems[lim_idx];
1879 subexp_idx = dfa->nodes[ent->node].opr.idx;
1881 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1882 subexp_idx, dst_node, dst_idx,
1883 dst_bkref_idx);
1884 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1885 subexp_idx, src_node, src_idx,
1886 src_bkref_idx);
1888 /* In case of:
1889 <src> <dst> ( <subexp> )
1890 ( <subexp> ) <src> <dst>
1891 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1892 if (src_pos == dst_pos)
1893 continue; /* This is unrelated limitation. */
1894 else
1895 return true;
1897 return false;
1900 static int
1901 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1902 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1904 const re_dfa_t *const dfa = mctx->dfa;
1905 const re_node_set *eclosures = dfa->eclosures + from_node;
1906 Idx node_idx;
1908 /* Else, we are on the boundary: examine the nodes on the epsilon
1909 closure. */
1910 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1912 Idx node = eclosures->elems[node_idx];
1913 switch (dfa->nodes[node].type)
1915 case OP_BACK_REF:
1916 if (bkref_idx != -1)
1918 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1921 Idx dst;
1922 int cpos;
1924 if (ent->node != node)
1925 continue;
1927 if (subexp_idx < BITSET_WORD_BITS
1928 && !(ent->eps_reachable_subexps_map
1929 & ((bitset_word_t) 1 << subexp_idx)))
1930 continue;
1932 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1933 OP_CLOSE_SUBEXP cases below. But, if the
1934 destination node is the same node as the source
1935 node, don't recurse because it would cause an
1936 infinite loop: a regex that exhibits this behavior
1937 is ()\1*\1* */
1938 dst = dfa->edests[node].elems[0];
1939 if (dst == from_node)
1941 if (boundaries & 1)
1942 return -1;
1943 else /* if (boundaries & 2) */
1944 return 0;
1947 cpos =
1948 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1949 dst, bkref_idx);
1950 if (cpos == -1 /* && (boundaries & 1) */)
1951 return -1;
1952 if (cpos == 0 && (boundaries & 2))
1953 return 0;
1955 if (subexp_idx < BITSET_WORD_BITS)
1956 ent->eps_reachable_subexps_map
1957 &= ~((bitset_word_t) 1 << subexp_idx);
1959 while (ent++->more);
1961 break;
1963 case OP_OPEN_SUBEXP:
1964 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1965 return -1;
1966 break;
1968 case OP_CLOSE_SUBEXP:
1969 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1970 return 0;
1971 break;
1973 default:
1974 break;
1978 return (boundaries & 2) ? 1 : 0;
1981 static int
1982 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1983 Idx subexp_idx, Idx from_node, Idx str_idx,
1984 Idx bkref_idx)
1986 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1987 int boundaries;
1989 /* If we are outside the range of the subexpression, return -1 or 1. */
1990 if (str_idx < lim->subexp_from)
1991 return -1;
1993 if (lim->subexp_to < str_idx)
1994 return 1;
1996 /* If we are within the subexpression, return 0. */
1997 boundaries = (str_idx == lim->subexp_from);
1998 boundaries |= (str_idx == lim->subexp_to) << 1;
1999 if (boundaries == 0)
2000 return 0;
2002 /* Else, examine epsilon closure. */
2003 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2004 from_node, bkref_idx);
2007 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2008 which are against limitations from DEST_NODES. */
2010 static reg_errcode_t
2011 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2012 const re_node_set *candidates, re_node_set *limits,
2013 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2015 reg_errcode_t err;
2016 Idx node_idx, lim_idx;
2018 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2020 Idx subexp_idx;
2021 struct re_backref_cache_entry *ent;
2022 ent = bkref_ents + limits->elems[lim_idx];
2024 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2025 continue; /* This is unrelated limitation. */
2027 subexp_idx = dfa->nodes[ent->node].opr.idx;
2028 if (ent->subexp_to == str_idx)
2030 Idx ops_node = -1;
2031 Idx cls_node = -1;
2032 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2034 Idx node = dest_nodes->elems[node_idx];
2035 re_token_type_t type = dfa->nodes[node].type;
2036 if (type == OP_OPEN_SUBEXP
2037 && subexp_idx == dfa->nodes[node].opr.idx)
2038 ops_node = node;
2039 else if (type == OP_CLOSE_SUBEXP
2040 && subexp_idx == dfa->nodes[node].opr.idx)
2041 cls_node = node;
2044 /* Check the limitation of the open subexpression. */
2045 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2046 if (ops_node >= 0)
2048 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2049 candidates);
2050 if (__glibc_unlikely (err != REG_NOERROR))
2051 return err;
2054 /* Check the limitation of the close subexpression. */
2055 if (cls_node >= 0)
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2058 Idx node = dest_nodes->elems[node_idx];
2059 if (!re_node_set_contains (dfa->inveclosures + node,
2060 cls_node)
2061 && !re_node_set_contains (dfa->eclosures + node,
2062 cls_node))
2064 /* It is against this limitation.
2065 Remove it form the current sifted state. */
2066 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 candidates);
2068 if (__glibc_unlikely (err != REG_NOERROR))
2069 return err;
2070 --node_idx;
2074 else /* (ent->subexp_to != str_idx) */
2076 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2078 Idx node = dest_nodes->elems[node_idx];
2079 re_token_type_t type = dfa->nodes[node].type;
2080 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2082 if (subexp_idx != dfa->nodes[node].opr.idx)
2083 continue;
2084 /* It is against this limitation.
2085 Remove it form the current sifted state. */
2086 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2087 candidates);
2088 if (__glibc_unlikely (err != REG_NOERROR))
2089 return err;
2094 return REG_NOERROR;
2097 static reg_errcode_t
2098 __attribute_warn_unused_result__
2099 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2100 Idx str_idx, const re_node_set *candidates)
2102 const re_dfa_t *const dfa = mctx->dfa;
2103 reg_errcode_t err;
2104 Idx node_idx, node;
2105 re_sift_context_t local_sctx;
2106 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2108 if (first_idx == -1)
2109 return REG_NOERROR;
2111 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2113 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2115 Idx enabled_idx;
2116 re_token_type_t type;
2117 struct re_backref_cache_entry *entry;
2118 node = candidates->elems[node_idx];
2119 type = dfa->nodes[node].type;
2120 /* Avoid infinite loop for the REs like "()\1+". */
2121 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2122 continue;
2123 if (type != OP_BACK_REF)
2124 continue;
2126 entry = mctx->bkref_ents + first_idx;
2127 enabled_idx = first_idx;
2130 Idx subexp_len;
2131 Idx to_idx;
2132 Idx dst_node;
2133 bool ok;
2134 re_dfastate_t *cur_state;
2136 if (entry->node != node)
2137 continue;
2138 subexp_len = entry->subexp_to - entry->subexp_from;
2139 to_idx = str_idx + subexp_len;
2140 dst_node = (subexp_len ? dfa->nexts[node]
2141 : dfa->edests[node].elems[0]);
2143 if (to_idx > sctx->last_str_idx
2144 || sctx->sifted_states[to_idx] == NULL
2145 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2146 || check_dst_limits (mctx, &sctx->limits, node,
2147 str_idx, dst_node, to_idx))
2148 continue;
2150 if (local_sctx.sifted_states == NULL)
2152 local_sctx = *sctx;
2153 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2154 if (__glibc_unlikely (err != REG_NOERROR))
2155 goto free_return;
2157 local_sctx.last_node = node;
2158 local_sctx.last_str_idx = str_idx;
2159 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2160 if (__glibc_unlikely (! ok))
2162 err = REG_ESPACE;
2163 goto free_return;
2165 cur_state = local_sctx.sifted_states[str_idx];
2166 err = sift_states_backward (mctx, &local_sctx);
2167 if (__glibc_unlikely (err != REG_NOERROR))
2168 goto free_return;
2169 if (sctx->limited_states != NULL)
2171 err = merge_state_array (dfa, sctx->limited_states,
2172 local_sctx.sifted_states,
2173 str_idx + 1);
2174 if (__glibc_unlikely (err != REG_NOERROR))
2175 goto free_return;
2177 local_sctx.sifted_states[str_idx] = cur_state;
2178 re_node_set_remove (&local_sctx.limits, enabled_idx);
2180 /* mctx->bkref_ents may have changed, reload the pointer. */
2181 entry = mctx->bkref_ents + enabled_idx;
2183 while (enabled_idx++, entry++->more);
2185 err = REG_NOERROR;
2186 free_return:
2187 if (local_sctx.sifted_states != NULL)
2189 re_node_set_free (&local_sctx.limits);
2192 return err;
2196 #ifdef RE_ENABLE_I18N
2197 static int
2198 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2199 Idx node_idx, Idx str_idx, Idx max_str_idx)
2201 const re_dfa_t *const dfa = mctx->dfa;
2202 int naccepted;
2203 /* Check the node can accept "multi byte". */
2204 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2205 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2206 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2207 dfa->nexts[node_idx]))
2208 /* The node can't accept the "multi byte", or the
2209 destination was already thrown away, then the node
2210 could't accept the current input "multi byte". */
2211 naccepted = 0;
2212 /* Otherwise, it is sure that the node could accept
2213 'naccepted' bytes input. */
2214 return naccepted;
2216 #endif /* RE_ENABLE_I18N */
2219 /* Functions for state transition. */
2221 /* Return the next state to which the current state STATE will transit by
2222 accepting the current input byte, and update STATE_LOG if necessary.
2223 If STATE can accept a multibyte char/collating element/back reference
2224 update the destination of STATE_LOG. */
2226 static re_dfastate_t *
2227 __attribute_warn_unused_result__
2228 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2229 re_dfastate_t *state)
2231 re_dfastate_t **trtable;
2232 unsigned char ch;
2234 #ifdef RE_ENABLE_I18N
2235 /* If the current state can accept multibyte. */
2236 if (__glibc_unlikely (state->accept_mb))
2238 *err = transit_state_mb (mctx, state);
2239 if (__glibc_unlikely (*err != REG_NOERROR))
2240 return NULL;
2242 #endif /* RE_ENABLE_I18N */
2244 /* Then decide the next state with the single byte. */
2245 #if 0
2246 if (0)
2247 /* don't use transition table */
2248 return transit_state_sb (err, mctx, state);
2249 #endif
2251 /* Use transition table */
2252 ch = re_string_fetch_byte (&mctx->input);
2253 for (;;)
2255 trtable = state->trtable;
2256 if (__glibc_likely (trtable != NULL))
2257 return trtable[ch];
2259 trtable = state->word_trtable;
2260 if (__glibc_likely (trtable != NULL))
2262 unsigned int context;
2263 context
2264 = re_string_context_at (&mctx->input,
2265 re_string_cur_idx (&mctx->input) - 1,
2266 mctx->eflags);
2267 if (IS_WORD_CONTEXT (context))
2268 return trtable[ch + SBC_MAX];
2269 else
2270 return trtable[ch];
2273 if (!build_trtable (mctx->dfa, state))
2275 *err = REG_ESPACE;
2276 return NULL;
2279 /* Retry, we now have a transition table. */
2283 /* Update the state_log if we need */
2284 static re_dfastate_t *
2285 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2286 re_dfastate_t *next_state)
2288 const re_dfa_t *const dfa = mctx->dfa;
2289 Idx cur_idx = re_string_cur_idx (&mctx->input);
2291 if (cur_idx > mctx->state_log_top)
2293 mctx->state_log[cur_idx] = next_state;
2294 mctx->state_log_top = cur_idx;
2296 else if (mctx->state_log[cur_idx] == 0)
2298 mctx->state_log[cur_idx] = next_state;
2300 else
2302 re_dfastate_t *pstate;
2303 unsigned int context;
2304 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2305 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2306 the destination of a multibyte char/collating element/
2307 back reference. Then the next state is the union set of
2308 these destinations and the results of the transition table. */
2309 pstate = mctx->state_log[cur_idx];
2310 log_nodes = pstate->entrance_nodes;
2311 if (next_state != NULL)
2313 table_nodes = next_state->entrance_nodes;
2314 *err = re_node_set_init_union (&next_nodes, table_nodes,
2315 log_nodes);
2316 if (__glibc_unlikely (*err != REG_NOERROR))
2317 return NULL;
2319 else
2320 next_nodes = *log_nodes;
2321 /* Note: We already add the nodes of the initial state,
2322 then we don't need to add them here. */
2324 context = re_string_context_at (&mctx->input,
2325 re_string_cur_idx (&mctx->input) - 1,
2326 mctx->eflags);
2327 next_state = mctx->state_log[cur_idx]
2328 = re_acquire_state_context (err, dfa, &next_nodes, context);
2329 /* We don't need to check errors here, since the return value of
2330 this function is next_state and ERR is already set. */
2332 if (table_nodes != NULL)
2333 re_node_set_free (&next_nodes);
2336 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2338 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2339 later. We must check them here, since the back references in the
2340 next state might use them. */
2341 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2342 cur_idx);
2343 if (__glibc_unlikely (*err != REG_NOERROR))
2344 return NULL;
2346 /* If the next state has back references. */
2347 if (next_state->has_backref)
2349 *err = transit_state_bkref (mctx, &next_state->nodes);
2350 if (__glibc_unlikely (*err != REG_NOERROR))
2351 return NULL;
2352 next_state = mctx->state_log[cur_idx];
2356 return next_state;
2359 /* Skip bytes in the input that correspond to part of a
2360 multi-byte match, then look in the log for a state
2361 from which to restart matching. */
2362 static re_dfastate_t *
2363 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2365 re_dfastate_t *cur_state;
2368 Idx max = mctx->state_log_top;
2369 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2373 if (++cur_str_idx > max)
2374 return NULL;
2375 re_string_skip_bytes (&mctx->input, 1);
2377 while (mctx->state_log[cur_str_idx] == NULL);
2379 cur_state = merge_state_with_log (err, mctx, NULL);
2381 while (*err == REG_NOERROR && cur_state == NULL);
2382 return cur_state;
2385 /* Helper functions for transit_state. */
2387 /* From the node set CUR_NODES, pick up the nodes whose types are
2388 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2389 expression. And register them to use them later for evaluating the
2390 corresponding back references. */
2392 static reg_errcode_t
2393 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2394 Idx str_idx)
2396 const re_dfa_t *const dfa = mctx->dfa;
2397 Idx node_idx;
2398 reg_errcode_t err;
2400 /* TODO: This isn't efficient.
2401 Because there might be more than one nodes whose types are
2402 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2403 nodes.
2404 E.g. RE: (a){2} */
2405 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2407 Idx node = cur_nodes->elems[node_idx];
2408 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2409 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2410 && (dfa->used_bkref_map
2411 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2413 err = match_ctx_add_subtop (mctx, node, str_idx);
2414 if (__glibc_unlikely (err != REG_NOERROR))
2415 return err;
2418 return REG_NOERROR;
2421 #if 0
2422 /* Return the next state to which the current state STATE will transit by
2423 accepting the current input byte. */
2425 static re_dfastate_t *
2426 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2427 re_dfastate_t *state)
2429 const re_dfa_t *const dfa = mctx->dfa;
2430 re_node_set next_nodes;
2431 re_dfastate_t *next_state;
2432 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2433 unsigned int context;
2435 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2436 if (__glibc_unlikely (*err != REG_NOERROR))
2437 return NULL;
2438 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2440 Idx cur_node = state->nodes.elems[node_cnt];
2441 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2443 *err = re_node_set_merge (&next_nodes,
2444 dfa->eclosures + dfa->nexts[cur_node]);
2445 if (__glibc_unlikely (*err != REG_NOERROR))
2447 re_node_set_free (&next_nodes);
2448 return NULL;
2452 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2453 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2454 /* We don't need to check errors here, since the return value of
2455 this function is next_state and ERR is already set. */
2457 re_node_set_free (&next_nodes);
2458 re_string_skip_bytes (&mctx->input, 1);
2459 return next_state;
2461 #endif
2463 #ifdef RE_ENABLE_I18N
2464 static reg_errcode_t
2465 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2467 const re_dfa_t *const dfa = mctx->dfa;
2468 reg_errcode_t err;
2469 Idx i;
2471 for (i = 0; i < pstate->nodes.nelem; ++i)
2473 re_node_set dest_nodes, *new_nodes;
2474 Idx cur_node_idx = pstate->nodes.elems[i];
2475 int naccepted;
2476 Idx dest_idx;
2477 unsigned int context;
2478 re_dfastate_t *dest_state;
2480 if (!dfa->nodes[cur_node_idx].accept_mb)
2481 continue;
2483 if (dfa->nodes[cur_node_idx].constraint)
2485 context = re_string_context_at (&mctx->input,
2486 re_string_cur_idx (&mctx->input),
2487 mctx->eflags);
2488 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2489 context))
2490 continue;
2493 /* How many bytes the node can accept? */
2494 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2495 re_string_cur_idx (&mctx->input));
2496 if (naccepted == 0)
2497 continue;
2499 /* The node can accepts 'naccepted' bytes. */
2500 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2501 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2502 : mctx->max_mb_elem_len);
2503 err = clean_state_log_if_needed (mctx, dest_idx);
2504 if (__glibc_unlikely (err != REG_NOERROR))
2505 return err;
2506 #ifdef DEBUG
2507 assert (dfa->nexts[cur_node_idx] != -1);
2508 #endif
2509 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2511 dest_state = mctx->state_log[dest_idx];
2512 if (dest_state == NULL)
2513 dest_nodes = *new_nodes;
2514 else
2516 err = re_node_set_init_union (&dest_nodes,
2517 dest_state->entrance_nodes, new_nodes);
2518 if (__glibc_unlikely (err != REG_NOERROR))
2519 return err;
2521 context = re_string_context_at (&mctx->input, dest_idx - 1,
2522 mctx->eflags);
2523 mctx->state_log[dest_idx]
2524 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2525 if (dest_state != NULL)
2526 re_node_set_free (&dest_nodes);
2527 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2528 && err != REG_NOERROR))
2529 return err;
2531 return REG_NOERROR;
2533 #endif /* RE_ENABLE_I18N */
2535 static reg_errcode_t
2536 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2538 const re_dfa_t *const dfa = mctx->dfa;
2539 reg_errcode_t err;
2540 Idx i;
2541 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2543 for (i = 0; i < nodes->nelem; ++i)
2545 Idx dest_str_idx, prev_nelem, bkc_idx;
2546 Idx node_idx = nodes->elems[i];
2547 unsigned int context;
2548 const re_token_t *node = dfa->nodes + node_idx;
2549 re_node_set *new_dest_nodes;
2551 /* Check whether 'node' is a backreference or not. */
2552 if (node->type != OP_BACK_REF)
2553 continue;
2555 if (node->constraint)
2557 context = re_string_context_at (&mctx->input, cur_str_idx,
2558 mctx->eflags);
2559 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2560 continue;
2563 /* 'node' is a backreference.
2564 Check the substring which the substring matched. */
2565 bkc_idx = mctx->nbkref_ents;
2566 err = get_subexp (mctx, node_idx, cur_str_idx);
2567 if (__glibc_unlikely (err != REG_NOERROR))
2568 goto free_return;
2570 /* And add the epsilon closures (which is 'new_dest_nodes') of
2571 the backreference to appropriate state_log. */
2572 #ifdef DEBUG
2573 assert (dfa->nexts[node_idx] != -1);
2574 #endif
2575 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2577 Idx subexp_len;
2578 re_dfastate_t *dest_state;
2579 struct re_backref_cache_entry *bkref_ent;
2580 bkref_ent = mctx->bkref_ents + bkc_idx;
2581 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2582 continue;
2583 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2584 new_dest_nodes = (subexp_len == 0
2585 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2586 : dfa->eclosures + dfa->nexts[node_idx]);
2587 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2588 - bkref_ent->subexp_from);
2589 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2590 mctx->eflags);
2591 dest_state = mctx->state_log[dest_str_idx];
2592 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2593 : mctx->state_log[cur_str_idx]->nodes.nelem);
2594 /* Add 'new_dest_node' to state_log. */
2595 if (dest_state == NULL)
2597 mctx->state_log[dest_str_idx]
2598 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2599 context);
2600 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2601 && err != REG_NOERROR))
2602 goto free_return;
2604 else
2606 re_node_set dest_nodes;
2607 err = re_node_set_init_union (&dest_nodes,
2608 dest_state->entrance_nodes,
2609 new_dest_nodes);
2610 if (__glibc_unlikely (err != REG_NOERROR))
2612 re_node_set_free (&dest_nodes);
2613 goto free_return;
2615 mctx->state_log[dest_str_idx]
2616 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2617 re_node_set_free (&dest_nodes);
2618 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2619 && err != REG_NOERROR))
2620 goto free_return;
2622 /* We need to check recursively if the backreference can epsilon
2623 transit. */
2624 if (subexp_len == 0
2625 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2627 err = check_subexp_matching_top (mctx, new_dest_nodes,
2628 cur_str_idx);
2629 if (__glibc_unlikely (err != REG_NOERROR))
2630 goto free_return;
2631 err = transit_state_bkref (mctx, new_dest_nodes);
2632 if (__glibc_unlikely (err != REG_NOERROR))
2633 goto free_return;
2637 err = REG_NOERROR;
2638 free_return:
2639 return err;
2642 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2643 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2644 Note that we might collect inappropriate candidates here.
2645 However, the cost of checking them strictly here is too high, then we
2646 delay these checking for prune_impossible_nodes(). */
2648 static reg_errcode_t
2649 __attribute_warn_unused_result__
2650 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2652 const re_dfa_t *const dfa = mctx->dfa;
2653 Idx subexp_num, sub_top_idx;
2654 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2655 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2656 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2657 if (cache_idx != -1)
2659 const struct re_backref_cache_entry *entry
2660 = mctx->bkref_ents + cache_idx;
2662 if (entry->node == bkref_node)
2663 return REG_NOERROR; /* We already checked it. */
2664 while (entry++->more);
2667 subexp_num = dfa->nodes[bkref_node].opr.idx;
2669 /* For each sub expression */
2670 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2672 reg_errcode_t err;
2673 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2674 re_sub_match_last_t *sub_last;
2675 Idx sub_last_idx, sl_str, bkref_str_off;
2677 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2678 continue; /* It isn't related. */
2680 sl_str = sub_top->str_idx;
2681 bkref_str_off = bkref_str_idx;
2682 /* At first, check the last node of sub expressions we already
2683 evaluated. */
2684 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2686 regoff_t sl_str_diff;
2687 sub_last = sub_top->lasts[sub_last_idx];
2688 sl_str_diff = sub_last->str_idx - sl_str;
2689 /* The matched string by the sub expression match with the substring
2690 at the back reference? */
2691 if (sl_str_diff > 0)
2693 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2694 > mctx->input.valid_len))
2696 /* Not enough chars for a successful match. */
2697 if (bkref_str_off + sl_str_diff > mctx->input.len)
2698 break;
2700 err = clean_state_log_if_needed (mctx,
2701 bkref_str_off
2702 + sl_str_diff);
2703 if (__glibc_unlikely (err != REG_NOERROR))
2704 return err;
2705 buf = (const char *) re_string_get_buffer (&mctx->input);
2707 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2708 /* We don't need to search this sub expression any more. */
2709 break;
2711 bkref_str_off += sl_str_diff;
2712 sl_str += sl_str_diff;
2713 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2714 bkref_str_idx);
2716 /* Reload buf, since the preceding call might have reallocated
2717 the buffer. */
2718 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 if (err == REG_NOMATCH)
2721 continue;
2722 if (__glibc_unlikely (err != REG_NOERROR))
2723 return err;
2726 if (sub_last_idx < sub_top->nlasts)
2727 continue;
2728 if (sub_last_idx > 0)
2729 ++sl_str;
2730 /* Then, search for the other last nodes of the sub expression. */
2731 for (; sl_str <= bkref_str_idx; ++sl_str)
2733 Idx cls_node;
2734 regoff_t sl_str_off;
2735 const re_node_set *nodes;
2736 sl_str_off = sl_str - sub_top->str_idx;
2737 /* The matched string by the sub expression match with the substring
2738 at the back reference? */
2739 if (sl_str_off > 0)
2741 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2743 /* If we are at the end of the input, we cannot match. */
2744 if (bkref_str_off >= mctx->input.len)
2745 break;
2747 err = extend_buffers (mctx, bkref_str_off + 1);
2748 if (__glibc_unlikely (err != REG_NOERROR))
2749 return err;
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2753 if (buf [bkref_str_off++] != buf[sl_str - 1])
2754 break; /* We don't need to search this sub expression
2755 any more. */
2757 if (mctx->state_log[sl_str] == NULL)
2758 continue;
2759 /* Does this state have a ')' of the sub expression? */
2760 nodes = &mctx->state_log[sl_str]->nodes;
2761 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2762 OP_CLOSE_SUBEXP);
2763 if (cls_node == -1)
2764 continue; /* No. */
2765 if (sub_top->path == NULL)
2767 sub_top->path = calloc (sizeof (state_array_t),
2768 sl_str - sub_top->str_idx + 1);
2769 if (sub_top->path == NULL)
2770 return REG_ESPACE;
2772 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2773 in the current context? */
2774 err = check_arrival (mctx, sub_top->path, sub_top->node,
2775 sub_top->str_idx, cls_node, sl_str,
2776 OP_CLOSE_SUBEXP);
2777 if (err == REG_NOMATCH)
2778 continue;
2779 if (__glibc_unlikely (err != REG_NOERROR))
2780 return err;
2781 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2782 if (__glibc_unlikely (sub_last == NULL))
2783 return REG_ESPACE;
2784 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2785 bkref_str_idx);
2786 if (err == REG_NOMATCH)
2787 continue;
2790 return REG_NOERROR;
2793 /* Helper functions for get_subexp(). */
2795 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2796 If it can arrive, register the sub expression expressed with SUB_TOP
2797 and SUB_LAST. */
2799 static reg_errcode_t
2800 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2801 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2803 reg_errcode_t err;
2804 Idx to_idx;
2805 /* Can the subexpression arrive the back reference? */
2806 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2807 sub_last->str_idx, bkref_node, bkref_str,
2808 OP_OPEN_SUBEXP);
2809 if (err != REG_NOERROR)
2810 return err;
2811 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2812 sub_last->str_idx);
2813 if (__glibc_unlikely (err != REG_NOERROR))
2814 return err;
2815 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2816 return clean_state_log_if_needed (mctx, to_idx);
2819 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2820 Search '(' if FL_OPEN, or search ')' otherwise.
2821 TODO: This function isn't efficient...
2822 Because there might be more than one nodes whose types are
2823 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2824 nodes.
2825 E.g. RE: (a){2} */
2827 static Idx
2828 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2829 Idx subexp_idx, int type)
2831 Idx cls_idx;
2832 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2834 Idx cls_node = nodes->elems[cls_idx];
2835 const re_token_t *node = dfa->nodes + cls_node;
2836 if (node->type == type
2837 && node->opr.idx == subexp_idx)
2838 return cls_node;
2840 return -1;
2843 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2844 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2845 heavily reused.
2846 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2848 static reg_errcode_t
2849 __attribute_warn_unused_result__
2850 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2851 Idx top_str, Idx last_node, Idx last_str, int type)
2853 const re_dfa_t *const dfa = mctx->dfa;
2854 reg_errcode_t err = REG_NOERROR;
2855 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2856 re_dfastate_t *cur_state = NULL;
2857 re_node_set *cur_nodes, next_nodes;
2858 re_dfastate_t **backup_state_log;
2859 unsigned int context;
2861 subexp_num = dfa->nodes[top_node].opr.idx;
2862 /* Extend the buffer if we need. */
2863 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2865 re_dfastate_t **new_array;
2866 Idx old_alloc = path->alloc;
2867 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2868 Idx new_alloc;
2869 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2870 return REG_ESPACE;
2871 new_alloc = old_alloc + incr_alloc;
2872 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2873 return REG_ESPACE;
2874 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2875 if (__glibc_unlikely (new_array == NULL))
2876 return REG_ESPACE;
2877 path->array = new_array;
2878 path->alloc = new_alloc;
2879 memset (new_array + old_alloc, '\0',
2880 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2883 str_idx = path->next_idx ? path->next_idx : top_str;
2885 /* Temporary modify MCTX. */
2886 backup_state_log = mctx->state_log;
2887 backup_cur_idx = mctx->input.cur_idx;
2888 mctx->state_log = path->array;
2889 mctx->input.cur_idx = str_idx;
2891 /* Setup initial node set. */
2892 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2893 if (str_idx == top_str)
2895 err = re_node_set_init_1 (&next_nodes, top_node);
2896 if (__glibc_unlikely (err != REG_NOERROR))
2897 return err;
2898 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2899 if (__glibc_unlikely (err != REG_NOERROR))
2901 re_node_set_free (&next_nodes);
2902 return err;
2905 else
2907 cur_state = mctx->state_log[str_idx];
2908 if (cur_state && cur_state->has_backref)
2910 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2911 if (__glibc_unlikely (err != REG_NOERROR))
2912 return err;
2914 else
2915 re_node_set_init_empty (&next_nodes);
2917 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2919 if (next_nodes.nelem)
2921 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2922 subexp_num, type);
2923 if (__glibc_unlikely (err != REG_NOERROR))
2925 re_node_set_free (&next_nodes);
2926 return err;
2929 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2930 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2932 re_node_set_free (&next_nodes);
2933 return err;
2935 mctx->state_log[str_idx] = cur_state;
2938 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2940 re_node_set_empty (&next_nodes);
2941 if (mctx->state_log[str_idx + 1])
2943 err = re_node_set_merge (&next_nodes,
2944 &mctx->state_log[str_idx + 1]->nodes);
2945 if (__glibc_unlikely (err != REG_NOERROR))
2947 re_node_set_free (&next_nodes);
2948 return err;
2951 if (cur_state)
2953 err = check_arrival_add_next_nodes (mctx, str_idx,
2954 &cur_state->non_eps_nodes,
2955 &next_nodes);
2956 if (__glibc_unlikely (err != REG_NOERROR))
2958 re_node_set_free (&next_nodes);
2959 return err;
2962 ++str_idx;
2963 if (next_nodes.nelem)
2965 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2966 if (__glibc_unlikely (err != REG_NOERROR))
2968 re_node_set_free (&next_nodes);
2969 return err;
2971 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2972 subexp_num, type);
2973 if (__glibc_unlikely (err != REG_NOERROR))
2975 re_node_set_free (&next_nodes);
2976 return err;
2979 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2980 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2981 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2983 re_node_set_free (&next_nodes);
2984 return err;
2986 mctx->state_log[str_idx] = cur_state;
2987 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2989 re_node_set_free (&next_nodes);
2990 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2991 : &mctx->state_log[last_str]->nodes);
2992 path->next_idx = str_idx;
2994 /* Fix MCTX. */
2995 mctx->state_log = backup_state_log;
2996 mctx->input.cur_idx = backup_cur_idx;
2998 /* Then check the current node set has the node LAST_NODE. */
2999 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3000 return REG_NOERROR;
3002 return REG_NOMATCH;
3005 /* Helper functions for check_arrival. */
3007 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3008 to NEXT_NODES.
3009 TODO: This function is similar to the functions transit_state*(),
3010 however this function has many additional works.
3011 Can't we unify them? */
3013 static reg_errcode_t
3014 __attribute_warn_unused_result__
3015 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3016 re_node_set *cur_nodes, re_node_set *next_nodes)
3018 const re_dfa_t *const dfa = mctx->dfa;
3019 bool ok;
3020 Idx cur_idx;
3021 #ifdef RE_ENABLE_I18N
3022 reg_errcode_t err = REG_NOERROR;
3023 #endif
3024 re_node_set union_set;
3025 re_node_set_init_empty (&union_set);
3026 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3028 int naccepted = 0;
3029 Idx cur_node = cur_nodes->elems[cur_idx];
3030 #ifdef DEBUG
3031 re_token_type_t type = dfa->nodes[cur_node].type;
3032 assert (!IS_EPSILON_NODE (type));
3033 #endif
3034 #ifdef RE_ENABLE_I18N
3035 /* If the node may accept "multi byte". */
3036 if (dfa->nodes[cur_node].accept_mb)
3038 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3039 str_idx);
3040 if (naccepted > 1)
3042 re_dfastate_t *dest_state;
3043 Idx next_node = dfa->nexts[cur_node];
3044 Idx next_idx = str_idx + naccepted;
3045 dest_state = mctx->state_log[next_idx];
3046 re_node_set_empty (&union_set);
3047 if (dest_state)
3049 err = re_node_set_merge (&union_set, &dest_state->nodes);
3050 if (__glibc_unlikely (err != REG_NOERROR))
3052 re_node_set_free (&union_set);
3053 return err;
3056 ok = re_node_set_insert (&union_set, next_node);
3057 if (__glibc_unlikely (! ok))
3059 re_node_set_free (&union_set);
3060 return REG_ESPACE;
3062 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3063 &union_set);
3064 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3065 && err != REG_NOERROR))
3067 re_node_set_free (&union_set);
3068 return err;
3072 #endif /* RE_ENABLE_I18N */
3073 if (naccepted
3074 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3076 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3077 if (__glibc_unlikely (! ok))
3079 re_node_set_free (&union_set);
3080 return REG_ESPACE;
3084 re_node_set_free (&union_set);
3085 return REG_NOERROR;
3088 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3089 CUR_NODES, however exclude the nodes which are:
3090 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3091 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3094 static reg_errcode_t
3095 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3096 Idx ex_subexp, int type)
3098 reg_errcode_t err;
3099 Idx idx, outside_node;
3100 re_node_set new_nodes;
3101 #ifdef DEBUG
3102 assert (cur_nodes->nelem);
3103 #endif
3104 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3105 if (__glibc_unlikely (err != REG_NOERROR))
3106 return err;
3107 /* Create a new node set NEW_NODES with the nodes which are epsilon
3108 closures of the node in CUR_NODES. */
3110 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3112 Idx cur_node = cur_nodes->elems[idx];
3113 const re_node_set *eclosure = dfa->eclosures + cur_node;
3114 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3115 if (outside_node == -1)
3117 /* There are no problematic nodes, just merge them. */
3118 err = re_node_set_merge (&new_nodes, eclosure);
3119 if (__glibc_unlikely (err != REG_NOERROR))
3121 re_node_set_free (&new_nodes);
3122 return err;
3125 else
3127 /* There are problematic nodes, re-calculate incrementally. */
3128 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3129 ex_subexp, type);
3130 if (__glibc_unlikely (err != REG_NOERROR))
3132 re_node_set_free (&new_nodes);
3133 return err;
3137 re_node_set_free (cur_nodes);
3138 *cur_nodes = new_nodes;
3139 return REG_NOERROR;
3142 /* Helper function for check_arrival_expand_ecl.
3143 Check incrementally the epsilon closure of TARGET, and if it isn't
3144 problematic append it to DST_NODES. */
3146 static reg_errcode_t
3147 __attribute_warn_unused_result__
3148 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3149 Idx target, Idx ex_subexp, int type)
3151 Idx cur_node;
3152 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3154 bool ok;
3156 if (dfa->nodes[cur_node].type == type
3157 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3159 if (type == OP_CLOSE_SUBEXP)
3161 ok = re_node_set_insert (dst_nodes, cur_node);
3162 if (__glibc_unlikely (! ok))
3163 return REG_ESPACE;
3165 break;
3167 ok = re_node_set_insert (dst_nodes, cur_node);
3168 if (__glibc_unlikely (! ok))
3169 return REG_ESPACE;
3170 if (dfa->edests[cur_node].nelem == 0)
3171 break;
3172 if (dfa->edests[cur_node].nelem == 2)
3174 reg_errcode_t err;
3175 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3176 dfa->edests[cur_node].elems[1],
3177 ex_subexp, type);
3178 if (__glibc_unlikely (err != REG_NOERROR))
3179 return err;
3181 cur_node = dfa->edests[cur_node].elems[0];
3183 return REG_NOERROR;
3187 /* For all the back references in the current state, calculate the
3188 destination of the back references by the appropriate entry
3189 in MCTX->BKREF_ENTS. */
3191 static reg_errcode_t
3192 __attribute_warn_unused_result__
3193 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3194 Idx cur_str, Idx subexp_num, int type)
3196 const re_dfa_t *const dfa = mctx->dfa;
3197 reg_errcode_t err;
3198 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3199 struct re_backref_cache_entry *ent;
3201 if (cache_idx_start == -1)
3202 return REG_NOERROR;
3204 restart:
3205 ent = mctx->bkref_ents + cache_idx_start;
3208 Idx to_idx, next_node;
3210 /* Is this entry ENT is appropriate? */
3211 if (!re_node_set_contains (cur_nodes, ent->node))
3212 continue; /* No. */
3214 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3215 /* Calculate the destination of the back reference, and append it
3216 to MCTX->STATE_LOG. */
3217 if (to_idx == cur_str)
3219 /* The backreference did epsilon transit, we must re-check all the
3220 node in the current state. */
3221 re_node_set new_dests;
3222 reg_errcode_t err2, err3;
3223 next_node = dfa->edests[ent->node].elems[0];
3224 if (re_node_set_contains (cur_nodes, next_node))
3225 continue;
3226 err = re_node_set_init_1 (&new_dests, next_node);
3227 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3228 err3 = re_node_set_merge (cur_nodes, &new_dests);
3229 re_node_set_free (&new_dests);
3230 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3231 || err3 != REG_NOERROR))
3233 err = (err != REG_NOERROR ? err
3234 : (err2 != REG_NOERROR ? err2 : err3));
3235 return err;
3237 /* TODO: It is still inefficient... */
3238 goto restart;
3240 else
3242 re_node_set union_set;
3243 next_node = dfa->nexts[ent->node];
3244 if (mctx->state_log[to_idx])
3246 bool ok;
3247 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3248 next_node))
3249 continue;
3250 err = re_node_set_init_copy (&union_set,
3251 &mctx->state_log[to_idx]->nodes);
3252 ok = re_node_set_insert (&union_set, next_node);
3253 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3255 re_node_set_free (&union_set);
3256 err = err != REG_NOERROR ? err : REG_ESPACE;
3257 return err;
3260 else
3262 err = re_node_set_init_1 (&union_set, next_node);
3263 if (__glibc_unlikely (err != REG_NOERROR))
3264 return err;
3266 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3267 re_node_set_free (&union_set);
3268 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3269 && err != REG_NOERROR))
3270 return err;
3273 while (ent++->more);
3274 return REG_NOERROR;
3277 /* Build transition table for the state.
3278 Return true if successful. */
3280 static bool
3281 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3283 reg_errcode_t err;
3284 Idx i, j;
3285 int ch;
3286 bool need_word_trtable = false;
3287 bitset_word_t elem, mask;
3288 bool dests_node_malloced = false;
3289 bool dest_states_malloced = false;
3290 Idx ndests; /* Number of the destination states from 'state'. */
3291 re_dfastate_t **trtable;
3292 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3293 re_node_set follows, *dests_node;
3294 bitset_t *dests_ch;
3295 bitset_t acceptable;
3297 struct dests_alloc
3299 re_node_set dests_node[SBC_MAX];
3300 bitset_t dests_ch[SBC_MAX];
3301 } *dests_alloc;
3303 /* We build DFA states which corresponds to the destination nodes
3304 from 'state'. 'dests_node[i]' represents the nodes which i-th
3305 destination state contains, and 'dests_ch[i]' represents the
3306 characters which i-th destination state accepts. */
3307 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3308 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3309 else
3311 dests_alloc = re_malloc (struct dests_alloc, 1);
3312 if (__glibc_unlikely (dests_alloc == NULL))
3313 return false;
3314 dests_node_malloced = true;
3316 dests_node = dests_alloc->dests_node;
3317 dests_ch = dests_alloc->dests_ch;
3319 /* Initialize transition table. */
3320 state->word_trtable = state->trtable = NULL;
3322 /* At first, group all nodes belonging to 'state' into several
3323 destinations. */
3324 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3325 if (__glibc_unlikely (ndests <= 0))
3327 if (dests_node_malloced)
3328 re_free (dests_alloc);
3329 /* Return false in case of an error, true otherwise. */
3330 if (ndests == 0)
3332 state->trtable = (re_dfastate_t **)
3333 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3334 if (__glibc_unlikely (state->trtable == NULL))
3335 return false;
3336 return true;
3338 return false;
3341 err = re_node_set_alloc (&follows, ndests + 1);
3342 if (__glibc_unlikely (err != REG_NOERROR))
3343 goto out_free;
3345 /* Avoid arithmetic overflow in size calculation. */
3346 size_t ndests_max
3347 = ((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3348 / (3 * sizeof (re_dfastate_t *)));
3349 if (__glibc_unlikely (ndests_max < ndests))
3350 goto out_free;
3352 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3353 + ndests * 3 * sizeof (re_dfastate_t *)))
3354 dest_states = (re_dfastate_t **)
3355 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3356 else
3358 dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3359 if (__glibc_unlikely (dest_states == NULL))
3361 out_free:
3362 if (dest_states_malloced)
3363 re_free (dest_states);
3364 re_node_set_free (&follows);
3365 for (i = 0; i < ndests; ++i)
3366 re_node_set_free (dests_node + i);
3367 if (dests_node_malloced)
3368 re_free (dests_alloc);
3369 return false;
3371 dest_states_malloced = true;
3373 dest_states_word = dest_states + ndests;
3374 dest_states_nl = dest_states_word + ndests;
3375 bitset_empty (acceptable);
3377 /* Then build the states for all destinations. */
3378 for (i = 0; i < ndests; ++i)
3380 Idx next_node;
3381 re_node_set_empty (&follows);
3382 /* Merge the follows of this destination states. */
3383 for (j = 0; j < dests_node[i].nelem; ++j)
3385 next_node = dfa->nexts[dests_node[i].elems[j]];
3386 if (next_node != -1)
3388 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3389 if (__glibc_unlikely (err != REG_NOERROR))
3390 goto out_free;
3393 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3394 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3395 goto out_free;
3396 /* If the new state has context constraint,
3397 build appropriate states for these contexts. */
3398 if (dest_states[i]->has_constraint)
3400 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3401 CONTEXT_WORD);
3402 if (__glibc_unlikely (dest_states_word[i] == NULL
3403 && err != REG_NOERROR))
3404 goto out_free;
3406 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3407 need_word_trtable = true;
3409 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3410 CONTEXT_NEWLINE);
3411 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3412 goto out_free;
3414 else
3416 dest_states_word[i] = dest_states[i];
3417 dest_states_nl[i] = dest_states[i];
3419 bitset_merge (acceptable, dests_ch[i]);
3422 if (!__glibc_unlikely (need_word_trtable))
3424 /* We don't care about whether the following character is a word
3425 character, or we are in a single-byte character set so we can
3426 discern by looking at the character code: allocate a
3427 256-entry transition table. */
3428 trtable = state->trtable =
3429 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3430 if (__glibc_unlikely (trtable == NULL))
3431 goto out_free;
3433 /* For all characters ch...: */
3434 for (i = 0; i < BITSET_WORDS; ++i)
3435 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3436 elem;
3437 mask <<= 1, elem >>= 1, ++ch)
3438 if (__glibc_unlikely (elem & 1))
3440 /* There must be exactly one destination which accepts
3441 character ch. See group_nodes_into_DFAstates. */
3442 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3445 /* j-th destination accepts the word character ch. */
3446 if (dfa->word_char[i] & mask)
3447 trtable[ch] = dest_states_word[j];
3448 else
3449 trtable[ch] = dest_states[j];
3452 else
3454 /* We care about whether the following character is a word
3455 character, and we are in a multi-byte character set: discern
3456 by looking at the character code: build two 256-entry
3457 transition tables, one starting at trtable[0] and one
3458 starting at trtable[SBC_MAX]. */
3459 trtable = state->word_trtable =
3460 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3461 if (__glibc_unlikely (trtable == NULL))
3462 goto out_free;
3464 /* For all characters ch...: */
3465 for (i = 0; i < BITSET_WORDS; ++i)
3466 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3467 elem;
3468 mask <<= 1, elem >>= 1, ++ch)
3469 if (__glibc_unlikely (elem & 1))
3471 /* There must be exactly one destination which accepts
3472 character ch. See group_nodes_into_DFAstates. */
3473 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476 /* j-th destination accepts the word character ch. */
3477 trtable[ch] = dest_states[j];
3478 trtable[ch + SBC_MAX] = dest_states_word[j];
3482 /* new line */
3483 if (bitset_contain (acceptable, NEWLINE_CHAR))
3485 /* The current state accepts newline character. */
3486 for (j = 0; j < ndests; ++j)
3487 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3489 /* k-th destination accepts newline character. */
3490 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3491 if (need_word_trtable)
3492 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3493 /* There must be only one destination which accepts
3494 newline. See group_nodes_into_DFAstates. */
3495 break;
3499 if (dest_states_malloced)
3500 re_free (dest_states);
3502 re_node_set_free (&follows);
3503 for (i = 0; i < ndests; ++i)
3504 re_node_set_free (dests_node + i);
3506 if (dests_node_malloced)
3507 re_free (dests_alloc);
3509 return true;
3512 /* Group all nodes belonging to STATE into several destinations.
3513 Then for all destinations, set the nodes belonging to the destination
3514 to DESTS_NODE[i] and set the characters accepted by the destination
3515 to DEST_CH[i]. This function return the number of destinations. */
3517 static Idx
3518 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3519 re_node_set *dests_node, bitset_t *dests_ch)
3521 reg_errcode_t err;
3522 bool ok;
3523 Idx i, j, k;
3524 Idx ndests; /* Number of the destinations from 'state'. */
3525 bitset_t accepts; /* Characters a node can accept. */
3526 const re_node_set *cur_nodes = &state->nodes;
3527 bitset_empty (accepts);
3528 ndests = 0;
3530 /* For all the nodes belonging to 'state', */
3531 for (i = 0; i < cur_nodes->nelem; ++i)
3533 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3534 re_token_type_t type = node->type;
3535 unsigned int constraint = node->constraint;
3537 /* Enumerate all single byte character this node can accept. */
3538 if (type == CHARACTER)
3539 bitset_set (accepts, node->opr.c);
3540 else if (type == SIMPLE_BRACKET)
3542 bitset_merge (accepts, node->opr.sbcset);
3544 else if (type == OP_PERIOD)
3546 #ifdef RE_ENABLE_I18N
3547 if (dfa->mb_cur_max > 1)
3548 bitset_merge (accepts, dfa->sb_char);
3549 else
3550 #endif
3551 bitset_set_all (accepts);
3552 if (!(dfa->syntax & RE_DOT_NEWLINE))
3553 bitset_clear (accepts, '\n');
3554 if (dfa->syntax & RE_DOT_NOT_NULL)
3555 bitset_clear (accepts, '\0');
3557 #ifdef RE_ENABLE_I18N
3558 else if (type == OP_UTF8_PERIOD)
3560 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3561 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3562 else
3563 bitset_merge (accepts, utf8_sb_map);
3564 if (!(dfa->syntax & RE_DOT_NEWLINE))
3565 bitset_clear (accepts, '\n');
3566 if (dfa->syntax & RE_DOT_NOT_NULL)
3567 bitset_clear (accepts, '\0');
3569 #endif
3570 else
3571 continue;
3573 /* Check the 'accepts' and sift the characters which are not
3574 match it the context. */
3575 if (constraint)
3577 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3579 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580 bitset_empty (accepts);
3581 if (accepts_newline)
3582 bitset_set (accepts, NEWLINE_CHAR);
3583 else
3584 continue;
3586 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3588 bitset_empty (accepts);
3589 continue;
3592 if (constraint & NEXT_WORD_CONSTRAINT)
3594 bitset_word_t any_set = 0;
3595 if (type == CHARACTER && !node->word_char)
3597 bitset_empty (accepts);
3598 continue;
3600 #ifdef RE_ENABLE_I18N
3601 if (dfa->mb_cur_max > 1)
3602 for (j = 0; j < BITSET_WORDS; ++j)
3603 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604 else
3605 #endif
3606 for (j = 0; j < BITSET_WORDS; ++j)
3607 any_set |= (accepts[j] &= dfa->word_char[j]);
3608 if (!any_set)
3609 continue;
3611 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3613 bitset_word_t any_set = 0;
3614 if (type == CHARACTER && node->word_char)
3616 bitset_empty (accepts);
3617 continue;
3619 #ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 for (j = 0; j < BITSET_WORDS; ++j)
3622 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623 else
3624 #endif
3625 for (j = 0; j < BITSET_WORDS; ++j)
3626 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627 if (!any_set)
3628 continue;
3632 /* Then divide 'accepts' into DFA states, or create a new
3633 state. Above, we make sure that accepts is not empty. */
3634 for (j = 0; j < ndests; ++j)
3636 bitset_t intersec; /* Intersection sets, see below. */
3637 bitset_t remains;
3638 /* Flags, see below. */
3639 bitset_word_t has_intersec, not_subset, not_consumed;
3641 /* Optimization, skip if this state doesn't accept the character. */
3642 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643 continue;
3645 /* Enumerate the intersection set of this state and 'accepts'. */
3646 has_intersec = 0;
3647 for (k = 0; k < BITSET_WORDS; ++k)
3648 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649 /* And skip if the intersection set is empty. */
3650 if (!has_intersec)
3651 continue;
3653 /* Then check if this state is a subset of 'accepts'. */
3654 not_subset = not_consumed = 0;
3655 for (k = 0; k < BITSET_WORDS; ++k)
3657 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3661 /* If this state isn't a subset of 'accepts', create a
3662 new group state, which has the 'remains'. */
3663 if (not_subset)
3665 bitset_copy (dests_ch[ndests], remains);
3666 bitset_copy (dests_ch[j], intersec);
3667 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668 if (__glibc_unlikely (err != REG_NOERROR))
3669 goto error_return;
3670 ++ndests;
3673 /* Put the position in the current group. */
3674 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675 if (__glibc_unlikely (! ok))
3676 goto error_return;
3678 /* If all characters are consumed, go to next node. */
3679 if (!not_consumed)
3680 break;
3682 /* Some characters remain, create a new group. */
3683 if (j == ndests)
3685 bitset_copy (dests_ch[ndests], accepts);
3686 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687 if (__glibc_unlikely (err != REG_NOERROR))
3688 goto error_return;
3689 ++ndests;
3690 bitset_empty (accepts);
3693 return ndests;
3694 error_return:
3695 for (j = 0; j < ndests; ++j)
3696 re_node_set_free (dests_node + j);
3697 return -1;
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3702 Return the number of the bytes the node accepts.
3703 STR_IDX is the current index of the input string.
3705 This function handles the nodes which can accept one character, or
3706 one collating element like '.', '[a-z]', opposite to the other nodes
3707 can only accept one byte. */
3709 # ifdef _LIBC
3710 # include <locale/weight.h>
3711 # endif
3713 static int
3714 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3715 const re_string_t *input, Idx str_idx)
3717 const re_token_t *node = dfa->nodes + node_idx;
3718 int char_len, elem_len;
3719 Idx i;
3721 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3723 unsigned char c = re_string_byte_at (input, str_idx), d;
3724 if (__glibc_likely (c < 0xc2))
3725 return 0;
3727 if (str_idx + 2 > input->len)
3728 return 0;
3730 d = re_string_byte_at (input, str_idx + 1);
3731 if (c < 0xe0)
3732 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3733 else if (c < 0xf0)
3735 char_len = 3;
3736 if (c == 0xe0 && d < 0xa0)
3737 return 0;
3739 else if (c < 0xf8)
3741 char_len = 4;
3742 if (c == 0xf0 && d < 0x90)
3743 return 0;
3745 else if (c < 0xfc)
3747 char_len = 5;
3748 if (c == 0xf8 && d < 0x88)
3749 return 0;
3751 else if (c < 0xfe)
3753 char_len = 6;
3754 if (c == 0xfc && d < 0x84)
3755 return 0;
3757 else
3758 return 0;
3760 if (str_idx + char_len > input->len)
3761 return 0;
3763 for (i = 1; i < char_len; ++i)
3765 d = re_string_byte_at (input, str_idx + i);
3766 if (d < 0x80 || d > 0xbf)
3767 return 0;
3769 return char_len;
3772 char_len = re_string_char_size_at (input, str_idx);
3773 if (node->type == OP_PERIOD)
3775 if (char_len <= 1)
3776 return 0;
3777 /* FIXME: I don't think this if is needed, as both '\n'
3778 and '\0' are char_len == 1. */
3779 /* '.' accepts any one character except the following two cases. */
3780 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3781 re_string_byte_at (input, str_idx) == '\n') ||
3782 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3783 re_string_byte_at (input, str_idx) == '\0'))
3784 return 0;
3785 return char_len;
3788 elem_len = re_string_elem_size_at (input, str_idx);
3789 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3790 return 0;
3792 if (node->type == COMPLEX_BRACKET)
3794 const re_charset_t *cset = node->opr.mbcset;
3795 # ifdef _LIBC
3796 const unsigned char *pin
3797 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3798 Idx j;
3799 uint32_t nrules;
3800 # endif /* _LIBC */
3801 int match_len = 0;
3802 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3803 ? re_string_wchar_at (input, str_idx) : 0);
3805 /* match with multibyte character? */
3806 for (i = 0; i < cset->nmbchars; ++i)
3807 if (wc == cset->mbchars[i])
3809 match_len = char_len;
3810 goto check_node_accept_bytes_match;
3812 /* match with character_class? */
3813 for (i = 0; i < cset->nchar_classes; ++i)
3815 wctype_t wt = cset->char_classes[i];
3816 if (__iswctype (wc, wt))
3818 match_len = char_len;
3819 goto check_node_accept_bytes_match;
3823 # ifdef _LIBC
3824 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3825 if (nrules != 0)
3827 unsigned int in_collseq = 0;
3828 const int32_t *table, *indirect;
3829 const unsigned char *weights, *extra;
3830 const char *collseqwc;
3832 /* match with collating_symbol? */
3833 if (cset->ncoll_syms)
3834 extra = (const unsigned char *)
3835 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3836 for (i = 0; i < cset->ncoll_syms; ++i)
3838 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3839 /* Compare the length of input collating element and
3840 the length of current collating element. */
3841 if (*coll_sym != elem_len)
3842 continue;
3843 /* Compare each bytes. */
3844 for (j = 0; j < *coll_sym; j++)
3845 if (pin[j] != coll_sym[1 + j])
3846 break;
3847 if (j == *coll_sym)
3849 /* Match if every bytes is equal. */
3850 match_len = j;
3851 goto check_node_accept_bytes_match;
3855 if (cset->nranges)
3857 if (elem_len <= char_len)
3859 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3860 in_collseq = __collseq_table_lookup (collseqwc, wc);
3862 else
3863 in_collseq = find_collation_sequence_value (pin, elem_len);
3865 /* match with range expression? */
3866 /* FIXME: Implement rational ranges here, too. */
3867 for (i = 0; i < cset->nranges; ++i)
3868 if (cset->range_starts[i] <= in_collseq
3869 && in_collseq <= cset->range_ends[i])
3871 match_len = elem_len;
3872 goto check_node_accept_bytes_match;
3875 /* match with equivalence_class? */
3876 if (cset->nequiv_classes)
3878 const unsigned char *cp = pin;
3879 table = (const int32_t *)
3880 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3881 weights = (const unsigned char *)
3882 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3883 extra = (const unsigned char *)
3884 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3885 indirect = (const int32_t *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3887 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3888 int32_t rule = idx >> 24;
3889 idx &= 0xffffff;
3890 if (idx > 0)
3892 size_t weight_len = weights[idx];
3893 for (i = 0; i < cset->nequiv_classes; ++i)
3895 int32_t equiv_class_idx = cset->equiv_classes[i];
3896 int32_t equiv_class_rule = equiv_class_idx >> 24;
3897 equiv_class_idx &= 0xffffff;
3898 if (weights[equiv_class_idx] == weight_len
3899 && equiv_class_rule == rule
3900 && memcmp (weights + idx + 1,
3901 weights + equiv_class_idx + 1,
3902 weight_len) == 0)
3904 match_len = elem_len;
3905 goto check_node_accept_bytes_match;
3911 else
3912 # endif /* _LIBC */
3914 /* match with range expression? */
3915 for (i = 0; i < cset->nranges; ++i)
3917 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3919 match_len = char_len;
3920 goto check_node_accept_bytes_match;
3924 check_node_accept_bytes_match:
3925 if (!cset->non_match)
3926 return match_len;
3927 else
3929 if (match_len > 0)
3930 return 0;
3931 else
3932 return (elem_len > char_len) ? elem_len : char_len;
3935 return 0;
3938 # ifdef _LIBC
3939 static unsigned int
3940 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3942 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3943 if (nrules == 0)
3945 if (mbs_len == 1)
3947 /* No valid character. Match it as a single byte character. */
3948 const unsigned char *collseq = (const unsigned char *)
3949 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3950 return collseq[mbs[0]];
3952 return UINT_MAX;
3954 else
3956 int32_t idx;
3957 const unsigned char *extra = (const unsigned char *)
3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3959 int32_t extrasize = (const unsigned char *)
3960 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3962 for (idx = 0; idx < extrasize;)
3964 int mbs_cnt;
3965 bool found = false;
3966 int32_t elem_mbs_len;
3967 /* Skip the name of collating element name. */
3968 idx = idx + extra[idx] + 1;
3969 elem_mbs_len = extra[idx++];
3970 if (mbs_len == elem_mbs_len)
3972 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3973 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3974 break;
3975 if (mbs_cnt == elem_mbs_len)
3976 /* Found the entry. */
3977 found = true;
3979 /* Skip the byte sequence of the collating element. */
3980 idx += elem_mbs_len;
3981 /* Adjust for the alignment. */
3982 idx = (idx + 3) & ~3;
3983 /* Skip the collation sequence value. */
3984 idx += sizeof (uint32_t);
3985 /* Skip the wide char sequence of the collating element. */
3986 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3987 /* If we found the entry, return the sequence value. */
3988 if (found)
3989 return *(uint32_t *) (extra + idx);
3990 /* Skip the collation sequence value. */
3991 idx += sizeof (uint32_t);
3993 return UINT_MAX;
3996 # endif /* _LIBC */
3997 #endif /* RE_ENABLE_I18N */
3999 /* Check whether the node accepts the byte which is IDX-th
4000 byte of the INPUT. */
4002 static bool
4003 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4004 Idx idx)
4006 unsigned char ch;
4007 ch = re_string_byte_at (&mctx->input, idx);
4008 switch (node->type)
4010 case CHARACTER:
4011 if (node->opr.c != ch)
4012 return false;
4013 break;
4015 case SIMPLE_BRACKET:
4016 if (!bitset_contain (node->opr.sbcset, ch))
4017 return false;
4018 break;
4020 #ifdef RE_ENABLE_I18N
4021 case OP_UTF8_PERIOD:
4022 if (ch >= ASCII_CHARS)
4023 return false;
4024 FALLTHROUGH;
4025 #endif
4026 case OP_PERIOD:
4027 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4028 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4029 return false;
4030 break;
4032 default:
4033 return false;
4036 if (node->constraint)
4038 /* The node has constraints. Check whether the current context
4039 satisfies the constraints. */
4040 unsigned int context = re_string_context_at (&mctx->input, idx,
4041 mctx->eflags);
4042 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4043 return false;
4046 return true;
4049 /* Extend the buffers, if the buffers have run out. */
4051 static reg_errcode_t
4052 __attribute_warn_unused_result__
4053 extend_buffers (re_match_context_t *mctx, int min_len)
4055 reg_errcode_t ret;
4056 re_string_t *pstr = &mctx->input;
4058 /* Avoid overflow. */
4059 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4060 <= pstr->bufs_len))
4061 return REG_ESPACE;
4063 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4064 ret = re_string_realloc_buffers (pstr,
4065 MAX (min_len,
4066 MIN (pstr->len, pstr->bufs_len * 2)));
4067 if (__glibc_unlikely (ret != REG_NOERROR))
4068 return ret;
4070 if (mctx->state_log != NULL)
4072 /* And double the length of state_log. */
4073 /* XXX We have no indication of the size of this buffer. If this
4074 allocation fail we have no indication that the state_log array
4075 does not have the right size. */
4076 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4077 pstr->bufs_len + 1);
4078 if (__glibc_unlikely (new_array == NULL))
4079 return REG_ESPACE;
4080 mctx->state_log = new_array;
4083 /* Then reconstruct the buffers. */
4084 if (pstr->icase)
4086 #ifdef RE_ENABLE_I18N
4087 if (pstr->mb_cur_max > 1)
4089 ret = build_wcs_upper_buffer (pstr);
4090 if (__glibc_unlikely (ret != REG_NOERROR))
4091 return ret;
4093 else
4094 #endif /* RE_ENABLE_I18N */
4095 build_upper_buffer (pstr);
4097 else
4099 #ifdef RE_ENABLE_I18N
4100 if (pstr->mb_cur_max > 1)
4101 build_wcs_buffer (pstr);
4102 else
4103 #endif /* RE_ENABLE_I18N */
4105 if (pstr->trans != NULL)
4106 re_string_translate_buffer (pstr);
4109 return REG_NOERROR;
4113 /* Functions for matching context. */
4115 /* Initialize MCTX. */
4117 static reg_errcode_t
4118 __attribute_warn_unused_result__
4119 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4121 mctx->eflags = eflags;
4122 mctx->match_last = -1;
4123 if (n > 0)
4125 /* Avoid overflow. */
4126 size_t max_object_size =
4127 MAX (sizeof (struct re_backref_cache_entry),
4128 sizeof (re_sub_match_top_t *));
4129 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4130 return REG_ESPACE;
4132 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4133 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4134 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4135 return REG_ESPACE;
4137 /* Already zero-ed by the caller.
4138 else
4139 mctx->bkref_ents = NULL;
4140 mctx->nbkref_ents = 0;
4141 mctx->nsub_tops = 0; */
4142 mctx->abkref_ents = n;
4143 mctx->max_mb_elem_len = 1;
4144 mctx->asub_tops = n;
4145 return REG_NOERROR;
4148 /* Clean the entries which depend on the current input in MCTX.
4149 This function must be invoked when the matcher changes the start index
4150 of the input, or changes the input string. */
4152 static void
4153 match_ctx_clean (re_match_context_t *mctx)
4155 Idx st_idx;
4156 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4158 Idx sl_idx;
4159 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4160 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4162 re_sub_match_last_t *last = top->lasts[sl_idx];
4163 re_free (last->path.array);
4164 re_free (last);
4166 re_free (top->lasts);
4167 if (top->path)
4169 re_free (top->path->array);
4170 re_free (top->path);
4172 re_free (top);
4175 mctx->nsub_tops = 0;
4176 mctx->nbkref_ents = 0;
4179 /* Free all the memory associated with MCTX. */
4181 static void
4182 match_ctx_free (re_match_context_t *mctx)
4184 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4185 match_ctx_clean (mctx);
4186 re_free (mctx->sub_tops);
4187 re_free (mctx->bkref_ents);
4190 /* Add a new backreference entry to MCTX.
4191 Note that we assume that caller never call this function with duplicate
4192 entry, and call with STR_IDX which isn't smaller than any existing entry.
4195 static reg_errcode_t
4196 __attribute_warn_unused_result__
4197 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4198 Idx to)
4200 if (mctx->nbkref_ents >= mctx->abkref_ents)
4202 struct re_backref_cache_entry* new_entry;
4203 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4204 mctx->abkref_ents * 2);
4205 if (__glibc_unlikely (new_entry == NULL))
4207 re_free (mctx->bkref_ents);
4208 return REG_ESPACE;
4210 mctx->bkref_ents = new_entry;
4211 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4212 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4213 mctx->abkref_ents *= 2;
4215 if (mctx->nbkref_ents > 0
4216 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4217 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4219 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4220 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4221 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4222 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4224 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4225 If bit N is clear, means that this entry won't epsilon-transition to
4226 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4227 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4228 such node.
4230 A backreference does not epsilon-transition unless it is empty, so set
4231 to all zeros if FROM != TO. */
4232 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4233 = (from == to ? -1 : 0);
4235 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4236 if (mctx->max_mb_elem_len < to - from)
4237 mctx->max_mb_elem_len = to - from;
4238 return REG_NOERROR;
4241 /* Return the first entry with the same str_idx, or -1 if none is
4242 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4244 static Idx
4245 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4247 Idx left, right, mid, last;
4248 last = right = mctx->nbkref_ents;
4249 for (left = 0; left < right;)
4251 mid = (left + right) / 2;
4252 if (mctx->bkref_ents[mid].str_idx < str_idx)
4253 left = mid + 1;
4254 else
4255 right = mid;
4257 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4258 return left;
4259 else
4260 return -1;
4263 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4264 at STR_IDX. */
4266 static reg_errcode_t
4267 __attribute_warn_unused_result__
4268 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4270 #ifdef DEBUG
4271 assert (mctx->sub_tops != NULL);
4272 assert (mctx->asub_tops > 0);
4273 #endif
4274 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4276 Idx new_asub_tops = mctx->asub_tops * 2;
4277 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4278 re_sub_match_top_t *,
4279 new_asub_tops);
4280 if (__glibc_unlikely (new_array == NULL))
4281 return REG_ESPACE;
4282 mctx->sub_tops = new_array;
4283 mctx->asub_tops = new_asub_tops;
4285 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4286 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4287 return REG_ESPACE;
4288 mctx->sub_tops[mctx->nsub_tops]->node = node;
4289 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4290 return REG_NOERROR;
4293 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4294 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4296 static re_sub_match_last_t *
4297 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4299 re_sub_match_last_t *new_entry;
4300 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4302 Idx new_alasts = 2 * subtop->alasts + 1;
4303 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4304 re_sub_match_last_t *,
4305 new_alasts);
4306 if (__glibc_unlikely (new_array == NULL))
4307 return NULL;
4308 subtop->lasts = new_array;
4309 subtop->alasts = new_alasts;
4311 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4312 if (__glibc_likely (new_entry != NULL))
4314 subtop->lasts[subtop->nlasts] = new_entry;
4315 new_entry->node = node;
4316 new_entry->str_idx = str_idx;
4317 ++subtop->nlasts;
4319 return new_entry;
4322 static void
4323 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4324 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4326 sctx->sifted_states = sifted_sts;
4327 sctx->limited_states = limited_sts;
4328 sctx->last_node = last_node;
4329 sctx->last_str_idx = last_str_idx;
4330 re_node_set_init_empty (&sctx->limits);