sys_stat: Fix 'implicit declaration of function' warning on OS/2 kLIBC.
[gnulib.git] / lib / regexec.c
blob91d5a797b82e2679ceab74238416de06693e46ea
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
175 /* Entry point for POSIX code. */
177 /* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
189 We return 0 if we find a match and REG_NOMATCH if not. */
192 regexec (const regex_t *__restrict preg, const char *__restrict string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
195 reg_errcode_t err;
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
202 if (eflags & REG_STARTEND)
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
207 else
209 start = 0;
210 length = strlen (string);
213 lock_lock (dfa->lock);
214 if (preg->no_sub)
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
217 else
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
221 return err != REG_NOERROR;
224 #ifdef _LIBC
225 libc_hidden_def (__regexec)
227 # include <shlib-compat.h>
228 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231 __typeof__ (__regexec) __compat_regexec;
234 attribute_compat_text_section
235 __compat_regexec (const regex_t *__restrict preg,
236 const char *__restrict string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
242 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243 # endif
244 #endif
246 /* Entry points for GNU code. */
248 /* re_match, re_search, re_match_2, re_search_2
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267 and all groups is stored in REGS. (For the "_2" variants, the offsets are
268 computed relative to the concatenation, not relative to the individual
269 strings.)
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
275 regoff_t
276 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 #ifdef _LIBC
282 weak_alias (__re_match, re_match)
283 #endif
285 regoff_t
286 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
292 #ifdef _LIBC
293 weak_alias (__re_search, re_search)
294 #endif
296 regoff_t
297 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
302 start, 0, regs, stop, true);
304 #ifdef _LIBC
305 weak_alias (__re_match_2, re_match_2)
306 #endif
308 regoff_t
309 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, range, regs, stop, false);
316 #ifdef _LIBC
317 weak_alias (__re_search_2, re_search_2)
318 #endif
320 static regoff_t
321 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
326 const char *str;
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
331 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len))))
333 return -2;
335 /* Concatenate the strings. */
336 if (length2 > 0)
337 if (length1 > 0)
339 s = re_malloc (char, len);
341 if (__glibc_unlikely (s == NULL))
342 return -2;
343 #ifdef _LIBC
344 memcpy (__mempcpy (s, string1, length1), string2, length2);
345 #else
346 memcpy (s, string1, length1);
347 memcpy (s + length1, string2, length2);
348 #endif
349 str = s;
351 else
352 str = string2;
353 else
354 str = string1;
356 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
357 ret_len);
358 re_free (s);
359 return rval;
362 /* The parameters have the same meaning as those of re_search.
363 Additional parameters:
364 If RET_LEN is true the length of the match is returned (re_match style);
365 otherwise the position of the match is returned. */
367 static regoff_t
368 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
369 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
370 bool ret_len)
372 reg_errcode_t result;
373 regmatch_t *pmatch;
374 Idx nregs;
375 regoff_t rval;
376 int eflags = 0;
377 re_dfa_t *dfa = bufp->buffer;
378 Idx last_start = start + range;
380 /* Check for out-of-range. */
381 if (__glibc_unlikely (start < 0 || start > length))
382 return -1;
383 if (__glibc_unlikely (length < last_start
384 || (0 <= range && last_start < start)))
385 last_start = length;
386 else if (__glibc_unlikely (last_start < 0
387 || (range < 0 && start <= last_start)))
388 last_start = 0;
390 lock_lock (dfa->lock);
392 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
393 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395 /* Compile fastmap if we haven't yet. */
396 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
397 re_compile_fastmap (bufp);
399 if (__glibc_unlikely (bufp->no_sub))
400 regs = NULL;
402 /* We need at least 1 register. */
403 if (regs == NULL)
404 nregs = 1;
405 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
406 && regs->num_regs <= bufp->re_nsub))
408 nregs = regs->num_regs;
409 if (__glibc_unlikely (nregs < 1))
411 /* Nothing can be copied to regs. */
412 regs = NULL;
413 nregs = 1;
416 else
417 nregs = bufp->re_nsub + 1;
418 pmatch = re_malloc (regmatch_t, nregs);
419 if (__glibc_unlikely (pmatch == NULL))
421 rval = -2;
422 goto out;
425 result = re_search_internal (bufp, string, length, start, last_start, stop,
426 nregs, pmatch, eflags);
428 rval = 0;
430 /* I hope we needn't fill their regs with -1's when no match was found. */
431 if (result != REG_NOERROR)
432 rval = result == REG_NOMATCH ? -1 : -2;
433 else if (regs != NULL)
435 /* If caller wants register contents data back, copy them. */
436 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
437 bufp->regs_allocated);
438 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
439 rval = -2;
442 if (__glibc_likely (rval == 0))
444 if (ret_len)
446 assert (pmatch[0].rm_so == start);
447 rval = pmatch[0].rm_eo - start;
449 else
450 rval = pmatch[0].rm_so;
452 re_free (pmatch);
453 out:
454 lock_unlock (dfa->lock);
455 return rval;
458 static unsigned
459 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
460 int regs_allocated)
462 int rval = REGS_REALLOCATE;
463 Idx i;
464 Idx need_regs = nregs + 1;
465 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
466 uses. */
468 /* Have the register data arrays been allocated? */
469 if (regs_allocated == REGS_UNALLOCATED)
470 { /* No. So allocate them with malloc. */
471 regs->start = re_malloc (regoff_t, need_regs);
472 if (__glibc_unlikely (regs->start == NULL))
473 return REGS_UNALLOCATED;
474 regs->end = re_malloc (regoff_t, need_regs);
475 if (__glibc_unlikely (regs->end == NULL))
477 re_free (regs->start);
478 return REGS_UNALLOCATED;
480 regs->num_regs = need_regs;
482 else if (regs_allocated == REGS_REALLOCATE)
483 { /* Yes. If we need more elements than were already
484 allocated, reallocate them. If we need fewer, just
485 leave it alone. */
486 if (__glibc_unlikely (need_regs > regs->num_regs))
488 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
489 regoff_t *new_end;
490 if (__glibc_unlikely (new_start == NULL))
491 return REGS_UNALLOCATED;
492 new_end = re_realloc (regs->end, regoff_t, need_regs);
493 if (__glibc_unlikely (new_end == NULL))
495 re_free (new_start);
496 return REGS_UNALLOCATED;
498 regs->start = new_start;
499 regs->end = new_end;
500 regs->num_regs = need_regs;
503 else
505 assert (regs_allocated == REGS_FIXED);
506 /* This function may not be called with REGS_FIXED and nregs too big. */
507 assert (regs->num_regs >= nregs);
508 rval = REGS_FIXED;
511 /* Copy the regs. */
512 for (i = 0; i < nregs; ++i)
514 regs->start[i] = pmatch[i].rm_so;
515 regs->end[i] = pmatch[i].rm_eo;
517 for ( ; i < regs->num_regs; ++i)
518 regs->start[i] = regs->end[i] = -1;
520 return rval;
523 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
524 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
525 this memory for recording register information. STARTS and ENDS
526 must be allocated using the malloc library routine, and must each
527 be at least NUM_REGS * sizeof (regoff_t) bytes long.
529 If NUM_REGS == 0, then subsequent matches should allocate their own
530 register data.
532 Unless this function is called, the first search or match using
533 PATTERN_BUFFER will allocate its own register data, without
534 freeing the old data. */
536 void
537 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
538 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 if (num_regs)
542 bufp->regs_allocated = REGS_REALLOCATE;
543 regs->num_regs = num_regs;
544 regs->start = starts;
545 regs->end = ends;
547 else
549 bufp->regs_allocated = REGS_UNALLOCATED;
550 regs->num_regs = 0;
551 regs->start = regs->end = NULL;
554 #ifdef _LIBC
555 weak_alias (__re_set_registers, re_set_registers)
556 #endif
558 /* Entry points compatible with 4.2 BSD regex library. We don't define
559 them unless specifically requested. */
561 #if defined _REGEX_RE_COMP || defined _LIBC
563 # ifdef _LIBC
564 weak_function
565 # endif
566 re_exec (const char *s)
568 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 #endif /* _REGEX_RE_COMP */
572 /* Internal entry point. */
574 /* Searches for a compiled pattern PREG in the string STRING, whose
575 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
576 meaning as with regexec. LAST_START is START + RANGE, where
577 START and RANGE have the same meaning as with re_search.
578 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
579 otherwise return the error code.
580 Note: We assume front end functions already check ranges.
581 (0 <= LAST_START && LAST_START <= LENGTH) */
583 static reg_errcode_t
584 __attribute_warn_unused_result__
585 re_search_internal (const regex_t *preg, const char *string, Idx length,
586 Idx start, Idx last_start, Idx stop, size_t nmatch,
587 regmatch_t pmatch[], int eflags)
589 reg_errcode_t err;
590 const re_dfa_t *dfa = preg->buffer;
591 Idx left_lim, right_lim;
592 int incr;
593 bool fl_longest_match;
594 int match_kind;
595 Idx match_first;
596 Idx match_last = -1;
597 Idx extra_nmatch;
598 bool sb;
599 int ch;
600 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
601 re_match_context_t mctx = { .dfa = dfa };
602 #else
603 re_match_context_t mctx;
604 #endif
605 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
606 && start != last_start && !preg->can_be_null)
607 ? preg->fastmap : NULL);
608 RE_TRANSLATE_TYPE t = preg->translate;
610 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
611 memset (&mctx, '\0', sizeof (re_match_context_t));
612 mctx.dfa = dfa;
613 #endif
615 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
616 nmatch -= extra_nmatch;
618 /* Check if the DFA haven't been compiled. */
619 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
620 || dfa->init_state_word == NULL
621 || dfa->init_state_nl == NULL
622 || dfa->init_state_begbuf == NULL))
623 return REG_NOMATCH;
625 #ifdef DEBUG
626 /* We assume front-end functions already check them. */
627 assert (0 <= last_start && last_start <= length);
628 #endif
630 /* If initial states with non-begbuf contexts have no elements,
631 the regex must be anchored. If preg->newline_anchor is set,
632 we'll never use init_state_nl, so do not check it. */
633 if (dfa->init_state->nodes.nelem == 0
634 && dfa->init_state_word->nodes.nelem == 0
635 && (dfa->init_state_nl->nodes.nelem == 0
636 || !preg->newline_anchor))
638 if (start != 0 && last_start != 0)
639 return REG_NOMATCH;
640 start = last_start = 0;
643 /* We must check the longest matching, if nmatch > 0. */
644 fl_longest_match = (nmatch != 0 || dfa->nbackref);
646 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
647 preg->translate, (preg->syntax & RE_ICASE) != 0,
648 dfa);
649 if (__glibc_unlikely (err != REG_NOERROR))
650 goto free_return;
651 mctx.input.stop = stop;
652 mctx.input.raw_stop = stop;
653 mctx.input.newline_anchor = preg->newline_anchor;
655 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
656 if (__glibc_unlikely (err != REG_NOERROR))
657 goto free_return;
659 /* We will log all the DFA states through which the dfa pass,
660 if nmatch > 1, or this dfa has "multibyte node", which is a
661 back-reference or a node which can accept multibyte character or
662 multi character collating element. */
663 if (nmatch > 1 || dfa->has_mb_node)
665 /* Avoid overflow. */
666 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
667 <= mctx.input.bufs_len)))
669 err = REG_ESPACE;
670 goto free_return;
673 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
674 if (__glibc_unlikely (mctx.state_log == NULL))
676 err = REG_ESPACE;
677 goto free_return;
680 else
681 mctx.state_log = NULL;
683 match_first = start;
684 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
685 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
687 /* Check incrementally whether the input string matches. */
688 incr = (last_start < start) ? -1 : 1;
689 left_lim = (last_start < start) ? last_start : start;
690 right_lim = (last_start < start) ? start : last_start;
691 sb = dfa->mb_cur_max == 1;
692 match_kind =
693 (fastmap
694 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
695 | (start <= last_start ? 2 : 0)
696 | (t != NULL ? 1 : 0))
697 : 8);
699 for (;; match_first += incr)
701 err = REG_NOMATCH;
702 if (match_first < left_lim || right_lim < match_first)
703 goto free_return;
705 /* Advance as rapidly as possible through the string, until we
706 find a plausible place to start matching. This may be done
707 with varying efficiency, so there are various possibilities:
708 only the most common of them are specialized, in order to
709 save on code size. We use a switch statement for speed. */
710 switch (match_kind)
712 case 8:
713 /* No fastmap. */
714 break;
716 case 7:
717 /* Fastmap with single-byte translation, match forward. */
718 while (__glibc_likely (match_first < right_lim)
719 && !fastmap[t[(unsigned char) string[match_first]]])
720 ++match_first;
721 goto forward_match_found_start_or_reached_end;
723 case 6:
724 /* Fastmap without translation, match forward. */
725 while (__glibc_likely (match_first < right_lim)
726 && !fastmap[(unsigned char) string[match_first]])
727 ++match_first;
729 forward_match_found_start_or_reached_end:
730 if (__glibc_unlikely (match_first == right_lim))
732 ch = match_first >= length
733 ? 0 : (unsigned char) string[match_first];
734 if (!fastmap[t ? t[ch] : ch])
735 goto free_return;
737 break;
739 case 4:
740 case 5:
741 /* Fastmap without multi-byte translation, match backwards. */
742 while (match_first >= left_lim)
744 ch = match_first >= length
745 ? 0 : (unsigned char) string[match_first];
746 if (fastmap[t ? t[ch] : ch])
747 break;
748 --match_first;
750 if (match_first < left_lim)
751 goto free_return;
752 break;
754 default:
755 /* In this case, we can't determine easily the current byte,
756 since it might be a component byte of a multibyte
757 character. Then we use the constructed buffer instead. */
758 for (;;)
760 /* If MATCH_FIRST is out of the valid range, reconstruct the
761 buffers. */
762 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
763 if (__glibc_unlikely (offset
764 >= (__re_size_t) mctx.input.valid_raw_len))
766 err = re_string_reconstruct (&mctx.input, match_first,
767 eflags);
768 if (__glibc_unlikely (err != REG_NOERROR))
769 goto free_return;
771 offset = match_first - mctx.input.raw_mbs_idx;
773 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
774 Note that MATCH_FIRST must not be smaller than 0. */
775 ch = (match_first >= length
776 ? 0 : re_string_byte_at (&mctx.input, offset));
777 if (fastmap[ch])
778 break;
779 match_first += incr;
780 if (match_first < left_lim || match_first > right_lim)
782 err = REG_NOMATCH;
783 goto free_return;
786 break;
789 /* Reconstruct the buffers so that the matcher can assume that
790 the matching starts from the beginning of the buffer. */
791 err = re_string_reconstruct (&mctx.input, match_first, eflags);
792 if (__glibc_unlikely (err != REG_NOERROR))
793 goto free_return;
795 #ifdef RE_ENABLE_I18N
796 /* Don't consider this char as a possible match start if it part,
797 yet isn't the head, of a multibyte character. */
798 if (!sb && !re_string_first_byte (&mctx.input, 0))
799 continue;
800 #endif
802 /* It seems to be appropriate one, then use the matcher. */
803 /* We assume that the matching starts from 0. */
804 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
805 match_last = check_matching (&mctx, fl_longest_match,
806 start <= last_start ? &match_first : NULL);
807 if (match_last != -1)
809 if (__glibc_unlikely (match_last == -2))
811 err = REG_ESPACE;
812 goto free_return;
814 else
816 mctx.match_last = match_last;
817 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
819 re_dfastate_t *pstate = mctx.state_log[match_last];
820 mctx.last_node = check_halt_state_context (&mctx, pstate,
821 match_last);
823 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
824 || dfa->nbackref)
826 err = prune_impossible_nodes (&mctx);
827 if (err == REG_NOERROR)
828 break;
829 if (__glibc_unlikely (err != REG_NOMATCH))
830 goto free_return;
831 match_last = -1;
833 else
834 break; /* We found a match. */
838 match_ctx_clean (&mctx);
841 #ifdef DEBUG
842 assert (match_last != -1);
843 assert (err == REG_NOERROR);
844 #endif
846 /* Set pmatch[] if we need. */
847 if (nmatch > 0)
849 Idx reg_idx;
851 /* Initialize registers. */
852 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
853 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
855 /* Set the points where matching start/end. */
856 pmatch[0].rm_so = 0;
857 pmatch[0].rm_eo = mctx.match_last;
858 /* FIXME: This function should fail if mctx.match_last exceeds
859 the maximum possible regoff_t value. We need a new error
860 code REG_OVERFLOW. */
862 if (!preg->no_sub && nmatch > 1)
864 err = set_regs (preg, &mctx, nmatch, pmatch,
865 dfa->has_plural_match && dfa->nbackref > 0);
866 if (__glibc_unlikely (err != REG_NOERROR))
867 goto free_return;
870 /* At last, add the offset to each register, since we slid
871 the buffers so that we could assume that the matching starts
872 from 0. */
873 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
874 if (pmatch[reg_idx].rm_so != -1)
876 #ifdef RE_ENABLE_I18N
877 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
879 pmatch[reg_idx].rm_so =
880 (pmatch[reg_idx].rm_so == mctx.input.valid_len
881 ? mctx.input.valid_raw_len
882 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
883 pmatch[reg_idx].rm_eo =
884 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
885 ? mctx.input.valid_raw_len
886 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
888 #else
889 assert (mctx.input.offsets_needed == 0);
890 #endif
891 pmatch[reg_idx].rm_so += match_first;
892 pmatch[reg_idx].rm_eo += match_first;
894 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
896 pmatch[nmatch + reg_idx].rm_so = -1;
897 pmatch[nmatch + reg_idx].rm_eo = -1;
900 if (dfa->subexp_map)
901 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
902 if (dfa->subexp_map[reg_idx] != reg_idx)
904 pmatch[reg_idx + 1].rm_so
905 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
906 pmatch[reg_idx + 1].rm_eo
907 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
911 free_return:
912 re_free (mctx.state_log);
913 if (dfa->nbackref)
914 match_ctx_free (&mctx);
915 re_string_destruct (&mctx.input);
916 return err;
919 static reg_errcode_t
920 __attribute_warn_unused_result__
921 prune_impossible_nodes (re_match_context_t *mctx)
923 const re_dfa_t *const dfa = mctx->dfa;
924 Idx halt_node, match_last;
925 reg_errcode_t ret;
926 re_dfastate_t **sifted_states;
927 re_dfastate_t **lim_states = NULL;
928 re_sift_context_t sctx;
929 #ifdef DEBUG
930 assert (mctx->state_log != NULL);
931 #endif
932 match_last = mctx->match_last;
933 halt_node = mctx->last_node;
935 /* Avoid overflow. */
936 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
937 <= match_last))
938 return REG_ESPACE;
940 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
941 if (__glibc_unlikely (sifted_states == NULL))
943 ret = REG_ESPACE;
944 goto free_return;
946 if (dfa->nbackref)
948 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
949 if (__glibc_unlikely (lim_states == NULL))
951 ret = REG_ESPACE;
952 goto free_return;
954 while (1)
956 memset (lim_states, '\0',
957 sizeof (re_dfastate_t *) * (match_last + 1));
958 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
959 match_last);
960 ret = sift_states_backward (mctx, &sctx);
961 re_node_set_free (&sctx.limits);
962 if (__glibc_unlikely (ret != REG_NOERROR))
963 goto free_return;
964 if (sifted_states[0] != NULL || lim_states[0] != NULL)
965 break;
968 --match_last;
969 if (match_last < 0)
971 ret = REG_NOMATCH;
972 goto free_return;
974 } while (mctx->state_log[match_last] == NULL
975 || !mctx->state_log[match_last]->halt);
976 halt_node = check_halt_state_context (mctx,
977 mctx->state_log[match_last],
978 match_last);
980 ret = merge_state_array (dfa, sifted_states, lim_states,
981 match_last + 1);
982 re_free (lim_states);
983 lim_states = NULL;
984 if (__glibc_unlikely (ret != REG_NOERROR))
985 goto free_return;
987 else
989 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
990 ret = sift_states_backward (mctx, &sctx);
991 re_node_set_free (&sctx.limits);
992 if (__glibc_unlikely (ret != REG_NOERROR))
993 goto free_return;
994 if (sifted_states[0] == NULL)
996 ret = REG_NOMATCH;
997 goto free_return;
1000 re_free (mctx->state_log);
1001 mctx->state_log = sifted_states;
1002 sifted_states = NULL;
1003 mctx->last_node = halt_node;
1004 mctx->match_last = match_last;
1005 ret = REG_NOERROR;
1006 free_return:
1007 re_free (sifted_states);
1008 re_free (lim_states);
1009 return ret;
1012 /* Acquire an initial state and return it.
1013 We must select appropriate initial state depending on the context,
1014 since initial states may have constraints like "\<", "^", etc.. */
1016 static inline re_dfastate_t *
1017 __attribute__ ((always_inline))
1018 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1019 Idx idx)
1021 const re_dfa_t *const dfa = mctx->dfa;
1022 if (dfa->init_state->has_constraint)
1024 unsigned int context;
1025 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1026 if (IS_WORD_CONTEXT (context))
1027 return dfa->init_state_word;
1028 else if (IS_ORDINARY_CONTEXT (context))
1029 return dfa->init_state;
1030 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1031 return dfa->init_state_begbuf;
1032 else if (IS_NEWLINE_CONTEXT (context))
1033 return dfa->init_state_nl;
1034 else if (IS_BEGBUF_CONTEXT (context))
1036 /* It is relatively rare case, then calculate on demand. */
1037 return re_acquire_state_context (err, dfa,
1038 dfa->init_state->entrance_nodes,
1039 context);
1041 else
1042 /* Must not happen? */
1043 return dfa->init_state;
1045 else
1046 return dfa->init_state;
1049 /* Check whether the regular expression match input string INPUT or not,
1050 and return the index where the matching end. Return -1 if
1051 there is no match, and return -2 in case of an error.
1052 FL_LONGEST_MATCH means we want the POSIX longest matching.
1053 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1054 next place where we may want to try matching.
1055 Note that the matcher assumes that the matching starts from the current
1056 index of the buffer. */
1058 static Idx
1059 __attribute_warn_unused_result__
1060 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1061 Idx *p_match_first)
1063 const re_dfa_t *const dfa = mctx->dfa;
1064 reg_errcode_t err;
1065 Idx match = 0;
1066 Idx match_last = -1;
1067 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1068 re_dfastate_t *cur_state;
1069 bool at_init_state = p_match_first != NULL;
1070 Idx next_start_idx = cur_str_idx;
1072 err = REG_NOERROR;
1073 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1074 /* An initial state must not be NULL (invalid). */
1075 if (__glibc_unlikely (cur_state == NULL))
1077 assert (err == REG_ESPACE);
1078 return -2;
1081 if (mctx->state_log != NULL)
1083 mctx->state_log[cur_str_idx] = cur_state;
1085 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1086 later. E.g. Processing back references. */
1087 if (__glibc_unlikely (dfa->nbackref))
1089 at_init_state = false;
1090 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1091 if (__glibc_unlikely (err != REG_NOERROR))
1092 return err;
1094 if (cur_state->has_backref)
1096 err = transit_state_bkref (mctx, &cur_state->nodes);
1097 if (__glibc_unlikely (err != REG_NOERROR))
1098 return err;
1103 /* If the RE accepts NULL string. */
1104 if (__glibc_unlikely (cur_state->halt))
1106 if (!cur_state->has_constraint
1107 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1109 if (!fl_longest_match)
1110 return cur_str_idx;
1111 else
1113 match_last = cur_str_idx;
1114 match = 1;
1119 while (!re_string_eoi (&mctx->input))
1121 re_dfastate_t *old_state = cur_state;
1122 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1124 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1125 && mctx->input.bufs_len < mctx->input.len)
1126 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1127 && mctx->input.valid_len < mctx->input.len))
1129 err = extend_buffers (mctx, next_char_idx + 1);
1130 if (__glibc_unlikely (err != REG_NOERROR))
1132 assert (err == REG_ESPACE);
1133 return -2;
1137 cur_state = transit_state (&err, mctx, cur_state);
1138 if (mctx->state_log != NULL)
1139 cur_state = merge_state_with_log (&err, mctx, cur_state);
1141 if (cur_state == NULL)
1143 /* Reached the invalid state or an error. Try to recover a valid
1144 state using the state log, if available and if we have not
1145 already found a valid (even if not the longest) match. */
1146 if (__glibc_unlikely (err != REG_NOERROR))
1147 return -2;
1149 if (mctx->state_log == NULL
1150 || (match && !fl_longest_match)
1151 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1152 break;
1155 if (__glibc_unlikely (at_init_state))
1157 if (old_state == cur_state)
1158 next_start_idx = next_char_idx;
1159 else
1160 at_init_state = false;
1163 if (cur_state->halt)
1165 /* Reached a halt state.
1166 Check the halt state can satisfy the current context. */
1167 if (!cur_state->has_constraint
1168 || check_halt_state_context (mctx, cur_state,
1169 re_string_cur_idx (&mctx->input)))
1171 /* We found an appropriate halt state. */
1172 match_last = re_string_cur_idx (&mctx->input);
1173 match = 1;
1175 /* We found a match, do not modify match_first below. */
1176 p_match_first = NULL;
1177 if (!fl_longest_match)
1178 break;
1183 if (p_match_first)
1184 *p_match_first += next_start_idx;
1186 return match_last;
1189 /* Check NODE match the current context. */
1191 static bool
1192 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1194 re_token_type_t type = dfa->nodes[node].type;
1195 unsigned int constraint = dfa->nodes[node].constraint;
1196 if (type != END_OF_RE)
1197 return false;
1198 if (!constraint)
1199 return true;
1200 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1201 return false;
1202 return true;
1205 /* Check the halt state STATE match the current context.
1206 Return 0 if not match, if the node, STATE has, is a halt node and
1207 match the context, return the node. */
1209 static Idx
1210 check_halt_state_context (const re_match_context_t *mctx,
1211 const re_dfastate_t *state, Idx idx)
1213 Idx i;
1214 unsigned int context;
1215 #ifdef DEBUG
1216 assert (state->halt);
1217 #endif
1218 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1219 for (i = 0; i < state->nodes.nelem; ++i)
1220 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1221 return state->nodes.elems[i];
1222 return 0;
1225 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1226 corresponding to the DFA).
1227 Return the destination node, and update EPS_VIA_NODES;
1228 return -1 in case of errors. */
1230 static Idx
1231 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1232 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1233 struct re_fail_stack_t *fs)
1235 const re_dfa_t *const dfa = mctx->dfa;
1236 Idx i;
1237 bool ok;
1238 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1240 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1241 re_node_set *edests = &dfa->edests[node];
1242 Idx dest_node;
1243 ok = re_node_set_insert (eps_via_nodes, node);
1244 if (__glibc_unlikely (! ok))
1245 return -2;
1246 /* Pick up a valid destination, or return -1 if none
1247 is found. */
1248 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1250 Idx candidate = edests->elems[i];
1251 if (!re_node_set_contains (cur_nodes, candidate))
1252 continue;
1253 if (dest_node == -1)
1254 dest_node = candidate;
1256 else
1258 /* In order to avoid infinite loop like "(a*)*", return the second
1259 epsilon-transition if the first was already considered. */
1260 if (re_node_set_contains (eps_via_nodes, dest_node))
1261 return candidate;
1263 /* Otherwise, push the second epsilon-transition on the fail stack. */
1264 else if (fs != NULL
1265 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1266 eps_via_nodes))
1267 return -2;
1269 /* We know we are going to exit. */
1270 break;
1273 return dest_node;
1275 else
1277 Idx naccepted = 0;
1278 re_token_type_t type = dfa->nodes[node].type;
1280 #ifdef RE_ENABLE_I18N
1281 if (dfa->nodes[node].accept_mb)
1282 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1283 else
1284 #endif /* RE_ENABLE_I18N */
1285 if (type == OP_BACK_REF)
1287 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1288 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1289 if (fs != NULL)
1291 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1292 return -1;
1293 else if (naccepted)
1295 char *buf = (char *) re_string_get_buffer (&mctx->input);
1296 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1297 naccepted) != 0)
1298 return -1;
1302 if (naccepted == 0)
1304 Idx dest_node;
1305 ok = re_node_set_insert (eps_via_nodes, node);
1306 if (__glibc_unlikely (! ok))
1307 return -2;
1308 dest_node = dfa->edests[node].elems[0];
1309 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1310 dest_node))
1311 return dest_node;
1315 if (naccepted != 0
1316 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1318 Idx dest_node = dfa->nexts[node];
1319 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1320 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1321 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1322 dest_node)))
1323 return -1;
1324 re_node_set_empty (eps_via_nodes);
1325 return dest_node;
1328 return -1;
1331 static reg_errcode_t
1332 __attribute_warn_unused_result__
1333 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1334 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1336 reg_errcode_t err;
1337 Idx num = fs->num++;
1338 if (fs->num == fs->alloc)
1340 struct re_fail_stack_ent_t *new_array;
1341 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1342 fs->alloc * 2);
1343 if (new_array == NULL)
1344 return REG_ESPACE;
1345 fs->alloc *= 2;
1346 fs->stack = new_array;
1348 fs->stack[num].idx = str_idx;
1349 fs->stack[num].node = dest_node;
1350 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1351 if (fs->stack[num].regs == NULL)
1352 return REG_ESPACE;
1353 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1354 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1355 return err;
1358 static Idx
1359 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1360 regmatch_t *regs, re_node_set *eps_via_nodes)
1362 Idx num = --fs->num;
1363 assert (num >= 0);
1364 *pidx = fs->stack[num].idx;
1365 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1366 re_node_set_free (eps_via_nodes);
1367 re_free (fs->stack[num].regs);
1368 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1369 return fs->stack[num].node;
1372 /* Set the positions where the subexpressions are starts/ends to registers
1373 PMATCH.
1374 Note: We assume that pmatch[0] is already set, and
1375 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1377 static reg_errcode_t
1378 __attribute_warn_unused_result__
1379 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1380 regmatch_t *pmatch, bool fl_backtrack)
1382 const re_dfa_t *dfa = preg->buffer;
1383 Idx idx, cur_node;
1384 re_node_set eps_via_nodes;
1385 struct re_fail_stack_t *fs;
1386 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1387 regmatch_t *prev_idx_match;
1388 bool prev_idx_match_malloced = false;
1390 #ifdef DEBUG
1391 assert (nmatch > 1);
1392 assert (mctx->state_log != NULL);
1393 #endif
1394 if (fl_backtrack)
1396 fs = &fs_body;
1397 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1398 if (fs->stack == NULL)
1399 return REG_ESPACE;
1401 else
1402 fs = NULL;
1404 cur_node = dfa->init_node;
1405 re_node_set_init_empty (&eps_via_nodes);
1407 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1408 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1409 else
1411 prev_idx_match = re_malloc (regmatch_t, nmatch);
1412 if (prev_idx_match == NULL)
1414 free_fail_stack_return (fs);
1415 return REG_ESPACE;
1417 prev_idx_match_malloced = true;
1419 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1421 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1423 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1425 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1427 Idx reg_idx;
1428 if (fs)
1430 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1431 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1432 break;
1433 if (reg_idx == nmatch)
1435 re_node_set_free (&eps_via_nodes);
1436 if (prev_idx_match_malloced)
1437 re_free (prev_idx_match);
1438 return free_fail_stack_return (fs);
1440 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1441 &eps_via_nodes);
1443 else
1445 re_node_set_free (&eps_via_nodes);
1446 if (prev_idx_match_malloced)
1447 re_free (prev_idx_match);
1448 return REG_NOERROR;
1452 /* Proceed to next node. */
1453 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1454 &eps_via_nodes, fs);
1456 if (__glibc_unlikely (cur_node < 0))
1458 if (__glibc_unlikely (cur_node == -2))
1460 re_node_set_free (&eps_via_nodes);
1461 if (prev_idx_match_malloced)
1462 re_free (prev_idx_match);
1463 free_fail_stack_return (fs);
1464 return REG_ESPACE;
1466 if (fs)
1467 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1468 &eps_via_nodes);
1469 else
1471 re_node_set_free (&eps_via_nodes);
1472 if (prev_idx_match_malloced)
1473 re_free (prev_idx_match);
1474 return REG_NOMATCH;
1478 re_node_set_free (&eps_via_nodes);
1479 if (prev_idx_match_malloced)
1480 re_free (prev_idx_match);
1481 return free_fail_stack_return (fs);
1484 static reg_errcode_t
1485 free_fail_stack_return (struct re_fail_stack_t *fs)
1487 if (fs)
1489 Idx fs_idx;
1490 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1492 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1493 re_free (fs->stack[fs_idx].regs);
1495 re_free (fs->stack);
1497 return REG_NOERROR;
1500 static void
1501 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1502 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1504 int type = dfa->nodes[cur_node].type;
1505 if (type == OP_OPEN_SUBEXP)
1507 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1509 /* We are at the first node of this sub expression. */
1510 if (reg_num < nmatch)
1512 pmatch[reg_num].rm_so = cur_idx;
1513 pmatch[reg_num].rm_eo = -1;
1516 else if (type == OP_CLOSE_SUBEXP)
1518 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1519 if (reg_num < nmatch)
1521 /* We are at the last node of this sub expression. */
1522 if (pmatch[reg_num].rm_so < cur_idx)
1524 pmatch[reg_num].rm_eo = cur_idx;
1525 /* This is a non-empty match or we are not inside an optional
1526 subexpression. Accept this right away. */
1527 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1529 else
1531 if (dfa->nodes[cur_node].opt_subexp
1532 && prev_idx_match[reg_num].rm_so != -1)
1533 /* We transited through an empty match for an optional
1534 subexpression, like (a?)*, and this is not the subexp's
1535 first match. Copy back the old content of the registers
1536 so that matches of an inner subexpression are undone as
1537 well, like in ((a?))*. */
1538 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1539 else
1540 /* We completed a subexpression, but it may be part of
1541 an optional one, so do not update PREV_IDX_MATCH. */
1542 pmatch[reg_num].rm_eo = cur_idx;
1548 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1549 and sift the nodes in each states according to the following rules.
1550 Updated state_log will be wrote to STATE_LOG.
1552 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1553 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1554 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1555 the LAST_NODE, we throw away the node 'a'.
1556 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1557 string 's' and transit to 'b':
1558 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1559 away the node 'a'.
1560 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1561 thrown away, we throw away the node 'a'.
1562 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1563 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1564 node 'a'.
1565 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1566 we throw away the node 'a'. */
1568 #define STATE_NODE_CONTAINS(state,node) \
1569 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1571 static reg_errcode_t
1572 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1574 reg_errcode_t err;
1575 int null_cnt = 0;
1576 Idx str_idx = sctx->last_str_idx;
1577 re_node_set cur_dest;
1579 #ifdef DEBUG
1580 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1581 #endif
1583 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1584 transit to the last_node and the last_node itself. */
1585 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1586 if (__glibc_unlikely (err != REG_NOERROR))
1587 return err;
1588 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1589 if (__glibc_unlikely (err != REG_NOERROR))
1590 goto free_return;
1592 /* Then check each states in the state_log. */
1593 while (str_idx > 0)
1595 /* Update counters. */
1596 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1597 if (null_cnt > mctx->max_mb_elem_len)
1599 memset (sctx->sifted_states, '\0',
1600 sizeof (re_dfastate_t *) * str_idx);
1601 re_node_set_free (&cur_dest);
1602 return REG_NOERROR;
1604 re_node_set_empty (&cur_dest);
1605 --str_idx;
1607 if (mctx->state_log[str_idx])
1609 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1610 if (__glibc_unlikely (err != REG_NOERROR))
1611 goto free_return;
1614 /* Add all the nodes which satisfy the following conditions:
1615 - It can epsilon transit to a node in CUR_DEST.
1616 - It is in CUR_SRC.
1617 And update state_log. */
1618 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1619 if (__glibc_unlikely (err != REG_NOERROR))
1620 goto free_return;
1622 err = REG_NOERROR;
1623 free_return:
1624 re_node_set_free (&cur_dest);
1625 return err;
1628 static reg_errcode_t
1629 __attribute_warn_unused_result__
1630 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1631 Idx str_idx, re_node_set *cur_dest)
1633 const re_dfa_t *const dfa = mctx->dfa;
1634 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1635 Idx i;
1637 /* Then build the next sifted state.
1638 We build the next sifted state on 'cur_dest', and update
1639 'sifted_states[str_idx]' with 'cur_dest'.
1640 Note:
1641 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1642 'cur_src' points the node_set of the old 'state_log[str_idx]'
1643 (with the epsilon nodes pre-filtered out). */
1644 for (i = 0; i < cur_src->nelem; i++)
1646 Idx prev_node = cur_src->elems[i];
1647 int naccepted = 0;
1648 bool ok;
1650 #ifdef DEBUG
1651 re_token_type_t type = dfa->nodes[prev_node].type;
1652 assert (!IS_EPSILON_NODE (type));
1653 #endif
1654 #ifdef RE_ENABLE_I18N
1655 /* If the node may accept "multi byte". */
1656 if (dfa->nodes[prev_node].accept_mb)
1657 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1658 str_idx, sctx->last_str_idx);
1659 #endif /* RE_ENABLE_I18N */
1661 /* We don't check backreferences here.
1662 See update_cur_sifted_state(). */
1663 if (!naccepted
1664 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1665 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1666 dfa->nexts[prev_node]))
1667 naccepted = 1;
1669 if (naccepted == 0)
1670 continue;
1672 if (sctx->limits.nelem)
1674 Idx to_idx = str_idx + naccepted;
1675 if (check_dst_limits (mctx, &sctx->limits,
1676 dfa->nexts[prev_node], to_idx,
1677 prev_node, str_idx))
1678 continue;
1680 ok = re_node_set_insert (cur_dest, prev_node);
1681 if (__glibc_unlikely (! ok))
1682 return REG_ESPACE;
1685 return REG_NOERROR;
1688 /* Helper functions. */
1690 static reg_errcode_t
1691 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1693 Idx top = mctx->state_log_top;
1695 if ((next_state_log_idx >= mctx->input.bufs_len
1696 && mctx->input.bufs_len < mctx->input.len)
1697 || (next_state_log_idx >= mctx->input.valid_len
1698 && mctx->input.valid_len < mctx->input.len))
1700 reg_errcode_t err;
1701 err = extend_buffers (mctx, next_state_log_idx + 1);
1702 if (__glibc_unlikely (err != REG_NOERROR))
1703 return err;
1706 if (top < next_state_log_idx)
1708 memset (mctx->state_log + top + 1, '\0',
1709 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1710 mctx->state_log_top = next_state_log_idx;
1712 return REG_NOERROR;
1715 static reg_errcode_t
1716 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1717 re_dfastate_t **src, Idx num)
1719 Idx st_idx;
1720 reg_errcode_t err;
1721 for (st_idx = 0; st_idx < num; ++st_idx)
1723 if (dst[st_idx] == NULL)
1724 dst[st_idx] = src[st_idx];
1725 else if (src[st_idx] != NULL)
1727 re_node_set merged_set;
1728 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1729 &src[st_idx]->nodes);
1730 if (__glibc_unlikely (err != REG_NOERROR))
1731 return err;
1732 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1733 re_node_set_free (&merged_set);
1734 if (__glibc_unlikely (err != REG_NOERROR))
1735 return err;
1738 return REG_NOERROR;
1741 static reg_errcode_t
1742 update_cur_sifted_state (const re_match_context_t *mctx,
1743 re_sift_context_t *sctx, Idx str_idx,
1744 re_node_set *dest_nodes)
1746 const re_dfa_t *const dfa = mctx->dfa;
1747 reg_errcode_t err = REG_NOERROR;
1748 const re_node_set *candidates;
1749 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1750 : &mctx->state_log[str_idx]->nodes);
1752 if (dest_nodes->nelem == 0)
1753 sctx->sifted_states[str_idx] = NULL;
1754 else
1756 if (candidates)
1758 /* At first, add the nodes which can epsilon transit to a node in
1759 DEST_NODE. */
1760 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1761 if (__glibc_unlikely (err != REG_NOERROR))
1762 return err;
1764 /* Then, check the limitations in the current sift_context. */
1765 if (sctx->limits.nelem)
1767 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1768 mctx->bkref_ents, str_idx);
1769 if (__glibc_unlikely (err != REG_NOERROR))
1770 return err;
1774 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1775 if (__glibc_unlikely (err != REG_NOERROR))
1776 return err;
1779 if (candidates && mctx->state_log[str_idx]->has_backref)
1781 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1782 if (__glibc_unlikely (err != REG_NOERROR))
1783 return err;
1785 return REG_NOERROR;
1788 static reg_errcode_t
1789 __attribute_warn_unused_result__
1790 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1791 const re_node_set *candidates)
1793 reg_errcode_t err = REG_NOERROR;
1794 Idx i;
1796 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1797 if (__glibc_unlikely (err != REG_NOERROR))
1798 return err;
1800 if (!state->inveclosure.alloc)
1802 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1803 if (__glibc_unlikely (err != REG_NOERROR))
1804 return REG_ESPACE;
1805 for (i = 0; i < dest_nodes->nelem; i++)
1807 err = re_node_set_merge (&state->inveclosure,
1808 dfa->inveclosures + dest_nodes->elems[i]);
1809 if (__glibc_unlikely (err != REG_NOERROR))
1810 return REG_ESPACE;
1813 return re_node_set_add_intersect (dest_nodes, candidates,
1814 &state->inveclosure);
1817 static reg_errcode_t
1818 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1819 const re_node_set *candidates)
1821 Idx ecl_idx;
1822 reg_errcode_t err;
1823 re_node_set *inv_eclosure = dfa->inveclosures + node;
1824 re_node_set except_nodes;
1825 re_node_set_init_empty (&except_nodes);
1826 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1828 Idx cur_node = inv_eclosure->elems[ecl_idx];
1829 if (cur_node == node)
1830 continue;
1831 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1833 Idx edst1 = dfa->edests[cur_node].elems[0];
1834 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1835 ? dfa->edests[cur_node].elems[1] : -1);
1836 if ((!re_node_set_contains (inv_eclosure, edst1)
1837 && re_node_set_contains (dest_nodes, edst1))
1838 || (edst2 > 0
1839 && !re_node_set_contains (inv_eclosure, edst2)
1840 && re_node_set_contains (dest_nodes, edst2)))
1842 err = re_node_set_add_intersect (&except_nodes, candidates,
1843 dfa->inveclosures + cur_node);
1844 if (__glibc_unlikely (err != REG_NOERROR))
1846 re_node_set_free (&except_nodes);
1847 return err;
1852 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854 Idx cur_node = inv_eclosure->elems[ecl_idx];
1855 if (!re_node_set_contains (&except_nodes, cur_node))
1857 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1858 re_node_set_remove_at (dest_nodes, idx);
1861 re_node_set_free (&except_nodes);
1862 return REG_NOERROR;
1865 static bool
1866 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1867 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1869 const re_dfa_t *const dfa = mctx->dfa;
1870 Idx lim_idx, src_pos, dst_pos;
1872 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1873 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1874 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1876 Idx subexp_idx;
1877 struct re_backref_cache_entry *ent;
1878 ent = mctx->bkref_ents + limits->elems[lim_idx];
1879 subexp_idx = dfa->nodes[ent->node].opr.idx;
1881 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1882 subexp_idx, dst_node, dst_idx,
1883 dst_bkref_idx);
1884 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1885 subexp_idx, src_node, src_idx,
1886 src_bkref_idx);
1888 /* In case of:
1889 <src> <dst> ( <subexp> )
1890 ( <subexp> ) <src> <dst>
1891 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1892 if (src_pos == dst_pos)
1893 continue; /* This is unrelated limitation. */
1894 else
1895 return true;
1897 return false;
1900 static int
1901 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1902 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1904 const re_dfa_t *const dfa = mctx->dfa;
1905 const re_node_set *eclosures = dfa->eclosures + from_node;
1906 Idx node_idx;
1908 /* Else, we are on the boundary: examine the nodes on the epsilon
1909 closure. */
1910 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1912 Idx node = eclosures->elems[node_idx];
1913 switch (dfa->nodes[node].type)
1915 case OP_BACK_REF:
1916 if (bkref_idx != -1)
1918 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1921 Idx dst;
1922 int cpos;
1924 if (ent->node != node)
1925 continue;
1927 if (subexp_idx < BITSET_WORD_BITS
1928 && !(ent->eps_reachable_subexps_map
1929 & ((bitset_word_t) 1 << subexp_idx)))
1930 continue;
1932 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1933 OP_CLOSE_SUBEXP cases below. But, if the
1934 destination node is the same node as the source
1935 node, don't recurse because it would cause an
1936 infinite loop: a regex that exhibits this behavior
1937 is ()\1*\1* */
1938 dst = dfa->edests[node].elems[0];
1939 if (dst == from_node)
1941 if (boundaries & 1)
1942 return -1;
1943 else /* if (boundaries & 2) */
1944 return 0;
1947 cpos =
1948 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1949 dst, bkref_idx);
1950 if (cpos == -1 /* && (boundaries & 1) */)
1951 return -1;
1952 if (cpos == 0 && (boundaries & 2))
1953 return 0;
1955 if (subexp_idx < BITSET_WORD_BITS)
1956 ent->eps_reachable_subexps_map
1957 &= ~((bitset_word_t) 1 << subexp_idx);
1959 while (ent++->more);
1961 break;
1963 case OP_OPEN_SUBEXP:
1964 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1965 return -1;
1966 break;
1968 case OP_CLOSE_SUBEXP:
1969 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1970 return 0;
1971 break;
1973 default:
1974 break;
1978 return (boundaries & 2) ? 1 : 0;
1981 static int
1982 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1983 Idx subexp_idx, Idx from_node, Idx str_idx,
1984 Idx bkref_idx)
1986 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1987 int boundaries;
1989 /* If we are outside the range of the subexpression, return -1 or 1. */
1990 if (str_idx < lim->subexp_from)
1991 return -1;
1993 if (lim->subexp_to < str_idx)
1994 return 1;
1996 /* If we are within the subexpression, return 0. */
1997 boundaries = (str_idx == lim->subexp_from);
1998 boundaries |= (str_idx == lim->subexp_to) << 1;
1999 if (boundaries == 0)
2000 return 0;
2002 /* Else, examine epsilon closure. */
2003 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2004 from_node, bkref_idx);
2007 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2008 which are against limitations from DEST_NODES. */
2010 static reg_errcode_t
2011 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2012 const re_node_set *candidates, re_node_set *limits,
2013 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2015 reg_errcode_t err;
2016 Idx node_idx, lim_idx;
2018 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2020 Idx subexp_idx;
2021 struct re_backref_cache_entry *ent;
2022 ent = bkref_ents + limits->elems[lim_idx];
2024 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2025 continue; /* This is unrelated limitation. */
2027 subexp_idx = dfa->nodes[ent->node].opr.idx;
2028 if (ent->subexp_to == str_idx)
2030 Idx ops_node = -1;
2031 Idx cls_node = -1;
2032 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2034 Idx node = dest_nodes->elems[node_idx];
2035 re_token_type_t type = dfa->nodes[node].type;
2036 if (type == OP_OPEN_SUBEXP
2037 && subexp_idx == dfa->nodes[node].opr.idx)
2038 ops_node = node;
2039 else if (type == OP_CLOSE_SUBEXP
2040 && subexp_idx == dfa->nodes[node].opr.idx)
2041 cls_node = node;
2044 /* Check the limitation of the open subexpression. */
2045 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2046 if (ops_node >= 0)
2048 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2049 candidates);
2050 if (__glibc_unlikely (err != REG_NOERROR))
2051 return err;
2054 /* Check the limitation of the close subexpression. */
2055 if (cls_node >= 0)
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2058 Idx node = dest_nodes->elems[node_idx];
2059 if (!re_node_set_contains (dfa->inveclosures + node,
2060 cls_node)
2061 && !re_node_set_contains (dfa->eclosures + node,
2062 cls_node))
2064 /* It is against this limitation.
2065 Remove it form the current sifted state. */
2066 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 candidates);
2068 if (__glibc_unlikely (err != REG_NOERROR))
2069 return err;
2070 --node_idx;
2074 else /* (ent->subexp_to != str_idx) */
2076 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2078 Idx node = dest_nodes->elems[node_idx];
2079 re_token_type_t type = dfa->nodes[node].type;
2080 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2082 if (subexp_idx != dfa->nodes[node].opr.idx)
2083 continue;
2084 /* It is against this limitation.
2085 Remove it form the current sifted state. */
2086 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2087 candidates);
2088 if (__glibc_unlikely (err != REG_NOERROR))
2089 return err;
2094 return REG_NOERROR;
2097 static reg_errcode_t
2098 __attribute_warn_unused_result__
2099 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2100 Idx str_idx, const re_node_set *candidates)
2102 const re_dfa_t *const dfa = mctx->dfa;
2103 reg_errcode_t err;
2104 Idx node_idx, node;
2105 re_sift_context_t local_sctx;
2106 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2108 if (first_idx == -1)
2109 return REG_NOERROR;
2111 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2113 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2115 Idx enabled_idx;
2116 re_token_type_t type;
2117 struct re_backref_cache_entry *entry;
2118 node = candidates->elems[node_idx];
2119 type = dfa->nodes[node].type;
2120 /* Avoid infinite loop for the REs like "()\1+". */
2121 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2122 continue;
2123 if (type != OP_BACK_REF)
2124 continue;
2126 entry = mctx->bkref_ents + first_idx;
2127 enabled_idx = first_idx;
2130 Idx subexp_len;
2131 Idx to_idx;
2132 Idx dst_node;
2133 bool ok;
2134 re_dfastate_t *cur_state;
2136 if (entry->node != node)
2137 continue;
2138 subexp_len = entry->subexp_to - entry->subexp_from;
2139 to_idx = str_idx + subexp_len;
2140 dst_node = (subexp_len ? dfa->nexts[node]
2141 : dfa->edests[node].elems[0]);
2143 if (to_idx > sctx->last_str_idx
2144 || sctx->sifted_states[to_idx] == NULL
2145 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2146 || check_dst_limits (mctx, &sctx->limits, node,
2147 str_idx, dst_node, to_idx))
2148 continue;
2150 if (local_sctx.sifted_states == NULL)
2152 local_sctx = *sctx;
2153 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2154 if (__glibc_unlikely (err != REG_NOERROR))
2155 goto free_return;
2157 local_sctx.last_node = node;
2158 local_sctx.last_str_idx = str_idx;
2159 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2160 if (__glibc_unlikely (! ok))
2162 err = REG_ESPACE;
2163 goto free_return;
2165 cur_state = local_sctx.sifted_states[str_idx];
2166 err = sift_states_backward (mctx, &local_sctx);
2167 if (__glibc_unlikely (err != REG_NOERROR))
2168 goto free_return;
2169 if (sctx->limited_states != NULL)
2171 err = merge_state_array (dfa, sctx->limited_states,
2172 local_sctx.sifted_states,
2173 str_idx + 1);
2174 if (__glibc_unlikely (err != REG_NOERROR))
2175 goto free_return;
2177 local_sctx.sifted_states[str_idx] = cur_state;
2178 re_node_set_remove (&local_sctx.limits, enabled_idx);
2180 /* mctx->bkref_ents may have changed, reload the pointer. */
2181 entry = mctx->bkref_ents + enabled_idx;
2183 while (enabled_idx++, entry++->more);
2185 err = REG_NOERROR;
2186 free_return:
2187 if (local_sctx.sifted_states != NULL)
2189 re_node_set_free (&local_sctx.limits);
2192 return err;
2196 #ifdef RE_ENABLE_I18N
2197 static int
2198 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2199 Idx node_idx, Idx str_idx, Idx max_str_idx)
2201 const re_dfa_t *const dfa = mctx->dfa;
2202 int naccepted;
2203 /* Check the node can accept "multi byte". */
2204 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2205 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2206 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2207 dfa->nexts[node_idx]))
2208 /* The node can't accept the "multi byte", or the
2209 destination was already thrown away, then the node
2210 couldn't accept the current input "multi byte". */
2211 naccepted = 0;
2212 /* Otherwise, it is sure that the node could accept
2213 'naccepted' bytes input. */
2214 return naccepted;
2216 #endif /* RE_ENABLE_I18N */
2219 /* Functions for state transition. */
2221 /* Return the next state to which the current state STATE will transit by
2222 accepting the current input byte, and update STATE_LOG if necessary.
2223 If STATE can accept a multibyte char/collating element/back reference
2224 update the destination of STATE_LOG. */
2226 static re_dfastate_t *
2227 __attribute_warn_unused_result__
2228 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2229 re_dfastate_t *state)
2231 re_dfastate_t **trtable;
2232 unsigned char ch;
2234 #ifdef RE_ENABLE_I18N
2235 /* If the current state can accept multibyte. */
2236 if (__glibc_unlikely (state->accept_mb))
2238 *err = transit_state_mb (mctx, state);
2239 if (__glibc_unlikely (*err != REG_NOERROR))
2240 return NULL;
2242 #endif /* RE_ENABLE_I18N */
2244 /* Then decide the next state with the single byte. */
2245 #if 0
2246 if (0)
2247 /* don't use transition table */
2248 return transit_state_sb (err, mctx, state);
2249 #endif
2251 /* Use transition table */
2252 ch = re_string_fetch_byte (&mctx->input);
2253 for (;;)
2255 trtable = state->trtable;
2256 if (__glibc_likely (trtable != NULL))
2257 return trtable[ch];
2259 trtable = state->word_trtable;
2260 if (__glibc_likely (trtable != NULL))
2262 unsigned int context;
2263 context
2264 = re_string_context_at (&mctx->input,
2265 re_string_cur_idx (&mctx->input) - 1,
2266 mctx->eflags);
2267 if (IS_WORD_CONTEXT (context))
2268 return trtable[ch + SBC_MAX];
2269 else
2270 return trtable[ch];
2273 if (!build_trtable (mctx->dfa, state))
2275 *err = REG_ESPACE;
2276 return NULL;
2279 /* Retry, we now have a transition table. */
2283 /* Update the state_log if we need */
2284 static re_dfastate_t *
2285 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2286 re_dfastate_t *next_state)
2288 const re_dfa_t *const dfa = mctx->dfa;
2289 Idx cur_idx = re_string_cur_idx (&mctx->input);
2291 if (cur_idx > mctx->state_log_top)
2293 mctx->state_log[cur_idx] = next_state;
2294 mctx->state_log_top = cur_idx;
2296 else if (mctx->state_log[cur_idx] == 0)
2298 mctx->state_log[cur_idx] = next_state;
2300 else
2302 re_dfastate_t *pstate;
2303 unsigned int context;
2304 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2305 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2306 the destination of a multibyte char/collating element/
2307 back reference. Then the next state is the union set of
2308 these destinations and the results of the transition table. */
2309 pstate = mctx->state_log[cur_idx];
2310 log_nodes = pstate->entrance_nodes;
2311 if (next_state != NULL)
2313 table_nodes = next_state->entrance_nodes;
2314 *err = re_node_set_init_union (&next_nodes, table_nodes,
2315 log_nodes);
2316 if (__glibc_unlikely (*err != REG_NOERROR))
2317 return NULL;
2319 else
2320 next_nodes = *log_nodes;
2321 /* Note: We already add the nodes of the initial state,
2322 then we don't need to add them here. */
2324 context = re_string_context_at (&mctx->input,
2325 re_string_cur_idx (&mctx->input) - 1,
2326 mctx->eflags);
2327 next_state = mctx->state_log[cur_idx]
2328 = re_acquire_state_context (err, dfa, &next_nodes, context);
2329 /* We don't need to check errors here, since the return value of
2330 this function is next_state and ERR is already set. */
2332 if (table_nodes != NULL)
2333 re_node_set_free (&next_nodes);
2336 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2338 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2339 later. We must check them here, since the back references in the
2340 next state might use them. */
2341 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2342 cur_idx);
2343 if (__glibc_unlikely (*err != REG_NOERROR))
2344 return NULL;
2346 /* If the next state has back references. */
2347 if (next_state->has_backref)
2349 *err = transit_state_bkref (mctx, &next_state->nodes);
2350 if (__glibc_unlikely (*err != REG_NOERROR))
2351 return NULL;
2352 next_state = mctx->state_log[cur_idx];
2356 return next_state;
2359 /* Skip bytes in the input that correspond to part of a
2360 multi-byte match, then look in the log for a state
2361 from which to restart matching. */
2362 static re_dfastate_t *
2363 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2365 re_dfastate_t *cur_state;
2368 Idx max = mctx->state_log_top;
2369 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2373 if (++cur_str_idx > max)
2374 return NULL;
2375 re_string_skip_bytes (&mctx->input, 1);
2377 while (mctx->state_log[cur_str_idx] == NULL);
2379 cur_state = merge_state_with_log (err, mctx, NULL);
2381 while (*err == REG_NOERROR && cur_state == NULL);
2382 return cur_state;
2385 /* Helper functions for transit_state. */
2387 /* From the node set CUR_NODES, pick up the nodes whose types are
2388 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2389 expression. And register them to use them later for evaluating the
2390 corresponding back references. */
2392 static reg_errcode_t
2393 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2394 Idx str_idx)
2396 const re_dfa_t *const dfa = mctx->dfa;
2397 Idx node_idx;
2398 reg_errcode_t err;
2400 /* TODO: This isn't efficient.
2401 Because there might be more than one nodes whose types are
2402 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2403 nodes.
2404 E.g. RE: (a){2} */
2405 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2407 Idx node = cur_nodes->elems[node_idx];
2408 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2409 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2410 && (dfa->used_bkref_map
2411 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2413 err = match_ctx_add_subtop (mctx, node, str_idx);
2414 if (__glibc_unlikely (err != REG_NOERROR))
2415 return err;
2418 return REG_NOERROR;
2421 #if 0
2422 /* Return the next state to which the current state STATE will transit by
2423 accepting the current input byte. */
2425 static re_dfastate_t *
2426 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2427 re_dfastate_t *state)
2429 const re_dfa_t *const dfa = mctx->dfa;
2430 re_node_set next_nodes;
2431 re_dfastate_t *next_state;
2432 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2433 unsigned int context;
2435 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2436 if (__glibc_unlikely (*err != REG_NOERROR))
2437 return NULL;
2438 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2440 Idx cur_node = state->nodes.elems[node_cnt];
2441 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2443 *err = re_node_set_merge (&next_nodes,
2444 dfa->eclosures + dfa->nexts[cur_node]);
2445 if (__glibc_unlikely (*err != REG_NOERROR))
2447 re_node_set_free (&next_nodes);
2448 return NULL;
2452 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2453 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2454 /* We don't need to check errors here, since the return value of
2455 this function is next_state and ERR is already set. */
2457 re_node_set_free (&next_nodes);
2458 re_string_skip_bytes (&mctx->input, 1);
2459 return next_state;
2461 #endif
2463 #ifdef RE_ENABLE_I18N
2464 static reg_errcode_t
2465 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2467 const re_dfa_t *const dfa = mctx->dfa;
2468 reg_errcode_t err;
2469 Idx i;
2471 for (i = 0; i < pstate->nodes.nelem; ++i)
2473 re_node_set dest_nodes, *new_nodes;
2474 Idx cur_node_idx = pstate->nodes.elems[i];
2475 int naccepted;
2476 Idx dest_idx;
2477 unsigned int context;
2478 re_dfastate_t *dest_state;
2480 if (!dfa->nodes[cur_node_idx].accept_mb)
2481 continue;
2483 if (dfa->nodes[cur_node_idx].constraint)
2485 context = re_string_context_at (&mctx->input,
2486 re_string_cur_idx (&mctx->input),
2487 mctx->eflags);
2488 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2489 context))
2490 continue;
2493 /* How many bytes the node can accept? */
2494 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2495 re_string_cur_idx (&mctx->input));
2496 if (naccepted == 0)
2497 continue;
2499 /* The node can accepts 'naccepted' bytes. */
2500 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2501 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2502 : mctx->max_mb_elem_len);
2503 err = clean_state_log_if_needed (mctx, dest_idx);
2504 if (__glibc_unlikely (err != REG_NOERROR))
2505 return err;
2506 #ifdef DEBUG
2507 assert (dfa->nexts[cur_node_idx] != -1);
2508 #endif
2509 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2511 dest_state = mctx->state_log[dest_idx];
2512 if (dest_state == NULL)
2513 dest_nodes = *new_nodes;
2514 else
2516 err = re_node_set_init_union (&dest_nodes,
2517 dest_state->entrance_nodes, new_nodes);
2518 if (__glibc_unlikely (err != REG_NOERROR))
2519 return err;
2521 context = re_string_context_at (&mctx->input, dest_idx - 1,
2522 mctx->eflags);
2523 mctx->state_log[dest_idx]
2524 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2525 if (dest_state != NULL)
2526 re_node_set_free (&dest_nodes);
2527 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2528 && err != REG_NOERROR))
2529 return err;
2531 return REG_NOERROR;
2533 #endif /* RE_ENABLE_I18N */
2535 static reg_errcode_t
2536 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2538 const re_dfa_t *const dfa = mctx->dfa;
2539 reg_errcode_t err;
2540 Idx i;
2541 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2543 for (i = 0; i < nodes->nelem; ++i)
2545 Idx dest_str_idx, prev_nelem, bkc_idx;
2546 Idx node_idx = nodes->elems[i];
2547 unsigned int context;
2548 const re_token_t *node = dfa->nodes + node_idx;
2549 re_node_set *new_dest_nodes;
2551 /* Check whether 'node' is a backreference or not. */
2552 if (node->type != OP_BACK_REF)
2553 continue;
2555 if (node->constraint)
2557 context = re_string_context_at (&mctx->input, cur_str_idx,
2558 mctx->eflags);
2559 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2560 continue;
2563 /* 'node' is a backreference.
2564 Check the substring which the substring matched. */
2565 bkc_idx = mctx->nbkref_ents;
2566 err = get_subexp (mctx, node_idx, cur_str_idx);
2567 if (__glibc_unlikely (err != REG_NOERROR))
2568 goto free_return;
2570 /* And add the epsilon closures (which is 'new_dest_nodes') of
2571 the backreference to appropriate state_log. */
2572 #ifdef DEBUG
2573 assert (dfa->nexts[node_idx] != -1);
2574 #endif
2575 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2577 Idx subexp_len;
2578 re_dfastate_t *dest_state;
2579 struct re_backref_cache_entry *bkref_ent;
2580 bkref_ent = mctx->bkref_ents + bkc_idx;
2581 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2582 continue;
2583 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2584 new_dest_nodes = (subexp_len == 0
2585 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2586 : dfa->eclosures + dfa->nexts[node_idx]);
2587 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2588 - bkref_ent->subexp_from);
2589 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2590 mctx->eflags);
2591 dest_state = mctx->state_log[dest_str_idx];
2592 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2593 : mctx->state_log[cur_str_idx]->nodes.nelem);
2594 /* Add 'new_dest_node' to state_log. */
2595 if (dest_state == NULL)
2597 mctx->state_log[dest_str_idx]
2598 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2599 context);
2600 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2601 && err != REG_NOERROR))
2602 goto free_return;
2604 else
2606 re_node_set dest_nodes;
2607 err = re_node_set_init_union (&dest_nodes,
2608 dest_state->entrance_nodes,
2609 new_dest_nodes);
2610 if (__glibc_unlikely (err != REG_NOERROR))
2612 re_node_set_free (&dest_nodes);
2613 goto free_return;
2615 mctx->state_log[dest_str_idx]
2616 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2617 re_node_set_free (&dest_nodes);
2618 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2619 && err != REG_NOERROR))
2620 goto free_return;
2622 /* We need to check recursively if the backreference can epsilon
2623 transit. */
2624 if (subexp_len == 0
2625 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2627 err = check_subexp_matching_top (mctx, new_dest_nodes,
2628 cur_str_idx);
2629 if (__glibc_unlikely (err != REG_NOERROR))
2630 goto free_return;
2631 err = transit_state_bkref (mctx, new_dest_nodes);
2632 if (__glibc_unlikely (err != REG_NOERROR))
2633 goto free_return;
2637 err = REG_NOERROR;
2638 free_return:
2639 return err;
2642 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2643 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2644 Note that we might collect inappropriate candidates here.
2645 However, the cost of checking them strictly here is too high, then we
2646 delay these checking for prune_impossible_nodes(). */
2648 static reg_errcode_t
2649 __attribute_warn_unused_result__
2650 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2652 const re_dfa_t *const dfa = mctx->dfa;
2653 Idx subexp_num, sub_top_idx;
2654 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2655 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2656 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2657 if (cache_idx != -1)
2659 const struct re_backref_cache_entry *entry
2660 = mctx->bkref_ents + cache_idx;
2662 if (entry->node == bkref_node)
2663 return REG_NOERROR; /* We already checked it. */
2664 while (entry++->more);
2667 subexp_num = dfa->nodes[bkref_node].opr.idx;
2669 /* For each sub expression */
2670 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2672 reg_errcode_t err;
2673 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2674 re_sub_match_last_t *sub_last;
2675 Idx sub_last_idx, sl_str, bkref_str_off;
2677 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2678 continue; /* It isn't related. */
2680 sl_str = sub_top->str_idx;
2681 bkref_str_off = bkref_str_idx;
2682 /* At first, check the last node of sub expressions we already
2683 evaluated. */
2684 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2686 regoff_t sl_str_diff;
2687 sub_last = sub_top->lasts[sub_last_idx];
2688 sl_str_diff = sub_last->str_idx - sl_str;
2689 /* The matched string by the sub expression match with the substring
2690 at the back reference? */
2691 if (sl_str_diff > 0)
2693 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2694 > mctx->input.valid_len))
2696 /* Not enough chars for a successful match. */
2697 if (bkref_str_off + sl_str_diff > mctx->input.len)
2698 break;
2700 err = clean_state_log_if_needed (mctx,
2701 bkref_str_off
2702 + sl_str_diff);
2703 if (__glibc_unlikely (err != REG_NOERROR))
2704 return err;
2705 buf = (const char *) re_string_get_buffer (&mctx->input);
2707 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2708 /* We don't need to search this sub expression any more. */
2709 break;
2711 bkref_str_off += sl_str_diff;
2712 sl_str += sl_str_diff;
2713 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2714 bkref_str_idx);
2716 /* Reload buf, since the preceding call might have reallocated
2717 the buffer. */
2718 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 if (err == REG_NOMATCH)
2721 continue;
2722 if (__glibc_unlikely (err != REG_NOERROR))
2723 return err;
2726 if (sub_last_idx < sub_top->nlasts)
2727 continue;
2728 if (sub_last_idx > 0)
2729 ++sl_str;
2730 /* Then, search for the other last nodes of the sub expression. */
2731 for (; sl_str <= bkref_str_idx; ++sl_str)
2733 Idx cls_node;
2734 regoff_t sl_str_off;
2735 const re_node_set *nodes;
2736 sl_str_off = sl_str - sub_top->str_idx;
2737 /* The matched string by the sub expression match with the substring
2738 at the back reference? */
2739 if (sl_str_off > 0)
2741 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2743 /* If we are at the end of the input, we cannot match. */
2744 if (bkref_str_off >= mctx->input.len)
2745 break;
2747 err = extend_buffers (mctx, bkref_str_off + 1);
2748 if (__glibc_unlikely (err != REG_NOERROR))
2749 return err;
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2753 if (buf [bkref_str_off++] != buf[sl_str - 1])
2754 break; /* We don't need to search this sub expression
2755 any more. */
2757 if (mctx->state_log[sl_str] == NULL)
2758 continue;
2759 /* Does this state have a ')' of the sub expression? */
2760 nodes = &mctx->state_log[sl_str]->nodes;
2761 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2762 OP_CLOSE_SUBEXP);
2763 if (cls_node == -1)
2764 continue; /* No. */
2765 if (sub_top->path == NULL)
2767 sub_top->path = calloc (sizeof (state_array_t),
2768 sl_str - sub_top->str_idx + 1);
2769 if (sub_top->path == NULL)
2770 return REG_ESPACE;
2772 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2773 in the current context? */
2774 err = check_arrival (mctx, sub_top->path, sub_top->node,
2775 sub_top->str_idx, cls_node, sl_str,
2776 OP_CLOSE_SUBEXP);
2777 if (err == REG_NOMATCH)
2778 continue;
2779 if (__glibc_unlikely (err != REG_NOERROR))
2780 return err;
2781 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2782 if (__glibc_unlikely (sub_last == NULL))
2783 return REG_ESPACE;
2784 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2785 bkref_str_idx);
2786 buf = (const char *) re_string_get_buffer (&mctx->input);
2787 if (err == REG_NOMATCH)
2788 continue;
2789 if (__glibc_unlikely (err != REG_NOERROR))
2790 return err;
2793 return REG_NOERROR;
2796 /* Helper functions for get_subexp(). */
2798 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2799 If it can arrive, register the sub expression expressed with SUB_TOP
2800 and SUB_LAST. */
2802 static reg_errcode_t
2803 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2804 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2806 reg_errcode_t err;
2807 Idx to_idx;
2808 /* Can the subexpression arrive the back reference? */
2809 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2810 sub_last->str_idx, bkref_node, bkref_str,
2811 OP_OPEN_SUBEXP);
2812 if (err != REG_NOERROR)
2813 return err;
2814 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2815 sub_last->str_idx);
2816 if (__glibc_unlikely (err != REG_NOERROR))
2817 return err;
2818 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2819 return clean_state_log_if_needed (mctx, to_idx);
2822 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2823 Search '(' if FL_OPEN, or search ')' otherwise.
2824 TODO: This function isn't efficient...
2825 Because there might be more than one nodes whose types are
2826 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2827 nodes.
2828 E.g. RE: (a){2} */
2830 static Idx
2831 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2832 Idx subexp_idx, int type)
2834 Idx cls_idx;
2835 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2837 Idx cls_node = nodes->elems[cls_idx];
2838 const re_token_t *node = dfa->nodes + cls_node;
2839 if (node->type == type
2840 && node->opr.idx == subexp_idx)
2841 return cls_node;
2843 return -1;
2846 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2847 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2848 heavily reused.
2849 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2851 static reg_errcode_t
2852 __attribute_warn_unused_result__
2853 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2854 Idx top_str, Idx last_node, Idx last_str, int type)
2856 const re_dfa_t *const dfa = mctx->dfa;
2857 reg_errcode_t err = REG_NOERROR;
2858 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2859 re_dfastate_t *cur_state = NULL;
2860 re_node_set *cur_nodes, next_nodes;
2861 re_dfastate_t **backup_state_log;
2862 unsigned int context;
2864 subexp_num = dfa->nodes[top_node].opr.idx;
2865 /* Extend the buffer if we need. */
2866 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2868 re_dfastate_t **new_array;
2869 Idx old_alloc = path->alloc;
2870 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2871 Idx new_alloc;
2872 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2873 return REG_ESPACE;
2874 new_alloc = old_alloc + incr_alloc;
2875 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2876 return REG_ESPACE;
2877 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2878 if (__glibc_unlikely (new_array == NULL))
2879 return REG_ESPACE;
2880 path->array = new_array;
2881 path->alloc = new_alloc;
2882 memset (new_array + old_alloc, '\0',
2883 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2886 str_idx = path->next_idx ? path->next_idx : top_str;
2888 /* Temporary modify MCTX. */
2889 backup_state_log = mctx->state_log;
2890 backup_cur_idx = mctx->input.cur_idx;
2891 mctx->state_log = path->array;
2892 mctx->input.cur_idx = str_idx;
2894 /* Setup initial node set. */
2895 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2896 if (str_idx == top_str)
2898 err = re_node_set_init_1 (&next_nodes, top_node);
2899 if (__glibc_unlikely (err != REG_NOERROR))
2900 return err;
2901 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2902 if (__glibc_unlikely (err != REG_NOERROR))
2904 re_node_set_free (&next_nodes);
2905 return err;
2908 else
2910 cur_state = mctx->state_log[str_idx];
2911 if (cur_state && cur_state->has_backref)
2913 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2914 if (__glibc_unlikely (err != REG_NOERROR))
2915 return err;
2917 else
2918 re_node_set_init_empty (&next_nodes);
2920 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2922 if (next_nodes.nelem)
2924 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2925 subexp_num, type);
2926 if (__glibc_unlikely (err != REG_NOERROR))
2928 re_node_set_free (&next_nodes);
2929 return err;
2932 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2933 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2935 re_node_set_free (&next_nodes);
2936 return err;
2938 mctx->state_log[str_idx] = cur_state;
2941 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2943 re_node_set_empty (&next_nodes);
2944 if (mctx->state_log[str_idx + 1])
2946 err = re_node_set_merge (&next_nodes,
2947 &mctx->state_log[str_idx + 1]->nodes);
2948 if (__glibc_unlikely (err != REG_NOERROR))
2950 re_node_set_free (&next_nodes);
2951 return err;
2954 if (cur_state)
2956 err = check_arrival_add_next_nodes (mctx, str_idx,
2957 &cur_state->non_eps_nodes,
2958 &next_nodes);
2959 if (__glibc_unlikely (err != REG_NOERROR))
2961 re_node_set_free (&next_nodes);
2962 return err;
2965 ++str_idx;
2966 if (next_nodes.nelem)
2968 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2969 if (__glibc_unlikely (err != REG_NOERROR))
2971 re_node_set_free (&next_nodes);
2972 return err;
2974 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2975 subexp_num, type);
2976 if (__glibc_unlikely (err != REG_NOERROR))
2978 re_node_set_free (&next_nodes);
2979 return err;
2982 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2983 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2984 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2986 re_node_set_free (&next_nodes);
2987 return err;
2989 mctx->state_log[str_idx] = cur_state;
2990 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2992 re_node_set_free (&next_nodes);
2993 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2994 : &mctx->state_log[last_str]->nodes);
2995 path->next_idx = str_idx;
2997 /* Fix MCTX. */
2998 mctx->state_log = backup_state_log;
2999 mctx->input.cur_idx = backup_cur_idx;
3001 /* Then check the current node set has the node LAST_NODE. */
3002 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3003 return REG_NOERROR;
3005 return REG_NOMATCH;
3008 /* Helper functions for check_arrival. */
3010 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3011 to NEXT_NODES.
3012 TODO: This function is similar to the functions transit_state*(),
3013 however this function has many additional works.
3014 Can't we unify them? */
3016 static reg_errcode_t
3017 __attribute_warn_unused_result__
3018 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3019 re_node_set *cur_nodes, re_node_set *next_nodes)
3021 const re_dfa_t *const dfa = mctx->dfa;
3022 bool ok;
3023 Idx cur_idx;
3024 #ifdef RE_ENABLE_I18N
3025 reg_errcode_t err = REG_NOERROR;
3026 #endif
3027 re_node_set union_set;
3028 re_node_set_init_empty (&union_set);
3029 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3031 int naccepted = 0;
3032 Idx cur_node = cur_nodes->elems[cur_idx];
3033 #ifdef DEBUG
3034 re_token_type_t type = dfa->nodes[cur_node].type;
3035 assert (!IS_EPSILON_NODE (type));
3036 #endif
3037 #ifdef RE_ENABLE_I18N
3038 /* If the node may accept "multi byte". */
3039 if (dfa->nodes[cur_node].accept_mb)
3041 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3042 str_idx);
3043 if (naccepted > 1)
3045 re_dfastate_t *dest_state;
3046 Idx next_node = dfa->nexts[cur_node];
3047 Idx next_idx = str_idx + naccepted;
3048 dest_state = mctx->state_log[next_idx];
3049 re_node_set_empty (&union_set);
3050 if (dest_state)
3052 err = re_node_set_merge (&union_set, &dest_state->nodes);
3053 if (__glibc_unlikely (err != REG_NOERROR))
3055 re_node_set_free (&union_set);
3056 return err;
3059 ok = re_node_set_insert (&union_set, next_node);
3060 if (__glibc_unlikely (! ok))
3062 re_node_set_free (&union_set);
3063 return REG_ESPACE;
3065 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3066 &union_set);
3067 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3068 && err != REG_NOERROR))
3070 re_node_set_free (&union_set);
3071 return err;
3075 #endif /* RE_ENABLE_I18N */
3076 if (naccepted
3077 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3079 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3080 if (__glibc_unlikely (! ok))
3082 re_node_set_free (&union_set);
3083 return REG_ESPACE;
3087 re_node_set_free (&union_set);
3088 return REG_NOERROR;
3091 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3092 CUR_NODES, however exclude the nodes which are:
3093 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3094 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3097 static reg_errcode_t
3098 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3099 Idx ex_subexp, int type)
3101 reg_errcode_t err;
3102 Idx idx, outside_node;
3103 re_node_set new_nodes;
3104 #ifdef DEBUG
3105 assert (cur_nodes->nelem);
3106 #endif
3107 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3108 if (__glibc_unlikely (err != REG_NOERROR))
3109 return err;
3110 /* Create a new node set NEW_NODES with the nodes which are epsilon
3111 closures of the node in CUR_NODES. */
3113 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3115 Idx cur_node = cur_nodes->elems[idx];
3116 const re_node_set *eclosure = dfa->eclosures + cur_node;
3117 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3118 if (outside_node == -1)
3120 /* There are no problematic nodes, just merge them. */
3121 err = re_node_set_merge (&new_nodes, eclosure);
3122 if (__glibc_unlikely (err != REG_NOERROR))
3124 re_node_set_free (&new_nodes);
3125 return err;
3128 else
3130 /* There are problematic nodes, re-calculate incrementally. */
3131 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3132 ex_subexp, type);
3133 if (__glibc_unlikely (err != REG_NOERROR))
3135 re_node_set_free (&new_nodes);
3136 return err;
3140 re_node_set_free (cur_nodes);
3141 *cur_nodes = new_nodes;
3142 return REG_NOERROR;
3145 /* Helper function for check_arrival_expand_ecl.
3146 Check incrementally the epsilon closure of TARGET, and if it isn't
3147 problematic append it to DST_NODES. */
3149 static reg_errcode_t
3150 __attribute_warn_unused_result__
3151 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3152 Idx target, Idx ex_subexp, int type)
3154 Idx cur_node;
3155 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3157 bool ok;
3159 if (dfa->nodes[cur_node].type == type
3160 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3162 if (type == OP_CLOSE_SUBEXP)
3164 ok = re_node_set_insert (dst_nodes, cur_node);
3165 if (__glibc_unlikely (! ok))
3166 return REG_ESPACE;
3168 break;
3170 ok = re_node_set_insert (dst_nodes, cur_node);
3171 if (__glibc_unlikely (! ok))
3172 return REG_ESPACE;
3173 if (dfa->edests[cur_node].nelem == 0)
3174 break;
3175 if (dfa->edests[cur_node].nelem == 2)
3177 reg_errcode_t err;
3178 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3179 dfa->edests[cur_node].elems[1],
3180 ex_subexp, type);
3181 if (__glibc_unlikely (err != REG_NOERROR))
3182 return err;
3184 cur_node = dfa->edests[cur_node].elems[0];
3186 return REG_NOERROR;
3190 /* For all the back references in the current state, calculate the
3191 destination of the back references by the appropriate entry
3192 in MCTX->BKREF_ENTS. */
3194 static reg_errcode_t
3195 __attribute_warn_unused_result__
3196 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3197 Idx cur_str, Idx subexp_num, int type)
3199 const re_dfa_t *const dfa = mctx->dfa;
3200 reg_errcode_t err;
3201 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3202 struct re_backref_cache_entry *ent;
3204 if (cache_idx_start == -1)
3205 return REG_NOERROR;
3207 restart:
3208 ent = mctx->bkref_ents + cache_idx_start;
3211 Idx to_idx, next_node;
3213 /* Is this entry ENT is appropriate? */
3214 if (!re_node_set_contains (cur_nodes, ent->node))
3215 continue; /* No. */
3217 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3218 /* Calculate the destination of the back reference, and append it
3219 to MCTX->STATE_LOG. */
3220 if (to_idx == cur_str)
3222 /* The backreference did epsilon transit, we must re-check all the
3223 node in the current state. */
3224 re_node_set new_dests;
3225 reg_errcode_t err2, err3;
3226 next_node = dfa->edests[ent->node].elems[0];
3227 if (re_node_set_contains (cur_nodes, next_node))
3228 continue;
3229 err = re_node_set_init_1 (&new_dests, next_node);
3230 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3231 err3 = re_node_set_merge (cur_nodes, &new_dests);
3232 re_node_set_free (&new_dests);
3233 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3234 || err3 != REG_NOERROR))
3236 err = (err != REG_NOERROR ? err
3237 : (err2 != REG_NOERROR ? err2 : err3));
3238 return err;
3240 /* TODO: It is still inefficient... */
3241 goto restart;
3243 else
3245 re_node_set union_set;
3246 next_node = dfa->nexts[ent->node];
3247 if (mctx->state_log[to_idx])
3249 bool ok;
3250 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3251 next_node))
3252 continue;
3253 err = re_node_set_init_copy (&union_set,
3254 &mctx->state_log[to_idx]->nodes);
3255 ok = re_node_set_insert (&union_set, next_node);
3256 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3258 re_node_set_free (&union_set);
3259 err = err != REG_NOERROR ? err : REG_ESPACE;
3260 return err;
3263 else
3265 err = re_node_set_init_1 (&union_set, next_node);
3266 if (__glibc_unlikely (err != REG_NOERROR))
3267 return err;
3269 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3270 re_node_set_free (&union_set);
3271 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3272 && err != REG_NOERROR))
3273 return err;
3276 while (ent++->more);
3277 return REG_NOERROR;
3280 /* Build transition table for the state.
3281 Return true if successful. */
3283 static bool
3284 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3286 reg_errcode_t err;
3287 Idx i, j;
3288 int ch;
3289 bool need_word_trtable = false;
3290 bitset_word_t elem, mask;
3291 bool dests_node_malloced = false;
3292 bool dest_states_malloced = false;
3293 Idx ndests; /* Number of the destination states from 'state'. */
3294 re_dfastate_t **trtable;
3295 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3296 re_node_set follows, *dests_node;
3297 bitset_t *dests_ch;
3298 bitset_t acceptable;
3300 struct dests_alloc
3302 re_node_set dests_node[SBC_MAX];
3303 bitset_t dests_ch[SBC_MAX];
3304 } *dests_alloc;
3306 /* We build DFA states which corresponds to the destination nodes
3307 from 'state'. 'dests_node[i]' represents the nodes which i-th
3308 destination state contains, and 'dests_ch[i]' represents the
3309 characters which i-th destination state accepts. */
3310 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3311 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3312 else
3314 dests_alloc = re_malloc (struct dests_alloc, 1);
3315 if (__glibc_unlikely (dests_alloc == NULL))
3316 return false;
3317 dests_node_malloced = true;
3319 dests_node = dests_alloc->dests_node;
3320 dests_ch = dests_alloc->dests_ch;
3322 /* Initialize transition table. */
3323 state->word_trtable = state->trtable = NULL;
3325 /* At first, group all nodes belonging to 'state' into several
3326 destinations. */
3327 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3328 if (__glibc_unlikely (ndests <= 0))
3330 if (dests_node_malloced)
3331 re_free (dests_alloc);
3332 /* Return false in case of an error, true otherwise. */
3333 if (ndests == 0)
3335 state->trtable = (re_dfastate_t **)
3336 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3337 if (__glibc_unlikely (state->trtable == NULL))
3338 return false;
3339 return true;
3341 return false;
3344 err = re_node_set_alloc (&follows, ndests + 1);
3345 if (__glibc_unlikely (err != REG_NOERROR))
3346 goto out_free;
3348 /* Avoid arithmetic overflow in size calculation. */
3349 size_t ndests_max
3350 = ((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3351 / (3 * sizeof (re_dfastate_t *)));
3352 if (__glibc_unlikely (ndests_max < ndests))
3353 goto out_free;
3355 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3356 + ndests * 3 * sizeof (re_dfastate_t *)))
3357 dest_states = (re_dfastate_t **)
3358 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3359 else
3361 dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3362 if (__glibc_unlikely (dest_states == NULL))
3364 out_free:
3365 if (dest_states_malloced)
3366 re_free (dest_states);
3367 re_node_set_free (&follows);
3368 for (i = 0; i < ndests; ++i)
3369 re_node_set_free (dests_node + i);
3370 if (dests_node_malloced)
3371 re_free (dests_alloc);
3372 return false;
3374 dest_states_malloced = true;
3376 dest_states_word = dest_states + ndests;
3377 dest_states_nl = dest_states_word + ndests;
3378 bitset_empty (acceptable);
3380 /* Then build the states for all destinations. */
3381 for (i = 0; i < ndests; ++i)
3383 Idx next_node;
3384 re_node_set_empty (&follows);
3385 /* Merge the follows of this destination states. */
3386 for (j = 0; j < dests_node[i].nelem; ++j)
3388 next_node = dfa->nexts[dests_node[i].elems[j]];
3389 if (next_node != -1)
3391 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392 if (__glibc_unlikely (err != REG_NOERROR))
3393 goto out_free;
3396 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3398 goto out_free;
3399 /* If the new state has context constraint,
3400 build appropriate states for these contexts. */
3401 if (dest_states[i]->has_constraint)
3403 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3404 CONTEXT_WORD);
3405 if (__glibc_unlikely (dest_states_word[i] == NULL
3406 && err != REG_NOERROR))
3407 goto out_free;
3409 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3410 need_word_trtable = true;
3412 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3413 CONTEXT_NEWLINE);
3414 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3415 goto out_free;
3417 else
3419 dest_states_word[i] = dest_states[i];
3420 dest_states_nl[i] = dest_states[i];
3422 bitset_merge (acceptable, dests_ch[i]);
3425 if (!__glibc_unlikely (need_word_trtable))
3427 /* We don't care about whether the following character is a word
3428 character, or we are in a single-byte character set so we can
3429 discern by looking at the character code: allocate a
3430 256-entry transition table. */
3431 trtable = state->trtable =
3432 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3433 if (__glibc_unlikely (trtable == NULL))
3434 goto out_free;
3436 /* For all characters ch...: */
3437 for (i = 0; i < BITSET_WORDS; ++i)
3438 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3439 elem;
3440 mask <<= 1, elem >>= 1, ++ch)
3441 if (__glibc_unlikely (elem & 1))
3443 /* There must be exactly one destination which accepts
3444 character ch. See group_nodes_into_DFAstates. */
3445 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3448 /* j-th destination accepts the word character ch. */
3449 if (dfa->word_char[i] & mask)
3450 trtable[ch] = dest_states_word[j];
3451 else
3452 trtable[ch] = dest_states[j];
3455 else
3457 /* We care about whether the following character is a word
3458 character, and we are in a multi-byte character set: discern
3459 by looking at the character code: build two 256-entry
3460 transition tables, one starting at trtable[0] and one
3461 starting at trtable[SBC_MAX]. */
3462 trtable = state->word_trtable =
3463 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3464 if (__glibc_unlikely (trtable == NULL))
3465 goto out_free;
3467 /* For all characters ch...: */
3468 for (i = 0; i < BITSET_WORDS; ++i)
3469 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3470 elem;
3471 mask <<= 1, elem >>= 1, ++ch)
3472 if (__glibc_unlikely (elem & 1))
3474 /* There must be exactly one destination which accepts
3475 character ch. See group_nodes_into_DFAstates. */
3476 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3479 /* j-th destination accepts the word character ch. */
3480 trtable[ch] = dest_states[j];
3481 trtable[ch + SBC_MAX] = dest_states_word[j];
3485 /* new line */
3486 if (bitset_contain (acceptable, NEWLINE_CHAR))
3488 /* The current state accepts newline character. */
3489 for (j = 0; j < ndests; ++j)
3490 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3492 /* k-th destination accepts newline character. */
3493 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3494 if (need_word_trtable)
3495 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3496 /* There must be only one destination which accepts
3497 newline. See group_nodes_into_DFAstates. */
3498 break;
3502 if (dest_states_malloced)
3503 re_free (dest_states);
3505 re_node_set_free (&follows);
3506 for (i = 0; i < ndests; ++i)
3507 re_node_set_free (dests_node + i);
3509 if (dests_node_malloced)
3510 re_free (dests_alloc);
3512 return true;
3515 /* Group all nodes belonging to STATE into several destinations.
3516 Then for all destinations, set the nodes belonging to the destination
3517 to DESTS_NODE[i] and set the characters accepted by the destination
3518 to DEST_CH[i]. This function return the number of destinations. */
3520 static Idx
3521 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3522 re_node_set *dests_node, bitset_t *dests_ch)
3524 reg_errcode_t err;
3525 bool ok;
3526 Idx i, j, k;
3527 Idx ndests; /* Number of the destinations from 'state'. */
3528 bitset_t accepts; /* Characters a node can accept. */
3529 const re_node_set *cur_nodes = &state->nodes;
3530 bitset_empty (accepts);
3531 ndests = 0;
3533 /* For all the nodes belonging to 'state', */
3534 for (i = 0; i < cur_nodes->nelem; ++i)
3536 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537 re_token_type_t type = node->type;
3538 unsigned int constraint = node->constraint;
3540 /* Enumerate all single byte character this node can accept. */
3541 if (type == CHARACTER)
3542 bitset_set (accepts, node->opr.c);
3543 else if (type == SIMPLE_BRACKET)
3545 bitset_merge (accepts, node->opr.sbcset);
3547 else if (type == OP_PERIOD)
3549 #ifdef RE_ENABLE_I18N
3550 if (dfa->mb_cur_max > 1)
3551 bitset_merge (accepts, dfa->sb_char);
3552 else
3553 #endif
3554 bitset_set_all (accepts);
3555 if (!(dfa->syntax & RE_DOT_NEWLINE))
3556 bitset_clear (accepts, '\n');
3557 if (dfa->syntax & RE_DOT_NOT_NULL)
3558 bitset_clear (accepts, '\0');
3560 #ifdef RE_ENABLE_I18N
3561 else if (type == OP_UTF8_PERIOD)
3563 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3564 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3565 else
3566 bitset_merge (accepts, utf8_sb_map);
3567 if (!(dfa->syntax & RE_DOT_NEWLINE))
3568 bitset_clear (accepts, '\n');
3569 if (dfa->syntax & RE_DOT_NOT_NULL)
3570 bitset_clear (accepts, '\0');
3572 #endif
3573 else
3574 continue;
3576 /* Check the 'accepts' and sift the characters which are not
3577 match it the context. */
3578 if (constraint)
3580 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3582 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3583 bitset_empty (accepts);
3584 if (accepts_newline)
3585 bitset_set (accepts, NEWLINE_CHAR);
3586 else
3587 continue;
3589 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3591 bitset_empty (accepts);
3592 continue;
3595 if (constraint & NEXT_WORD_CONSTRAINT)
3597 bitset_word_t any_set = 0;
3598 if (type == CHARACTER && !node->word_char)
3600 bitset_empty (accepts);
3601 continue;
3603 #ifdef RE_ENABLE_I18N
3604 if (dfa->mb_cur_max > 1)
3605 for (j = 0; j < BITSET_WORDS; ++j)
3606 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3607 else
3608 #endif
3609 for (j = 0; j < BITSET_WORDS; ++j)
3610 any_set |= (accepts[j] &= dfa->word_char[j]);
3611 if (!any_set)
3612 continue;
3614 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3616 bitset_word_t any_set = 0;
3617 if (type == CHARACTER && node->word_char)
3619 bitset_empty (accepts);
3620 continue;
3622 #ifdef RE_ENABLE_I18N
3623 if (dfa->mb_cur_max > 1)
3624 for (j = 0; j < BITSET_WORDS; ++j)
3625 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3626 else
3627 #endif
3628 for (j = 0; j < BITSET_WORDS; ++j)
3629 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3630 if (!any_set)
3631 continue;
3635 /* Then divide 'accepts' into DFA states, or create a new
3636 state. Above, we make sure that accepts is not empty. */
3637 for (j = 0; j < ndests; ++j)
3639 bitset_t intersec; /* Intersection sets, see below. */
3640 bitset_t remains;
3641 /* Flags, see below. */
3642 bitset_word_t has_intersec, not_subset, not_consumed;
3644 /* Optimization, skip if this state doesn't accept the character. */
3645 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3646 continue;
3648 /* Enumerate the intersection set of this state and 'accepts'. */
3649 has_intersec = 0;
3650 for (k = 0; k < BITSET_WORDS; ++k)
3651 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3652 /* And skip if the intersection set is empty. */
3653 if (!has_intersec)
3654 continue;
3656 /* Then check if this state is a subset of 'accepts'. */
3657 not_subset = not_consumed = 0;
3658 for (k = 0; k < BITSET_WORDS; ++k)
3660 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3661 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3664 /* If this state isn't a subset of 'accepts', create a
3665 new group state, which has the 'remains'. */
3666 if (not_subset)
3668 bitset_copy (dests_ch[ndests], remains);
3669 bitset_copy (dests_ch[j], intersec);
3670 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3671 if (__glibc_unlikely (err != REG_NOERROR))
3672 goto error_return;
3673 ++ndests;
3676 /* Put the position in the current group. */
3677 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3678 if (__glibc_unlikely (! ok))
3679 goto error_return;
3681 /* If all characters are consumed, go to next node. */
3682 if (!not_consumed)
3683 break;
3685 /* Some characters remain, create a new group. */
3686 if (j == ndests)
3688 bitset_copy (dests_ch[ndests], accepts);
3689 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3690 if (__glibc_unlikely (err != REG_NOERROR))
3691 goto error_return;
3692 ++ndests;
3693 bitset_empty (accepts);
3696 return ndests;
3697 error_return:
3698 for (j = 0; j < ndests; ++j)
3699 re_node_set_free (dests_node + j);
3700 return -1;
3703 #ifdef RE_ENABLE_I18N
3704 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3705 Return the number of the bytes the node accepts.
3706 STR_IDX is the current index of the input string.
3708 This function handles the nodes which can accept one character, or
3709 one collating element like '.', '[a-z]', opposite to the other nodes
3710 can only accept one byte. */
3712 # ifdef _LIBC
3713 # include <locale/weight.h>
3714 # endif
3716 static int
3717 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3718 const re_string_t *input, Idx str_idx)
3720 const re_token_t *node = dfa->nodes + node_idx;
3721 int char_len, elem_len;
3722 Idx i;
3724 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3726 unsigned char c = re_string_byte_at (input, str_idx), d;
3727 if (__glibc_likely (c < 0xc2))
3728 return 0;
3730 if (str_idx + 2 > input->len)
3731 return 0;
3733 d = re_string_byte_at (input, str_idx + 1);
3734 if (c < 0xe0)
3735 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3736 else if (c < 0xf0)
3738 char_len = 3;
3739 if (c == 0xe0 && d < 0xa0)
3740 return 0;
3742 else if (c < 0xf8)
3744 char_len = 4;
3745 if (c == 0xf0 && d < 0x90)
3746 return 0;
3748 else if (c < 0xfc)
3750 char_len = 5;
3751 if (c == 0xf8 && d < 0x88)
3752 return 0;
3754 else if (c < 0xfe)
3756 char_len = 6;
3757 if (c == 0xfc && d < 0x84)
3758 return 0;
3760 else
3761 return 0;
3763 if (str_idx + char_len > input->len)
3764 return 0;
3766 for (i = 1; i < char_len; ++i)
3768 d = re_string_byte_at (input, str_idx + i);
3769 if (d < 0x80 || d > 0xbf)
3770 return 0;
3772 return char_len;
3775 char_len = re_string_char_size_at (input, str_idx);
3776 if (node->type == OP_PERIOD)
3778 if (char_len <= 1)
3779 return 0;
3780 /* FIXME: I don't think this if is needed, as both '\n'
3781 and '\0' are char_len == 1. */
3782 /* '.' accepts any one character except the following two cases. */
3783 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3784 re_string_byte_at (input, str_idx) == '\n') ||
3785 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3786 re_string_byte_at (input, str_idx) == '\0'))
3787 return 0;
3788 return char_len;
3791 elem_len = re_string_elem_size_at (input, str_idx);
3792 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3793 return 0;
3795 if (node->type == COMPLEX_BRACKET)
3797 const re_charset_t *cset = node->opr.mbcset;
3798 # ifdef _LIBC
3799 const unsigned char *pin
3800 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3801 Idx j;
3802 uint32_t nrules;
3803 # endif /* _LIBC */
3804 int match_len = 0;
3805 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3806 ? re_string_wchar_at (input, str_idx) : 0);
3808 /* match with multibyte character? */
3809 for (i = 0; i < cset->nmbchars; ++i)
3810 if (wc == cset->mbchars[i])
3812 match_len = char_len;
3813 goto check_node_accept_bytes_match;
3815 /* match with character_class? */
3816 for (i = 0; i < cset->nchar_classes; ++i)
3818 wctype_t wt = cset->char_classes[i];
3819 if (__iswctype (wc, wt))
3821 match_len = char_len;
3822 goto check_node_accept_bytes_match;
3826 # ifdef _LIBC
3827 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3828 if (nrules != 0)
3830 unsigned int in_collseq = 0;
3831 const int32_t *table, *indirect;
3832 const unsigned char *weights, *extra;
3833 const char *collseqwc;
3835 /* match with collating_symbol? */
3836 if (cset->ncoll_syms)
3837 extra = (const unsigned char *)
3838 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3839 for (i = 0; i < cset->ncoll_syms; ++i)
3841 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3842 /* Compare the length of input collating element and
3843 the length of current collating element. */
3844 if (*coll_sym != elem_len)
3845 continue;
3846 /* Compare each bytes. */
3847 for (j = 0; j < *coll_sym; j++)
3848 if (pin[j] != coll_sym[1 + j])
3849 break;
3850 if (j == *coll_sym)
3852 /* Match if every bytes is equal. */
3853 match_len = j;
3854 goto check_node_accept_bytes_match;
3858 if (cset->nranges)
3860 if (elem_len <= char_len)
3862 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3863 in_collseq = __collseq_table_lookup (collseqwc, wc);
3865 else
3866 in_collseq = find_collation_sequence_value (pin, elem_len);
3868 /* match with range expression? */
3869 /* FIXME: Implement rational ranges here, too. */
3870 for (i = 0; i < cset->nranges; ++i)
3871 if (cset->range_starts[i] <= in_collseq
3872 && in_collseq <= cset->range_ends[i])
3874 match_len = elem_len;
3875 goto check_node_accept_bytes_match;
3878 /* match with equivalence_class? */
3879 if (cset->nequiv_classes)
3881 const unsigned char *cp = pin;
3882 table = (const int32_t *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3884 weights = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3886 extra = (const unsigned char *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3888 indirect = (const int32_t *)
3889 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3890 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3891 int32_t rule = idx >> 24;
3892 idx &= 0xffffff;
3893 if (idx > 0)
3895 size_t weight_len = weights[idx];
3896 for (i = 0; i < cset->nequiv_classes; ++i)
3898 int32_t equiv_class_idx = cset->equiv_classes[i];
3899 int32_t equiv_class_rule = equiv_class_idx >> 24;
3900 equiv_class_idx &= 0xffffff;
3901 if (weights[equiv_class_idx] == weight_len
3902 && equiv_class_rule == rule
3903 && memcmp (weights + idx + 1,
3904 weights + equiv_class_idx + 1,
3905 weight_len) == 0)
3907 match_len = elem_len;
3908 goto check_node_accept_bytes_match;
3914 else
3915 # endif /* _LIBC */
3917 /* match with range expression? */
3918 for (i = 0; i < cset->nranges; ++i)
3920 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3922 match_len = char_len;
3923 goto check_node_accept_bytes_match;
3927 check_node_accept_bytes_match:
3928 if (!cset->non_match)
3929 return match_len;
3930 else
3932 if (match_len > 0)
3933 return 0;
3934 else
3935 return (elem_len > char_len) ? elem_len : char_len;
3938 return 0;
3941 # ifdef _LIBC
3942 static unsigned int
3943 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3945 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3946 if (nrules == 0)
3948 if (mbs_len == 1)
3950 /* No valid character. Match it as a single byte character. */
3951 const unsigned char *collseq = (const unsigned char *)
3952 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3953 return collseq[mbs[0]];
3955 return UINT_MAX;
3957 else
3959 int32_t idx;
3960 const unsigned char *extra = (const unsigned char *)
3961 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3962 int32_t extrasize = (const unsigned char *)
3963 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3965 for (idx = 0; idx < extrasize;)
3967 int mbs_cnt;
3968 bool found = false;
3969 int32_t elem_mbs_len;
3970 /* Skip the name of collating element name. */
3971 idx = idx + extra[idx] + 1;
3972 elem_mbs_len = extra[idx++];
3973 if (mbs_len == elem_mbs_len)
3975 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3976 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3977 break;
3978 if (mbs_cnt == elem_mbs_len)
3979 /* Found the entry. */
3980 found = true;
3982 /* Skip the byte sequence of the collating element. */
3983 idx += elem_mbs_len;
3984 /* Adjust for the alignment. */
3985 idx = (idx + 3) & ~3;
3986 /* Skip the collation sequence value. */
3987 idx += sizeof (uint32_t);
3988 /* Skip the wide char sequence of the collating element. */
3989 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3990 /* If we found the entry, return the sequence value. */
3991 if (found)
3992 return *(uint32_t *) (extra + idx);
3993 /* Skip the collation sequence value. */
3994 idx += sizeof (uint32_t);
3996 return UINT_MAX;
3999 # endif /* _LIBC */
4000 #endif /* RE_ENABLE_I18N */
4002 /* Check whether the node accepts the byte which is IDX-th
4003 byte of the INPUT. */
4005 static bool
4006 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4007 Idx idx)
4009 unsigned char ch;
4010 ch = re_string_byte_at (&mctx->input, idx);
4011 switch (node->type)
4013 case CHARACTER:
4014 if (node->opr.c != ch)
4015 return false;
4016 break;
4018 case SIMPLE_BRACKET:
4019 if (!bitset_contain (node->opr.sbcset, ch))
4020 return false;
4021 break;
4023 #ifdef RE_ENABLE_I18N
4024 case OP_UTF8_PERIOD:
4025 if (ch >= ASCII_CHARS)
4026 return false;
4027 FALLTHROUGH;
4028 #endif
4029 case OP_PERIOD:
4030 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4031 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4032 return false;
4033 break;
4035 default:
4036 return false;
4039 if (node->constraint)
4041 /* The node has constraints. Check whether the current context
4042 satisfies the constraints. */
4043 unsigned int context = re_string_context_at (&mctx->input, idx,
4044 mctx->eflags);
4045 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4046 return false;
4049 return true;
4052 /* Extend the buffers, if the buffers have run out. */
4054 static reg_errcode_t
4055 __attribute_warn_unused_result__
4056 extend_buffers (re_match_context_t *mctx, int min_len)
4058 reg_errcode_t ret;
4059 re_string_t *pstr = &mctx->input;
4061 /* Avoid overflow. */
4062 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4063 <= pstr->bufs_len))
4064 return REG_ESPACE;
4066 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4067 ret = re_string_realloc_buffers (pstr,
4068 MAX (min_len,
4069 MIN (pstr->len, pstr->bufs_len * 2)));
4070 if (__glibc_unlikely (ret != REG_NOERROR))
4071 return ret;
4073 if (mctx->state_log != NULL)
4075 /* And double the length of state_log. */
4076 /* XXX We have no indication of the size of this buffer. If this
4077 allocation fail we have no indication that the state_log array
4078 does not have the right size. */
4079 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4080 pstr->bufs_len + 1);
4081 if (__glibc_unlikely (new_array == NULL))
4082 return REG_ESPACE;
4083 mctx->state_log = new_array;
4086 /* Then reconstruct the buffers. */
4087 if (pstr->icase)
4089 #ifdef RE_ENABLE_I18N
4090 if (pstr->mb_cur_max > 1)
4092 ret = build_wcs_upper_buffer (pstr);
4093 if (__glibc_unlikely (ret != REG_NOERROR))
4094 return ret;
4096 else
4097 #endif /* RE_ENABLE_I18N */
4098 build_upper_buffer (pstr);
4100 else
4102 #ifdef RE_ENABLE_I18N
4103 if (pstr->mb_cur_max > 1)
4104 build_wcs_buffer (pstr);
4105 else
4106 #endif /* RE_ENABLE_I18N */
4108 if (pstr->trans != NULL)
4109 re_string_translate_buffer (pstr);
4112 return REG_NOERROR;
4116 /* Functions for matching context. */
4118 /* Initialize MCTX. */
4120 static reg_errcode_t
4121 __attribute_warn_unused_result__
4122 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4124 mctx->eflags = eflags;
4125 mctx->match_last = -1;
4126 if (n > 0)
4128 /* Avoid overflow. */
4129 size_t max_object_size =
4130 MAX (sizeof (struct re_backref_cache_entry),
4131 sizeof (re_sub_match_top_t *));
4132 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4133 return REG_ESPACE;
4135 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4136 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4137 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4138 return REG_ESPACE;
4140 /* Already zero-ed by the caller.
4141 else
4142 mctx->bkref_ents = NULL;
4143 mctx->nbkref_ents = 0;
4144 mctx->nsub_tops = 0; */
4145 mctx->abkref_ents = n;
4146 mctx->max_mb_elem_len = 1;
4147 mctx->asub_tops = n;
4148 return REG_NOERROR;
4151 /* Clean the entries which depend on the current input in MCTX.
4152 This function must be invoked when the matcher changes the start index
4153 of the input, or changes the input string. */
4155 static void
4156 match_ctx_clean (re_match_context_t *mctx)
4158 Idx st_idx;
4159 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4161 Idx sl_idx;
4162 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4163 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4165 re_sub_match_last_t *last = top->lasts[sl_idx];
4166 re_free (last->path.array);
4167 re_free (last);
4169 re_free (top->lasts);
4170 if (top->path)
4172 re_free (top->path->array);
4173 re_free (top->path);
4175 re_free (top);
4178 mctx->nsub_tops = 0;
4179 mctx->nbkref_ents = 0;
4182 /* Free all the memory associated with MCTX. */
4184 static void
4185 match_ctx_free (re_match_context_t *mctx)
4187 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4188 match_ctx_clean (mctx);
4189 re_free (mctx->sub_tops);
4190 re_free (mctx->bkref_ents);
4193 /* Add a new backreference entry to MCTX.
4194 Note that we assume that caller never call this function with duplicate
4195 entry, and call with STR_IDX which isn't smaller than any existing entry.
4198 static reg_errcode_t
4199 __attribute_warn_unused_result__
4200 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4201 Idx to)
4203 if (mctx->nbkref_ents >= mctx->abkref_ents)
4205 struct re_backref_cache_entry* new_entry;
4206 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4207 mctx->abkref_ents * 2);
4208 if (__glibc_unlikely (new_entry == NULL))
4210 re_free (mctx->bkref_ents);
4211 return REG_ESPACE;
4213 mctx->bkref_ents = new_entry;
4214 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4215 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4216 mctx->abkref_ents *= 2;
4218 if (mctx->nbkref_ents > 0
4219 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4220 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4222 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4223 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4224 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4225 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4227 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4228 If bit N is clear, means that this entry won't epsilon-transition to
4229 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4230 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4231 such node.
4233 A backreference does not epsilon-transition unless it is empty, so set
4234 to all zeros if FROM != TO. */
4235 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4236 = (from == to ? -1 : 0);
4238 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4239 if (mctx->max_mb_elem_len < to - from)
4240 mctx->max_mb_elem_len = to - from;
4241 return REG_NOERROR;
4244 /* Return the first entry with the same str_idx, or -1 if none is
4245 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4247 static Idx
4248 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4250 Idx left, right, mid, last;
4251 last = right = mctx->nbkref_ents;
4252 for (left = 0; left < right;)
4254 mid = (left + right) / 2;
4255 if (mctx->bkref_ents[mid].str_idx < str_idx)
4256 left = mid + 1;
4257 else
4258 right = mid;
4260 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4261 return left;
4262 else
4263 return -1;
4266 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4267 at STR_IDX. */
4269 static reg_errcode_t
4270 __attribute_warn_unused_result__
4271 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4273 #ifdef DEBUG
4274 assert (mctx->sub_tops != NULL);
4275 assert (mctx->asub_tops > 0);
4276 #endif
4277 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4279 Idx new_asub_tops = mctx->asub_tops * 2;
4280 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4281 re_sub_match_top_t *,
4282 new_asub_tops);
4283 if (__glibc_unlikely (new_array == NULL))
4284 return REG_ESPACE;
4285 mctx->sub_tops = new_array;
4286 mctx->asub_tops = new_asub_tops;
4288 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4289 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4290 return REG_ESPACE;
4291 mctx->sub_tops[mctx->nsub_tops]->node = node;
4292 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4293 return REG_NOERROR;
4296 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4297 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4299 static re_sub_match_last_t *
4300 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4302 re_sub_match_last_t *new_entry;
4303 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4305 Idx new_alasts = 2 * subtop->alasts + 1;
4306 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4307 re_sub_match_last_t *,
4308 new_alasts);
4309 if (__glibc_unlikely (new_array == NULL))
4310 return NULL;
4311 subtop->lasts = new_array;
4312 subtop->alasts = new_alasts;
4314 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4315 if (__glibc_likely (new_entry != NULL))
4317 subtop->lasts[subtop->nlasts] = new_entry;
4318 new_entry->node = node;
4319 new_entry->str_idx = str_idx;
4320 ++subtop->nlasts;
4322 return new_entry;
4325 static void
4326 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4327 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4329 sctx->sifted_states = sifted_sts;
4330 sctx->limited_states = limited_sts;
4331 sctx->last_node = last_node;
4332 sctx->last_str_idx = last_str_idx;
4333 re_node_set_init_empty (&sctx->limits);