exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / regexec.c
blob9f065dfa020ce1d740d7985e2d0d60db1bc1986d
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
70 static int sift_states_iter_mb (const re_match_context_t *mctx,
71 re_sift_context_t *sctx,
72 Idx node_idx, Idx str_idx, Idx max_str_idx);
73 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74 re_sift_context_t *sctx);
75 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76 re_sift_context_t *sctx, Idx str_idx,
77 re_node_set *cur_dest);
78 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx str_idx,
81 re_node_set *dest_nodes);
82 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83 re_node_set *dest_nodes,
84 const re_node_set *candidates);
85 static bool check_dst_limits (const re_match_context_t *mctx,
86 const re_node_set *limits,
87 Idx dst_node, Idx dst_idx, Idx src_node,
88 Idx src_idx);
89 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90 int boundaries, Idx subexp_idx,
91 Idx from_node, Idx bkref_idx);
92 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93 Idx limit, Idx subexp_idx,
94 Idx node, Idx str_idx,
95 Idx bkref_idx);
96 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates,
99 re_node_set *limits,
100 struct re_backref_cache_entry *bkref_ents,
101 Idx str_idx);
102 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103 re_sift_context_t *sctx,
104 Idx str_idx, const re_node_set *candidates);
105 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106 re_dfastate_t **dst,
107 re_dfastate_t **src, Idx num);
108 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109 re_match_context_t *mctx);
110 static re_dfastate_t *transit_state (reg_errcode_t *err,
111 re_match_context_t *mctx,
112 re_dfastate_t *state);
113 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114 re_match_context_t *mctx,
115 re_dfastate_t *next_state);
116 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117 re_node_set *cur_nodes,
118 Idx str_idx);
119 #if 0
120 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121 re_match_context_t *mctx,
122 re_dfastate_t *pstate);
123 #endif
124 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125 re_dfastate_t *pstate);
126 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127 const re_node_set *nodes);
128 static reg_errcode_t get_subexp (re_match_context_t *mctx,
129 Idx bkref_node, Idx bkref_str_idx);
130 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131 const re_sub_match_top_t *sub_top,
132 re_sub_match_last_t *sub_last,
133 Idx bkref_node, Idx bkref_str);
134 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135 Idx subexp_idx, int type);
136 static reg_errcode_t check_arrival (re_match_context_t *mctx,
137 state_array_t *path, Idx top_node,
138 Idx top_str, Idx last_node, Idx last_str,
139 int type);
140 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141 Idx str_idx,
142 re_node_set *cur_nodes,
143 re_node_set *next_nodes);
144 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145 re_node_set *cur_nodes,
146 Idx ex_subexp, int type);
147 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148 re_node_set *dst_nodes,
149 Idx target, Idx ex_subexp,
150 int type);
151 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152 re_node_set *cur_nodes, Idx cur_str,
153 Idx subexp_num, int type);
154 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156 const re_string_t *input, Idx idx);
157 #ifdef _LIBC
158 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159 size_t name_len);
160 #endif
161 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162 const re_dfastate_t *state,
163 re_node_set *states_node,
164 bitset_t *states_ch);
165 static bool check_node_accept (const re_match_context_t *mctx,
166 const re_token_t *node, Idx idx);
167 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
169 /* Entry point for POSIX code. */
171 /* regexec searches for a given pattern, specified by PREG, in the
172 string STRING.
174 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
175 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
176 least NMATCH elements, and we set them to the offsets of the
177 corresponding matched substrings.
179 EFLAGS specifies "execution flags" which affect matching: if
180 REG_NOTBOL is set, then ^ does not match at the beginning of the
181 string; if REG_NOTEOL is set, then $ does not match at the end.
183 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
184 EFLAGS is invalid. */
187 regexec (const regex_t *__restrict preg, const char *__restrict string,
188 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
190 reg_errcode_t err;
191 Idx start, length;
192 re_dfa_t *dfa = preg->buffer;
194 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
195 return REG_BADPAT;
197 if (eflags & REG_STARTEND)
199 start = pmatch[0].rm_so;
200 length = pmatch[0].rm_eo;
202 else
204 start = 0;
205 length = strlen (string);
208 lock_lock (dfa->lock);
209 if (preg->no_sub)
210 err = re_search_internal (preg, string, length, start, length,
211 length, 0, NULL, eflags);
212 else
213 err = re_search_internal (preg, string, length, start, length,
214 length, nmatch, pmatch, eflags);
215 lock_unlock (dfa->lock);
216 return err != REG_NOERROR;
219 #ifdef _LIBC
220 libc_hidden_def (__regexec)
222 # include <shlib-compat.h>
223 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
225 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
226 __typeof__ (__regexec) __compat_regexec;
229 attribute_compat_text_section
230 __compat_regexec (const regex_t *__restrict preg,
231 const char *__restrict string, size_t nmatch,
232 regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
234 return regexec (preg, string, nmatch, pmatch,
235 eflags & (REG_NOTBOL | REG_NOTEOL));
237 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
238 # endif
239 #endif
241 /* Entry points for GNU code. */
243 /* re_match, re_search, re_match_2, re_search_2
245 The former two functions operate on STRING with length LENGTH,
246 while the later two operate on concatenation of STRING1 and STRING2
247 with lengths LENGTH1 and LENGTH2, respectively.
249 re_match() matches the compiled pattern in BUFP against the string,
250 starting at index START.
252 re_search() first tries matching at index START, then it tries to match
253 starting from index START + 1, and so on. The last start position tried
254 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
255 way as re_match().)
257 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258 the first STOP characters of the concatenation of the strings should be
259 concerned.
261 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262 and all groups is stored in REGS. (For the "_2" variants, the offsets are
263 computed relative to the concatenation, not relative to the individual
264 strings.)
266 On success, re_match* functions return the length of the match, re_search*
267 return the position of the start of the match. They return -1 on
268 match failure, -2 on error. */
270 regoff_t
271 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
272 Idx start, struct re_registers *regs)
274 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
276 #ifdef _LIBC
277 weak_alias (__re_match, re_match)
278 #endif
280 regoff_t
281 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
282 Idx start, regoff_t range, struct re_registers *regs)
284 return re_search_stub (bufp, string, length, start, range, length, regs,
285 false);
287 #ifdef _LIBC
288 weak_alias (__re_search, re_search)
289 #endif
291 regoff_t
292 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
293 const char *string2, Idx length2, Idx start,
294 struct re_registers *regs, Idx stop)
296 return re_search_2_stub (bufp, string1, length1, string2, length2,
297 start, 0, regs, stop, true);
299 #ifdef _LIBC
300 weak_alias (__re_match_2, re_match_2)
301 #endif
303 regoff_t
304 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
305 const char *string2, Idx length2, Idx start, regoff_t range,
306 struct re_registers *regs, Idx stop)
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
309 start, range, regs, stop, false);
311 #ifdef _LIBC
312 weak_alias (__re_search_2, re_search_2)
313 #endif
315 static regoff_t
316 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
317 Idx length1, const char *string2, Idx length2, Idx start,
318 regoff_t range, struct re_registers *regs,
319 Idx stop, bool ret_len)
321 const char *str;
322 regoff_t rval;
323 Idx len;
324 char *s = NULL;
326 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
327 || ckd_add (&len, length1, length2))))
328 return -2;
330 /* Concatenate the strings. */
331 if (length2 > 0)
332 if (length1 > 0)
334 s = re_malloc (char, len);
336 if (__glibc_unlikely (s == NULL))
337 return -2;
338 #ifdef _LIBC
339 memcpy (__mempcpy (s, string1, length1), string2, length2);
340 #else
341 memcpy (s, string1, length1);
342 memcpy (s + length1, string2, length2);
343 #endif
344 str = s;
346 else
347 str = string2;
348 else
349 str = string1;
351 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
352 ret_len);
353 re_free (s);
354 return rval;
357 /* The parameters have the same meaning as those of re_search.
358 Additional parameters:
359 If RET_LEN is true the length of the match is returned (re_match style);
360 otherwise the position of the match is returned. */
362 static regoff_t
363 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
364 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
365 bool ret_len)
367 reg_errcode_t result;
368 regmatch_t *pmatch;
369 Idx nregs;
370 regoff_t rval;
371 int eflags = 0;
372 re_dfa_t *dfa = bufp->buffer;
373 Idx last_start = start + range;
375 /* Check for out-of-range. */
376 if (__glibc_unlikely (start < 0 || start > length))
377 return -1;
378 if (__glibc_unlikely (length < last_start
379 || (0 <= range && last_start < start)))
380 last_start = length;
381 else if (__glibc_unlikely (last_start < 0
382 || (range < 0 && start <= last_start)))
383 last_start = 0;
385 lock_lock (dfa->lock);
387 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
388 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
390 /* Compile fastmap if we haven't yet. */
391 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
392 re_compile_fastmap (bufp);
394 if (__glibc_unlikely (bufp->no_sub))
395 regs = NULL;
397 /* We need at least 1 register. */
398 if (regs == NULL)
399 nregs = 1;
400 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
401 && regs->num_regs <= bufp->re_nsub))
403 nregs = regs->num_regs;
404 if (__glibc_unlikely (nregs < 1))
406 /* Nothing can be copied to regs. */
407 regs = NULL;
408 nregs = 1;
411 else
412 nregs = bufp->re_nsub + 1;
413 pmatch = re_malloc (regmatch_t, nregs);
414 if (__glibc_unlikely (pmatch == NULL))
416 rval = -2;
417 goto out;
420 result = re_search_internal (bufp, string, length, start, last_start, stop,
421 nregs, pmatch, eflags);
423 rval = 0;
425 /* I hope we needn't fill their regs with -1's when no match was found. */
426 if (result != REG_NOERROR)
427 rval = result == REG_NOMATCH ? -1 : -2;
428 else if (regs != NULL)
430 /* If caller wants register contents data back, copy them. */
431 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
432 bufp->regs_allocated);
433 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
434 rval = -2;
437 if (__glibc_likely (rval == 0))
439 if (ret_len)
441 DEBUG_ASSERT (pmatch[0].rm_so == start);
442 rval = pmatch[0].rm_eo - start;
444 else
445 rval = pmatch[0].rm_so;
447 re_free (pmatch);
448 out:
449 lock_unlock (dfa->lock);
450 return rval;
453 static unsigned
454 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
455 int regs_allocated)
457 int rval = REGS_REALLOCATE;
458 Idx i;
459 Idx need_regs = nregs + 1;
460 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
461 uses. */
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated == REGS_UNALLOCATED)
465 { /* No. So allocate them with malloc. */
466 regs->start = re_malloc (regoff_t, need_regs);
467 if (__glibc_unlikely (regs->start == NULL))
468 return REGS_UNALLOCATED;
469 regs->end = re_malloc (regoff_t, need_regs);
470 if (__glibc_unlikely (regs->end == NULL))
472 re_free (regs->start);
473 return REGS_UNALLOCATED;
475 regs->num_regs = need_regs;
477 else if (regs_allocated == REGS_REALLOCATE)
478 { /* Yes. If we need more elements than were already
479 allocated, reallocate them. If we need fewer, just
480 leave it alone. */
481 if (__glibc_unlikely (need_regs > regs->num_regs))
483 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
484 regoff_t *new_end;
485 if (__glibc_unlikely (new_start == NULL))
486 return REGS_UNALLOCATED;
487 new_end = re_realloc (regs->end, regoff_t, need_regs);
488 if (__glibc_unlikely (new_end == NULL))
490 re_free (new_start);
491 return REGS_UNALLOCATED;
493 regs->start = new_start;
494 regs->end = new_end;
495 regs->num_regs = need_regs;
498 else
500 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
501 /* This function may not be called with REGS_FIXED and nregs too big. */
502 DEBUG_ASSERT (nregs <= regs->num_regs);
503 rval = REGS_FIXED;
506 /* Copy the regs. */
507 for (i = 0; i < nregs; ++i)
509 regs->start[i] = pmatch[i].rm_so;
510 regs->end[i] = pmatch[i].rm_eo;
512 for ( ; i < regs->num_regs; ++i)
513 regs->start[i] = regs->end[i] = -1;
515 return rval;
518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
520 this memory for recording register information. STARTS and ENDS
521 must be allocated using the malloc library routine, and must each
522 be at least NUM_REGS * sizeof (regoff_t) bytes long.
524 If NUM_REGS == 0, then subsequent matches should allocate their own
525 register data.
527 Unless this function is called, the first search or match using
528 PATTERN_BUFFER will allocate its own register data, without
529 freeing the old data. */
531 void
532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
535 if (num_regs)
537 bufp->regs_allocated = REGS_REALLOCATE;
538 regs->num_regs = num_regs;
539 regs->start = starts;
540 regs->end = ends;
542 else
544 bufp->regs_allocated = REGS_UNALLOCATED;
545 regs->num_regs = 0;
546 regs->start = regs->end = NULL;
549 #ifdef _LIBC
550 weak_alias (__re_set_registers, re_set_registers)
551 #endif
553 /* Entry points compatible with 4.2 BSD regex library. We don't define
554 them unless specifically requested. */
556 #if defined _REGEX_RE_COMP || defined _LIBC
558 # ifdef _LIBC
559 weak_function
560 # endif
561 re_exec (const char *s)
563 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
565 #endif /* _REGEX_RE_COMP */
567 /* Internal entry point. */
569 /* Searches for a compiled pattern PREG in the string STRING, whose
570 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
571 meaning as with regexec. LAST_START is START + RANGE, where
572 START and RANGE have the same meaning as with re_search.
573 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574 otherwise return the error code.
575 Note: We assume front end functions already check ranges.
576 (0 <= LAST_START && LAST_START <= LENGTH) */
578 static reg_errcode_t
579 __attribute_warn_unused_result__
580 re_search_internal (const regex_t *preg, const char *string, Idx length,
581 Idx start, Idx last_start, Idx stop, size_t nmatch,
582 regmatch_t pmatch[], int eflags)
584 reg_errcode_t err;
585 const re_dfa_t *dfa = preg->buffer;
586 Idx left_lim, right_lim;
587 int incr;
588 bool fl_longest_match;
589 int match_kind;
590 Idx match_first;
591 Idx match_last = -1;
592 Idx extra_nmatch;
593 bool sb;
594 int ch;
595 re_match_context_t mctx = { .dfa = dfa };
596 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
597 && start != last_start && !preg->can_be_null)
598 ? preg->fastmap : NULL);
599 RE_TRANSLATE_TYPE t = preg->translate;
601 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
602 nmatch -= extra_nmatch;
604 /* Check if the DFA haven't been compiled. */
605 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
606 || dfa->init_state_word == NULL
607 || dfa->init_state_nl == NULL
608 || dfa->init_state_begbuf == NULL))
609 return REG_NOMATCH;
611 /* We assume front-end functions already check them. */
612 DEBUG_ASSERT (0 <= last_start && last_start <= length);
614 /* If initial states with non-begbuf contexts have no elements,
615 the regex must be anchored. If preg->newline_anchor is set,
616 we'll never use init_state_nl, so do not check it. */
617 if (dfa->init_state->nodes.nelem == 0
618 && dfa->init_state_word->nodes.nelem == 0
619 && (dfa->init_state_nl->nodes.nelem == 0
620 || !preg->newline_anchor))
622 if (start != 0 && last_start != 0)
623 return REG_NOMATCH;
624 start = last_start = 0;
627 /* We must check the longest matching, if nmatch > 0. */
628 fl_longest_match = (nmatch != 0 || dfa->nbackref);
630 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
631 preg->translate, (preg->syntax & RE_ICASE) != 0,
632 dfa);
633 if (__glibc_unlikely (err != REG_NOERROR))
634 goto free_return;
635 mctx.input.stop = stop;
636 mctx.input.raw_stop = stop;
637 mctx.input.newline_anchor = preg->newline_anchor;
639 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
640 if (__glibc_unlikely (err != REG_NOERROR))
641 goto free_return;
643 /* We will log all the DFA states through which the dfa pass,
644 if nmatch > 1, or this dfa has "multibyte node", which is a
645 back-reference or a node which can accept multibyte character or
646 multi character collating element. */
647 if (nmatch > 1 || dfa->has_mb_node)
649 /* Avoid overflow. */
650 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
651 <= mctx.input.bufs_len)))
653 err = REG_ESPACE;
654 goto free_return;
657 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
658 if (__glibc_unlikely (mctx.state_log == NULL))
660 err = REG_ESPACE;
661 goto free_return;
665 match_first = start;
666 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
667 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
669 /* Check incrementally whether the input string matches. */
670 incr = (last_start < start) ? -1 : 1;
671 left_lim = (last_start < start) ? last_start : start;
672 right_lim = (last_start < start) ? start : last_start;
673 sb = dfa->mb_cur_max == 1;
674 match_kind =
675 (fastmap
676 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
677 | (start <= last_start ? 2 : 0)
678 | (t != NULL ? 1 : 0))
679 : 8);
681 for (;; match_first += incr)
683 err = REG_NOMATCH;
684 if (match_first < left_lim || right_lim < match_first)
685 goto free_return;
687 /* Advance as rapidly as possible through the string, until we
688 find a plausible place to start matching. This may be done
689 with varying efficiency, so there are various possibilities:
690 only the most common of them are specialized, in order to
691 save on code size. We use a switch statement for speed. */
692 switch (match_kind)
694 case 8:
695 /* No fastmap. */
696 break;
698 case 7:
699 /* Fastmap with single-byte translation, match forward. */
700 while (__glibc_likely (match_first < right_lim)
701 && !fastmap[t[(unsigned char) string[match_first]]])
702 ++match_first;
703 goto forward_match_found_start_or_reached_end;
705 case 6:
706 /* Fastmap without translation, match forward. */
707 while (__glibc_likely (match_first < right_lim)
708 && !fastmap[(unsigned char) string[match_first]])
709 ++match_first;
711 forward_match_found_start_or_reached_end:
712 if (__glibc_unlikely (match_first == right_lim))
714 ch = match_first >= length
715 ? 0 : (unsigned char) string[match_first];
716 if (!fastmap[t ? t[ch] : ch])
717 goto free_return;
719 break;
721 case 4:
722 case 5:
723 /* Fastmap without multi-byte translation, match backwards. */
724 while (match_first >= left_lim)
726 ch = match_first >= length
727 ? 0 : (unsigned char) string[match_first];
728 if (fastmap[t ? t[ch] : ch])
729 break;
730 --match_first;
732 if (match_first < left_lim)
733 goto free_return;
734 break;
736 default:
737 /* In this case, we can't determine easily the current byte,
738 since it might be a component byte of a multibyte
739 character. Then we use the constructed buffer instead. */
740 for (;;)
742 /* If MATCH_FIRST is out of the valid range, reconstruct the
743 buffers. */
744 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
745 if (__glibc_unlikely (offset
746 >= (__re_size_t) mctx.input.valid_raw_len))
748 err = re_string_reconstruct (&mctx.input, match_first,
749 eflags);
750 if (__glibc_unlikely (err != REG_NOERROR))
751 goto free_return;
753 offset = match_first - mctx.input.raw_mbs_idx;
755 /* Use buffer byte if OFFSET is in buffer, otherwise '\0'. */
756 ch = (offset < mctx.input.valid_len
757 ? re_string_byte_at (&mctx.input, offset) : 0);
758 if (fastmap[ch])
759 break;
760 match_first += incr;
761 if (match_first < left_lim || match_first > right_lim)
763 err = REG_NOMATCH;
764 goto free_return;
767 break;
770 /* Reconstruct the buffers so that the matcher can assume that
771 the matching starts from the beginning of the buffer. */
772 err = re_string_reconstruct (&mctx.input, match_first, eflags);
773 if (__glibc_unlikely (err != REG_NOERROR))
774 goto free_return;
776 /* Don't consider this char as a possible match start if it part,
777 yet isn't the head, of a multibyte character. */
778 if (!sb && !re_string_first_byte (&mctx.input, 0))
779 continue;
781 /* It seems to be appropriate one, then use the matcher. */
782 /* We assume that the matching starts from 0. */
783 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
784 match_last = check_matching (&mctx, fl_longest_match,
785 start <= last_start ? &match_first : NULL);
786 if (match_last != -1)
788 if (__glibc_unlikely (match_last == -2))
790 err = REG_ESPACE;
791 goto free_return;
793 else
795 mctx.match_last = match_last;
796 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
798 re_dfastate_t *pstate = mctx.state_log[match_last];
799 mctx.last_node = check_halt_state_context (&mctx, pstate,
800 match_last);
802 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
803 || dfa->nbackref)
805 err = prune_impossible_nodes (&mctx);
806 if (err == REG_NOERROR)
807 break;
808 if (__glibc_unlikely (err != REG_NOMATCH))
809 goto free_return;
810 match_last = -1;
812 else
813 break; /* We found a match. */
817 match_ctx_clean (&mctx);
820 DEBUG_ASSERT (match_last != -1);
821 DEBUG_ASSERT (err == REG_NOERROR);
823 /* Set pmatch[] if we need. */
824 if (nmatch > 0)
826 Idx reg_idx;
828 /* Initialize registers. */
829 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
830 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
832 /* Set the points where matching start/end. */
833 pmatch[0].rm_so = 0;
834 pmatch[0].rm_eo = mctx.match_last;
835 /* FIXME: This function should fail if mctx.match_last exceeds
836 the maximum possible regoff_t value. We need a new error
837 code REG_OVERFLOW. */
839 if (!preg->no_sub && nmatch > 1)
841 err = set_regs (preg, &mctx, nmatch, pmatch,
842 dfa->has_plural_match && dfa->nbackref > 0);
843 if (__glibc_unlikely (err != REG_NOERROR))
844 goto free_return;
847 /* At last, add the offset to each register, since we slid
848 the buffers so that we could assume that the matching starts
849 from 0. */
850 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
851 if (pmatch[reg_idx].rm_so != -1)
853 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
855 pmatch[reg_idx].rm_so =
856 (pmatch[reg_idx].rm_so == mctx.input.valid_len
857 ? mctx.input.valid_raw_len
858 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
859 pmatch[reg_idx].rm_eo =
860 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
861 ? mctx.input.valid_raw_len
862 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
864 pmatch[reg_idx].rm_so += match_first;
865 pmatch[reg_idx].rm_eo += match_first;
867 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
869 pmatch[nmatch + reg_idx].rm_so = -1;
870 pmatch[nmatch + reg_idx].rm_eo = -1;
873 if (dfa->subexp_map)
874 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
875 if (dfa->subexp_map[reg_idx] != reg_idx)
877 pmatch[reg_idx + 1].rm_so
878 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
879 pmatch[reg_idx + 1].rm_eo
880 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
884 free_return:
885 re_free (mctx.state_log);
886 if (dfa->nbackref)
887 match_ctx_free (&mctx);
888 re_string_destruct (&mctx.input);
889 return err;
892 static reg_errcode_t
893 __attribute_warn_unused_result__
894 prune_impossible_nodes (re_match_context_t *mctx)
896 const re_dfa_t *const dfa = mctx->dfa;
897 Idx halt_node, match_last;
898 reg_errcode_t ret;
899 re_dfastate_t **sifted_states;
900 re_dfastate_t **lim_states = NULL;
901 re_sift_context_t sctx;
902 DEBUG_ASSERT (mctx->state_log != NULL);
903 match_last = mctx->match_last;
904 halt_node = mctx->last_node;
906 /* Avoid overflow. */
907 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
908 <= match_last))
909 return REG_ESPACE;
911 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
912 if (__glibc_unlikely (sifted_states == NULL))
914 ret = REG_ESPACE;
915 goto free_return;
917 if (dfa->nbackref)
919 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
920 if (__glibc_unlikely (lim_states == NULL))
922 ret = REG_ESPACE;
923 goto free_return;
925 while (1)
927 memset (lim_states, '\0',
928 sizeof (re_dfastate_t *) * (match_last + 1));
929 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
930 match_last);
931 ret = sift_states_backward (mctx, &sctx);
932 re_node_set_free (&sctx.limits);
933 if (__glibc_unlikely (ret != REG_NOERROR))
934 goto free_return;
935 if (sifted_states[0] != NULL || lim_states[0] != NULL)
936 break;
939 --match_last;
940 if (match_last < 0)
942 ret = REG_NOMATCH;
943 goto free_return;
945 } while (mctx->state_log[match_last] == NULL
946 || !mctx->state_log[match_last]->halt);
947 halt_node = check_halt_state_context (mctx,
948 mctx->state_log[match_last],
949 match_last);
951 ret = merge_state_array (dfa, sifted_states, lim_states,
952 match_last + 1);
953 re_free (lim_states);
954 lim_states = NULL;
955 if (__glibc_unlikely (ret != REG_NOERROR))
956 goto free_return;
958 else
960 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
961 ret = sift_states_backward (mctx, &sctx);
962 re_node_set_free (&sctx.limits);
963 if (__glibc_unlikely (ret != REG_NOERROR))
964 goto free_return;
965 if (sifted_states[0] == NULL)
967 ret = REG_NOMATCH;
968 goto free_return;
971 re_free (mctx->state_log);
972 mctx->state_log = sifted_states;
973 sifted_states = NULL;
974 mctx->last_node = halt_node;
975 mctx->match_last = match_last;
976 ret = REG_NOERROR;
977 free_return:
978 re_free (sifted_states);
979 re_free (lim_states);
980 return ret;
983 /* Acquire an initial state and return it.
984 We must select appropriate initial state depending on the context,
985 since initial states may have constraints like "\<", "^", etc.. */
987 static __always_inline re_dfastate_t *
988 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
989 Idx idx)
991 const re_dfa_t *const dfa = mctx->dfa;
992 if (dfa->init_state->has_constraint)
994 unsigned int context;
995 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
996 if (IS_WORD_CONTEXT (context))
997 return dfa->init_state_word;
998 else if (IS_ORDINARY_CONTEXT (context))
999 return dfa->init_state;
1000 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1001 return dfa->init_state_begbuf;
1002 else if (IS_NEWLINE_CONTEXT (context))
1003 return dfa->init_state_nl;
1004 else if (IS_BEGBUF_CONTEXT (context))
1006 /* It is relatively rare case, then calculate on demand. */
1007 return re_acquire_state_context (err, dfa,
1008 dfa->init_state->entrance_nodes,
1009 context);
1011 else
1012 /* Must not happen? */
1013 return dfa->init_state;
1015 else
1016 return dfa->init_state;
1019 /* Check whether the regular expression match input string INPUT or not,
1020 and return the index where the matching end. Return -1 if
1021 there is no match, and return -2 in case of an error.
1022 FL_LONGEST_MATCH means we want the POSIX longest matching.
1023 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1024 next place where we may want to try matching.
1025 Note that the matcher assumes that the matching starts from the current
1026 index of the buffer. */
1028 static Idx
1029 __attribute_warn_unused_result__
1030 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1031 Idx *p_match_first)
1033 const re_dfa_t *const dfa = mctx->dfa;
1034 reg_errcode_t err;
1035 Idx match = 0;
1036 Idx match_last = -1;
1037 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1038 re_dfastate_t *cur_state;
1039 bool at_init_state = p_match_first != NULL;
1040 Idx next_start_idx = cur_str_idx;
1042 err = REG_NOERROR;
1043 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1044 /* An initial state must not be NULL (invalid). */
1045 if (__glibc_unlikely (cur_state == NULL))
1047 DEBUG_ASSERT (err == REG_ESPACE);
1048 return -2;
1051 if (mctx->state_log != NULL)
1053 mctx->state_log[cur_str_idx] = cur_state;
1055 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1056 later. E.g. Processing back references. */
1057 if (__glibc_unlikely (dfa->nbackref))
1059 at_init_state = false;
1060 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1061 if (__glibc_unlikely (err != REG_NOERROR))
1062 return err;
1064 if (cur_state->has_backref)
1066 err = transit_state_bkref (mctx, &cur_state->nodes);
1067 if (__glibc_unlikely (err != REG_NOERROR))
1068 return err;
1073 /* If the RE accepts NULL string. */
1074 if (__glibc_unlikely (cur_state->halt))
1076 if (!cur_state->has_constraint
1077 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1079 if (!fl_longest_match)
1080 return cur_str_idx;
1081 else
1083 match_last = cur_str_idx;
1084 match = 1;
1089 while (!re_string_eoi (&mctx->input))
1091 re_dfastate_t *old_state = cur_state;
1092 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1094 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1095 && mctx->input.bufs_len < mctx->input.len)
1096 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1097 && mctx->input.valid_len < mctx->input.len))
1099 err = extend_buffers (mctx, next_char_idx + 1);
1100 if (__glibc_unlikely (err != REG_NOERROR))
1102 DEBUG_ASSERT (err == REG_ESPACE);
1103 return -2;
1107 cur_state = transit_state (&err, mctx, cur_state);
1108 if (mctx->state_log != NULL)
1109 cur_state = merge_state_with_log (&err, mctx, cur_state);
1111 if (cur_state == NULL)
1113 /* Reached the invalid state or an error. Try to recover a valid
1114 state using the state log, if available and if we have not
1115 already found a valid (even if not the longest) match. */
1116 if (__glibc_unlikely (err != REG_NOERROR))
1117 return -2;
1119 if (mctx->state_log == NULL
1120 || (match && !fl_longest_match)
1121 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1122 break;
1125 if (__glibc_unlikely (at_init_state))
1127 if (old_state == cur_state)
1128 next_start_idx = next_char_idx;
1129 else
1130 at_init_state = false;
1133 if (cur_state->halt)
1135 /* Reached a halt state.
1136 Check the halt state can satisfy the current context. */
1137 if (!cur_state->has_constraint
1138 || check_halt_state_context (mctx, cur_state,
1139 re_string_cur_idx (&mctx->input)))
1141 /* We found an appropriate halt state. */
1142 match_last = re_string_cur_idx (&mctx->input);
1143 match = 1;
1145 /* We found a match, do not modify match_first below. */
1146 p_match_first = NULL;
1147 if (!fl_longest_match)
1148 break;
1153 if (p_match_first)
1154 *p_match_first += next_start_idx;
1156 return match_last;
1159 /* Check NODE match the current context. */
1161 static bool
1162 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1164 re_token_type_t type = dfa->nodes[node].type;
1165 unsigned int constraint = dfa->nodes[node].constraint;
1166 if (type != END_OF_RE)
1167 return false;
1168 if (!constraint)
1169 return true;
1170 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1171 return false;
1172 return true;
1175 /* Check the halt state STATE match the current context.
1176 Return 0 if not match, if the node, STATE has, is a halt node and
1177 match the context, return the node. */
1179 static Idx
1180 check_halt_state_context (const re_match_context_t *mctx,
1181 const re_dfastate_t *state, Idx idx)
1183 Idx i;
1184 unsigned int context;
1185 DEBUG_ASSERT (state->halt);
1186 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1187 for (i = 0; i < state->nodes.nelem; ++i)
1188 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1189 return state->nodes.elems[i];
1190 return 0;
1193 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1194 corresponding to the DFA).
1195 Return the destination node, and update EPS_VIA_NODES;
1196 return -1 on match failure, -2 on error. */
1198 static Idx
1199 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1200 regmatch_t *prevregs,
1201 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1202 struct re_fail_stack_t *fs)
1204 const re_dfa_t *const dfa = mctx->dfa;
1205 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1207 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1208 re_node_set *edests = &dfa->edests[node];
1210 if (! re_node_set_contains (eps_via_nodes, node))
1212 bool ok = re_node_set_insert (eps_via_nodes, node);
1213 if (__glibc_unlikely (! ok))
1214 return -2;
1217 /* Pick a valid destination, or return -1 if none is found. */
1218 Idx dest_node = -1;
1219 for (Idx i = 0; i < edests->nelem; i++)
1221 Idx candidate = edests->elems[i];
1222 if (!re_node_set_contains (cur_nodes, candidate))
1223 continue;
1224 if (dest_node == -1)
1225 dest_node = candidate;
1227 else
1229 /* In order to avoid infinite loop like "(a*)*", return the second
1230 epsilon-transition if the first was already considered. */
1231 if (re_node_set_contains (eps_via_nodes, dest_node))
1232 return candidate;
1234 /* Otherwise, push the second epsilon-transition on the fail stack. */
1235 else if (fs != NULL
1236 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1237 prevregs, eps_via_nodes))
1238 return -2;
1240 /* We know we are going to exit. */
1241 break;
1244 return dest_node;
1246 else
1248 Idx naccepted = 0;
1249 re_token_type_t type = dfa->nodes[node].type;
1251 if (dfa->nodes[node].accept_mb)
1252 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1253 else if (type == OP_BACK_REF)
1255 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1256 if (subexp_idx < nregs)
1257 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258 if (fs != NULL)
1260 if (subexp_idx >= nregs
1261 || regs[subexp_idx].rm_so == -1
1262 || regs[subexp_idx].rm_eo == -1)
1263 return -1;
1264 else if (naccepted)
1266 char *buf = (char *) re_string_get_buffer (&mctx->input);
1267 if (mctx->input.valid_len - *pidx < naccepted
1268 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1269 naccepted)
1270 != 0))
1271 return -1;
1275 if (naccepted == 0)
1277 Idx dest_node;
1278 bool ok = re_node_set_insert (eps_via_nodes, node);
1279 if (__glibc_unlikely (! ok))
1280 return -2;
1281 dest_node = dfa->edests[node].elems[0];
1282 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1283 dest_node))
1284 return dest_node;
1288 if (naccepted != 0
1289 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1291 Idx dest_node = dfa->nexts[node];
1292 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1293 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1294 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1295 dest_node)))
1296 return -1;
1297 re_node_set_empty (eps_via_nodes);
1298 return dest_node;
1301 return -1;
1304 static reg_errcode_t
1305 __attribute_warn_unused_result__
1306 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1307 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1308 re_node_set *eps_via_nodes)
1310 reg_errcode_t err;
1311 Idx num = fs->num;
1312 if (num == fs->alloc)
1314 struct re_fail_stack_ent_t *new_array;
1315 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1316 fs->alloc * 2);
1317 if (new_array == NULL)
1318 return REG_ESPACE;
1319 fs->alloc *= 2;
1320 fs->stack = new_array;
1322 fs->stack[num].idx = str_idx;
1323 fs->stack[num].node = dest_node;
1324 fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1325 if (fs->stack[num].regs == NULL)
1326 return REG_ESPACE;
1327 fs->num = num + 1;
1328 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1329 memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1330 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1331 return err;
1334 static Idx
1335 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1336 regmatch_t *regs, regmatch_t *prevregs,
1337 re_node_set *eps_via_nodes)
1339 if (fs == NULL || fs->num == 0)
1340 return -1;
1341 Idx num = --fs->num;
1342 *pidx = fs->stack[num].idx;
1343 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1344 memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1345 re_node_set_free (eps_via_nodes);
1346 re_free (fs->stack[num].regs);
1347 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1348 DEBUG_ASSERT (0 <= fs->stack[num].node);
1349 return fs->stack[num].node;
1353 #define DYNARRAY_STRUCT regmatch_list
1354 #define DYNARRAY_ELEMENT regmatch_t
1355 #define DYNARRAY_PREFIX regmatch_list_
1356 #include <malloc/dynarray-skeleton.c>
1358 /* Set the positions where the subexpressions are starts/ends to registers
1359 PMATCH.
1360 Note: We assume that pmatch[0] is already set, and
1361 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1363 static reg_errcode_t
1364 __attribute_warn_unused_result__
1365 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1366 regmatch_t *pmatch, bool fl_backtrack)
1368 const re_dfa_t *dfa = preg->buffer;
1369 Idx idx, cur_node;
1370 re_node_set eps_via_nodes;
1371 struct re_fail_stack_t *fs;
1372 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1373 struct regmatch_list prev_match;
1374 regmatch_list_init (&prev_match);
1376 DEBUG_ASSERT (nmatch > 1);
1377 DEBUG_ASSERT (mctx->state_log != NULL);
1378 if (fl_backtrack)
1380 fs = &fs_body;
1381 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1382 if (fs->stack == NULL)
1383 return REG_ESPACE;
1385 else
1386 fs = NULL;
1388 cur_node = dfa->init_node;
1389 re_node_set_init_empty (&eps_via_nodes);
1391 if (!regmatch_list_resize (&prev_match, nmatch))
1393 regmatch_list_free (&prev_match);
1394 free_fail_stack_return (fs);
1395 return REG_ESPACE;
1397 regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1398 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1400 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1402 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1404 if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1405 || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1407 Idx reg_idx;
1408 cur_node = -1;
1409 if (fs)
1411 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1412 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1414 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1415 prev_idx_match, &eps_via_nodes);
1416 break;
1419 if (cur_node < 0)
1421 re_node_set_free (&eps_via_nodes);
1422 regmatch_list_free (&prev_match);
1423 return free_fail_stack_return (fs);
1427 /* Proceed to next node. */
1428 cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1429 &idx, cur_node,
1430 &eps_via_nodes, fs);
1432 if (__glibc_unlikely (cur_node < 0))
1434 if (__glibc_unlikely (cur_node == -2))
1436 re_node_set_free (&eps_via_nodes);
1437 regmatch_list_free (&prev_match);
1438 free_fail_stack_return (fs);
1439 return REG_ESPACE;
1441 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1442 prev_idx_match, &eps_via_nodes);
1443 if (cur_node < 0)
1445 re_node_set_free (&eps_via_nodes);
1446 regmatch_list_free (&prev_match);
1447 free_fail_stack_return (fs);
1448 return REG_NOMATCH;
1452 re_node_set_free (&eps_via_nodes);
1453 regmatch_list_free (&prev_match);
1454 return free_fail_stack_return (fs);
1457 static reg_errcode_t
1458 free_fail_stack_return (struct re_fail_stack_t *fs)
1460 if (fs)
1462 Idx fs_idx;
1463 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1465 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1466 re_free (fs->stack[fs_idx].regs);
1468 re_free (fs->stack);
1470 return REG_NOERROR;
1473 static void
1474 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1475 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1477 int type = dfa->nodes[cur_node].type;
1478 if (type == OP_OPEN_SUBEXP)
1480 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1482 /* We are at the first node of this sub expression. */
1483 if (reg_num < nmatch)
1485 pmatch[reg_num].rm_so = cur_idx;
1486 pmatch[reg_num].rm_eo = -1;
1489 else if (type == OP_CLOSE_SUBEXP)
1491 /* We are at the last node of this sub expression. */
1492 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493 if (reg_num < nmatch)
1495 if (pmatch[reg_num].rm_so < cur_idx)
1497 pmatch[reg_num].rm_eo = cur_idx;
1498 /* This is a non-empty match or we are not inside an optional
1499 subexpression. Accept this right away. */
1500 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1502 else
1504 if (dfa->nodes[cur_node].opt_subexp
1505 && prev_idx_match[reg_num].rm_so != -1)
1506 /* We transited through an empty match for an optional
1507 subexpression, like (a?)*, and this is not the subexp's
1508 first match. Copy back the old content of the registers
1509 so that matches of an inner subexpression are undone as
1510 well, like in ((a?))*. */
1511 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1512 else
1513 /* We completed a subexpression, but it may be part of
1514 an optional one, so do not update PREV_IDX_MATCH. */
1515 pmatch[reg_num].rm_eo = cur_idx;
1521 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1522 and sift the nodes in each states according to the following rules.
1523 Updated state_log will be wrote to STATE_LOG.
1525 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1526 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1527 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1528 the LAST_NODE, we throw away the node 'a'.
1529 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1530 string 's' and transit to 'b':
1531 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1532 away the node 'a'.
1533 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1534 thrown away, we throw away the node 'a'.
1535 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1536 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1537 node 'a'.
1538 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1539 we throw away the node 'a'. */
1541 #define STATE_NODE_CONTAINS(state,node) \
1542 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1544 static reg_errcode_t
1545 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1547 reg_errcode_t err;
1548 int null_cnt = 0;
1549 Idx str_idx = sctx->last_str_idx;
1550 re_node_set cur_dest;
1552 DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1554 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1555 transit to the last_node and the last_node itself. */
1556 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1557 if (__glibc_unlikely (err != REG_NOERROR))
1558 return err;
1559 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1560 if (__glibc_unlikely (err != REG_NOERROR))
1561 goto free_return;
1563 /* Then check each states in the state_log. */
1564 while (str_idx > 0)
1566 /* Update counters. */
1567 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1568 if (null_cnt > mctx->max_mb_elem_len)
1570 memset (sctx->sifted_states, '\0',
1571 sizeof (re_dfastate_t *) * str_idx);
1572 re_node_set_free (&cur_dest);
1573 return REG_NOERROR;
1575 re_node_set_empty (&cur_dest);
1576 --str_idx;
1578 if (mctx->state_log[str_idx])
1580 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1581 if (__glibc_unlikely (err != REG_NOERROR))
1582 goto free_return;
1585 /* Add all the nodes which satisfy the following conditions:
1586 - It can epsilon transit to a node in CUR_DEST.
1587 - It is in CUR_SRC.
1588 And update state_log. */
1589 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1590 if (__glibc_unlikely (err != REG_NOERROR))
1591 goto free_return;
1593 err = REG_NOERROR;
1594 free_return:
1595 re_node_set_free (&cur_dest);
1596 return err;
1599 static reg_errcode_t
1600 __attribute_warn_unused_result__
1601 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1602 Idx str_idx, re_node_set *cur_dest)
1604 const re_dfa_t *const dfa = mctx->dfa;
1605 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1606 Idx i;
1608 /* Then build the next sifted state.
1609 We build the next sifted state on 'cur_dest', and update
1610 'sifted_states[str_idx]' with 'cur_dest'.
1611 Note:
1612 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1613 'cur_src' points the node_set of the old 'state_log[str_idx]'
1614 (with the epsilon nodes pre-filtered out). */
1615 for (i = 0; i < cur_src->nelem; i++)
1617 Idx prev_node = cur_src->elems[i];
1618 int naccepted = 0;
1619 bool ok;
1620 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1622 /* If the node may accept "multi byte". */
1623 if (dfa->nodes[prev_node].accept_mb)
1624 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1625 str_idx, sctx->last_str_idx);
1627 /* We don't check backreferences here.
1628 See update_cur_sifted_state(). */
1629 if (!naccepted
1630 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1631 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1632 dfa->nexts[prev_node]))
1633 naccepted = 1;
1635 if (naccepted == 0)
1636 continue;
1638 if (sctx->limits.nelem)
1640 Idx to_idx = str_idx + naccepted;
1641 if (check_dst_limits (mctx, &sctx->limits,
1642 dfa->nexts[prev_node], to_idx,
1643 prev_node, str_idx))
1644 continue;
1646 ok = re_node_set_insert (cur_dest, prev_node);
1647 if (__glibc_unlikely (! ok))
1648 return REG_ESPACE;
1651 return REG_NOERROR;
1654 /* Helper functions. */
1656 static reg_errcode_t
1657 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1659 Idx top = mctx->state_log_top;
1661 if ((next_state_log_idx >= mctx->input.bufs_len
1662 && mctx->input.bufs_len < mctx->input.len)
1663 || (next_state_log_idx >= mctx->input.valid_len
1664 && mctx->input.valid_len < mctx->input.len))
1666 reg_errcode_t err;
1667 err = extend_buffers (mctx, next_state_log_idx + 1);
1668 if (__glibc_unlikely (err != REG_NOERROR))
1669 return err;
1672 if (top < next_state_log_idx)
1674 DEBUG_ASSERT (mctx->state_log != NULL);
1675 memset (mctx->state_log + top + 1, '\0',
1676 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1677 mctx->state_log_top = next_state_log_idx;
1679 return REG_NOERROR;
1682 static reg_errcode_t
1683 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1684 re_dfastate_t **src, Idx num)
1686 Idx st_idx;
1687 reg_errcode_t err;
1688 for (st_idx = 0; st_idx < num; ++st_idx)
1690 if (dst[st_idx] == NULL)
1691 dst[st_idx] = src[st_idx];
1692 else if (src[st_idx] != NULL)
1694 re_node_set merged_set;
1695 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1696 &src[st_idx]->nodes);
1697 if (__glibc_unlikely (err != REG_NOERROR))
1698 return err;
1699 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1700 re_node_set_free (&merged_set);
1701 if (__glibc_unlikely (err != REG_NOERROR))
1702 return err;
1705 return REG_NOERROR;
1708 static reg_errcode_t
1709 update_cur_sifted_state (const re_match_context_t *mctx,
1710 re_sift_context_t *sctx, Idx str_idx,
1711 re_node_set *dest_nodes)
1713 const re_dfa_t *const dfa = mctx->dfa;
1714 reg_errcode_t err = REG_NOERROR;
1715 const re_node_set *candidates;
1716 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1717 : &mctx->state_log[str_idx]->nodes);
1719 if (dest_nodes->nelem == 0)
1720 sctx->sifted_states[str_idx] = NULL;
1721 else
1723 if (candidates)
1725 /* At first, add the nodes which can epsilon transit to a node in
1726 DEST_NODE. */
1727 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1728 if (__glibc_unlikely (err != REG_NOERROR))
1729 return err;
1731 /* Then, check the limitations in the current sift_context. */
1732 if (sctx->limits.nelem)
1734 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1735 mctx->bkref_ents, str_idx);
1736 if (__glibc_unlikely (err != REG_NOERROR))
1737 return err;
1741 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1742 if (__glibc_unlikely (err != REG_NOERROR))
1743 return err;
1746 if (candidates && mctx->state_log[str_idx]->has_backref)
1748 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1749 if (__glibc_unlikely (err != REG_NOERROR))
1750 return err;
1752 return REG_NOERROR;
1755 static reg_errcode_t
1756 __attribute_warn_unused_result__
1757 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1758 const re_node_set *candidates)
1760 reg_errcode_t err = REG_NOERROR;
1761 Idx i;
1763 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1764 if (__glibc_unlikely (err != REG_NOERROR))
1765 return err;
1767 if (!state->inveclosure.alloc)
1769 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1770 if (__glibc_unlikely (err != REG_NOERROR))
1771 return REG_ESPACE;
1772 for (i = 0; i < dest_nodes->nelem; i++)
1774 err = re_node_set_merge (&state->inveclosure,
1775 dfa->inveclosures + dest_nodes->elems[i]);
1776 if (__glibc_unlikely (err != REG_NOERROR))
1777 return REG_ESPACE;
1780 return re_node_set_add_intersect (dest_nodes, candidates,
1781 &state->inveclosure);
1784 static reg_errcode_t
1785 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1786 const re_node_set *candidates)
1788 Idx ecl_idx;
1789 reg_errcode_t err;
1790 re_node_set *inv_eclosure = dfa->inveclosures + node;
1791 re_node_set except_nodes;
1792 re_node_set_init_empty (&except_nodes);
1793 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1795 Idx cur_node = inv_eclosure->elems[ecl_idx];
1796 if (cur_node == node)
1797 continue;
1798 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1800 Idx edst1 = dfa->edests[cur_node].elems[0];
1801 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1802 ? dfa->edests[cur_node].elems[1] : -1);
1803 if ((!re_node_set_contains (inv_eclosure, edst1)
1804 && re_node_set_contains (dest_nodes, edst1))
1805 || (edst2 > 0
1806 && !re_node_set_contains (inv_eclosure, edst2)
1807 && re_node_set_contains (dest_nodes, edst2)))
1809 err = re_node_set_add_intersect (&except_nodes, candidates,
1810 dfa->inveclosures + cur_node);
1811 if (__glibc_unlikely (err != REG_NOERROR))
1813 re_node_set_free (&except_nodes);
1814 return err;
1819 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1821 Idx cur_node = inv_eclosure->elems[ecl_idx];
1822 if (!re_node_set_contains (&except_nodes, cur_node))
1824 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1825 re_node_set_remove_at (dest_nodes, idx);
1828 re_node_set_free (&except_nodes);
1829 return REG_NOERROR;
1832 static bool
1833 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1834 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1836 const re_dfa_t *const dfa = mctx->dfa;
1837 Idx lim_idx, src_pos, dst_pos;
1839 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1840 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1841 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1843 Idx subexp_idx;
1844 struct re_backref_cache_entry *ent;
1845 ent = mctx->bkref_ents + limits->elems[lim_idx];
1846 subexp_idx = dfa->nodes[ent->node].opr.idx;
1848 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1849 subexp_idx, dst_node, dst_idx,
1850 dst_bkref_idx);
1851 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852 subexp_idx, src_node, src_idx,
1853 src_bkref_idx);
1855 /* In case of:
1856 <src> <dst> ( <subexp> )
1857 ( <subexp> ) <src> <dst>
1858 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1859 if (src_pos == dst_pos)
1860 continue; /* This is unrelated limitation. */
1861 else
1862 return true;
1864 return false;
1867 static int
1868 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1869 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1871 const re_dfa_t *const dfa = mctx->dfa;
1872 const re_node_set *eclosures = dfa->eclosures + from_node;
1873 Idx node_idx;
1875 /* Else, we are on the boundary: examine the nodes on the epsilon
1876 closure. */
1877 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1879 Idx node = eclosures->elems[node_idx];
1880 switch (dfa->nodes[node].type)
1882 case OP_BACK_REF:
1883 if (bkref_idx != -1)
1885 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1888 Idx dst;
1889 int cpos;
1891 if (ent->node != node)
1892 continue;
1894 if (subexp_idx < BITSET_WORD_BITS
1895 && !(ent->eps_reachable_subexps_map
1896 & ((bitset_word_t) 1 << subexp_idx)))
1897 continue;
1899 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1900 OP_CLOSE_SUBEXP cases below. But, if the
1901 destination node is the same node as the source
1902 node, don't recurse because it would cause an
1903 infinite loop: a regex that exhibits this behavior
1904 is ()\1*\1* */
1905 dst = dfa->edests[node].elems[0];
1906 if (dst == from_node)
1908 if (boundaries & 1)
1909 return -1;
1910 else /* if (boundaries & 2) */
1911 return 0;
1914 cpos =
1915 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1916 dst, bkref_idx);
1917 if (cpos == -1 /* && (boundaries & 1) */)
1918 return -1;
1919 if (cpos == 0 && (boundaries & 2))
1920 return 0;
1922 if (subexp_idx < BITSET_WORD_BITS)
1923 ent->eps_reachable_subexps_map
1924 &= ~((bitset_word_t) 1 << subexp_idx);
1926 while (ent++->more);
1928 break;
1930 case OP_OPEN_SUBEXP:
1931 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1932 return -1;
1933 break;
1935 case OP_CLOSE_SUBEXP:
1936 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1937 return 0;
1938 break;
1940 default:
1941 break;
1945 return (boundaries & 2) ? 1 : 0;
1948 static int
1949 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1950 Idx subexp_idx, Idx from_node, Idx str_idx,
1951 Idx bkref_idx)
1953 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1954 int boundaries;
1956 /* If we are outside the range of the subexpression, return -1 or 1. */
1957 if (str_idx < lim->subexp_from)
1958 return -1;
1960 if (lim->subexp_to < str_idx)
1961 return 1;
1963 /* If we are within the subexpression, return 0. */
1964 boundaries = (str_idx == lim->subexp_from);
1965 boundaries |= (str_idx == lim->subexp_to) << 1;
1966 if (boundaries == 0)
1967 return 0;
1969 /* Else, examine epsilon closure. */
1970 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1971 from_node, bkref_idx);
1974 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1975 which are against limitations from DEST_NODES. */
1977 static reg_errcode_t
1978 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1979 const re_node_set *candidates, re_node_set *limits,
1980 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1982 reg_errcode_t err;
1983 Idx node_idx, lim_idx;
1985 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1987 Idx subexp_idx;
1988 struct re_backref_cache_entry *ent;
1989 ent = bkref_ents + limits->elems[lim_idx];
1991 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1992 continue; /* This is unrelated limitation. */
1994 subexp_idx = dfa->nodes[ent->node].opr.idx;
1995 if (ent->subexp_to == str_idx)
1997 Idx ops_node = -1;
1998 Idx cls_node = -1;
1999 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2001 Idx node = dest_nodes->elems[node_idx];
2002 re_token_type_t type = dfa->nodes[node].type;
2003 if (type == OP_OPEN_SUBEXP
2004 && subexp_idx == dfa->nodes[node].opr.idx)
2005 ops_node = node;
2006 else if (type == OP_CLOSE_SUBEXP
2007 && subexp_idx == dfa->nodes[node].opr.idx)
2008 cls_node = node;
2011 /* Check the limitation of the open subexpression. */
2012 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2013 if (ops_node >= 0)
2015 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2016 candidates);
2017 if (__glibc_unlikely (err != REG_NOERROR))
2018 return err;
2021 /* Check the limitation of the close subexpression. */
2022 if (cls_node >= 0)
2023 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2025 Idx node = dest_nodes->elems[node_idx];
2026 if (!re_node_set_contains (dfa->inveclosures + node,
2027 cls_node)
2028 && !re_node_set_contains (dfa->eclosures + node,
2029 cls_node))
2031 /* It is against this limitation.
2032 Remove it form the current sifted state. */
2033 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2034 candidates);
2035 if (__glibc_unlikely (err != REG_NOERROR))
2036 return err;
2037 --node_idx;
2041 else /* (ent->subexp_to != str_idx) */
2043 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2045 Idx node = dest_nodes->elems[node_idx];
2046 re_token_type_t type = dfa->nodes[node].type;
2047 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2049 if (subexp_idx != dfa->nodes[node].opr.idx)
2050 continue;
2051 /* It is against this limitation.
2052 Remove it form the current sifted state. */
2053 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2054 candidates);
2055 if (__glibc_unlikely (err != REG_NOERROR))
2056 return err;
2061 return REG_NOERROR;
2064 static reg_errcode_t
2065 __attribute_warn_unused_result__
2066 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2067 Idx str_idx, const re_node_set *candidates)
2069 const re_dfa_t *const dfa = mctx->dfa;
2070 reg_errcode_t err;
2071 Idx node_idx, node;
2072 re_sift_context_t local_sctx;
2073 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2075 if (first_idx == -1)
2076 return REG_NOERROR;
2078 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2080 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2082 Idx enabled_idx;
2083 re_token_type_t type;
2084 struct re_backref_cache_entry *entry;
2085 node = candidates->elems[node_idx];
2086 type = dfa->nodes[node].type;
2087 /* Avoid infinite loop for the REs like "()\1+". */
2088 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2089 continue;
2090 if (type != OP_BACK_REF)
2091 continue;
2093 entry = mctx->bkref_ents + first_idx;
2094 enabled_idx = first_idx;
2097 Idx subexp_len;
2098 Idx to_idx;
2099 Idx dst_node;
2100 bool ok;
2101 re_dfastate_t *cur_state;
2103 if (entry->node != node)
2104 continue;
2105 subexp_len = entry->subexp_to - entry->subexp_from;
2106 to_idx = str_idx + subexp_len;
2107 dst_node = (subexp_len ? dfa->nexts[node]
2108 : dfa->edests[node].elems[0]);
2110 if (to_idx > sctx->last_str_idx
2111 || sctx->sifted_states[to_idx] == NULL
2112 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2113 || check_dst_limits (mctx, &sctx->limits, node,
2114 str_idx, dst_node, to_idx))
2115 continue;
2117 if (local_sctx.sifted_states == NULL)
2119 local_sctx = *sctx;
2120 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2121 if (__glibc_unlikely (err != REG_NOERROR))
2122 goto free_return;
2124 local_sctx.last_node = node;
2125 local_sctx.last_str_idx = str_idx;
2126 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2127 if (__glibc_unlikely (! ok))
2129 err = REG_ESPACE;
2130 goto free_return;
2132 cur_state = local_sctx.sifted_states[str_idx];
2133 err = sift_states_backward (mctx, &local_sctx);
2134 if (__glibc_unlikely (err != REG_NOERROR))
2135 goto free_return;
2136 if (sctx->limited_states != NULL)
2138 err = merge_state_array (dfa, sctx->limited_states,
2139 local_sctx.sifted_states,
2140 str_idx + 1);
2141 if (__glibc_unlikely (err != REG_NOERROR))
2142 goto free_return;
2144 local_sctx.sifted_states[str_idx] = cur_state;
2145 re_node_set_remove (&local_sctx.limits, enabled_idx);
2147 /* mctx->bkref_ents may have changed, reload the pointer. */
2148 entry = mctx->bkref_ents + enabled_idx;
2150 while (enabled_idx++, entry++->more);
2152 err = REG_NOERROR;
2153 free_return:
2154 if (local_sctx.sifted_states != NULL)
2156 re_node_set_free (&local_sctx.limits);
2159 return err;
2163 static int
2164 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2165 Idx node_idx, Idx str_idx, Idx max_str_idx)
2167 const re_dfa_t *const dfa = mctx->dfa;
2168 int naccepted;
2169 /* Check the node can accept "multi byte". */
2170 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2171 if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2172 && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2173 dfa->nexts[node_idx]))
2174 /* The node can't accept the "multi byte", or the
2175 destination was already thrown away, then the node
2176 couldn't accept the current input "multi byte". */
2177 naccepted = 0;
2178 /* Otherwise, it is sure that the node could accept
2179 'naccepted' bytes input. */
2180 return naccepted;
2183 /* Functions for state transition. */
2185 /* Return the next state to which the current state STATE will transit by
2186 accepting the current input byte, and update STATE_LOG if necessary.
2187 Return NULL on failure.
2188 If STATE can accept a multibyte char/collating element/back reference
2189 update the destination of STATE_LOG. */
2191 static re_dfastate_t *
2192 __attribute_warn_unused_result__
2193 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2194 re_dfastate_t *state)
2196 re_dfastate_t **trtable;
2197 unsigned char ch;
2199 /* If the current state can accept multibyte. */
2200 if (__glibc_unlikely (state->accept_mb))
2202 *err = transit_state_mb (mctx, state);
2203 if (__glibc_unlikely (*err != REG_NOERROR))
2204 return NULL;
2207 /* Then decide the next state with the single byte. */
2208 #if 0
2209 if (0)
2210 /* don't use transition table */
2211 return transit_state_sb (err, mctx, state);
2212 #endif
2214 /* Use transition table */
2215 ch = re_string_fetch_byte (&mctx->input);
2216 for (;;)
2218 trtable = state->trtable;
2219 if (__glibc_likely (trtable != NULL))
2220 return trtable[ch];
2222 trtable = state->word_trtable;
2223 if (__glibc_likely (trtable != NULL))
2225 unsigned int context;
2226 context
2227 = re_string_context_at (&mctx->input,
2228 re_string_cur_idx (&mctx->input) - 1,
2229 mctx->eflags);
2230 if (IS_WORD_CONTEXT (context))
2231 return trtable[ch + SBC_MAX];
2232 else
2233 return trtable[ch];
2236 if (!build_trtable (mctx->dfa, state))
2238 *err = REG_ESPACE;
2239 return NULL;
2242 /* Retry, we now have a transition table. */
2246 /* Update the state_log if we need */
2247 static re_dfastate_t *
2248 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2249 re_dfastate_t *next_state)
2251 const re_dfa_t *const dfa = mctx->dfa;
2252 Idx cur_idx = re_string_cur_idx (&mctx->input);
2254 if (cur_idx > mctx->state_log_top)
2256 mctx->state_log[cur_idx] = next_state;
2257 mctx->state_log_top = cur_idx;
2259 else if (mctx->state_log[cur_idx] == 0)
2261 mctx->state_log[cur_idx] = next_state;
2263 else
2265 re_dfastate_t *pstate;
2266 unsigned int context;
2267 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2268 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2269 the destination of a multibyte char/collating element/
2270 back reference. Then the next state is the union set of
2271 these destinations and the results of the transition table. */
2272 pstate = mctx->state_log[cur_idx];
2273 log_nodes = pstate->entrance_nodes;
2274 if (next_state != NULL)
2276 table_nodes = next_state->entrance_nodes;
2277 *err = re_node_set_init_union (&next_nodes, table_nodes,
2278 log_nodes);
2279 if (__glibc_unlikely (*err != REG_NOERROR))
2280 return NULL;
2282 else
2283 next_nodes = *log_nodes;
2284 /* Note: We already add the nodes of the initial state,
2285 then we don't need to add them here. */
2287 context = re_string_context_at (&mctx->input,
2288 re_string_cur_idx (&mctx->input) - 1,
2289 mctx->eflags);
2290 next_state = mctx->state_log[cur_idx]
2291 = re_acquire_state_context (err, dfa, &next_nodes, context);
2292 /* We don't need to check errors here, since the return value of
2293 this function is next_state and ERR is already set. */
2295 if (table_nodes != NULL)
2296 re_node_set_free (&next_nodes);
2299 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2301 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2302 later. We must check them here, since the back references in the
2303 next state might use them. */
2304 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2305 cur_idx);
2306 if (__glibc_unlikely (*err != REG_NOERROR))
2307 return NULL;
2309 /* If the next state has back references. */
2310 if (next_state->has_backref)
2312 *err = transit_state_bkref (mctx, &next_state->nodes);
2313 if (__glibc_unlikely (*err != REG_NOERROR))
2314 return NULL;
2315 next_state = mctx->state_log[cur_idx];
2319 return next_state;
2322 /* Skip bytes in the input that correspond to part of a
2323 multi-byte match, then look in the log for a state
2324 from which to restart matching. */
2325 static re_dfastate_t *
2326 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2328 re_dfastate_t *cur_state;
2331 Idx max = mctx->state_log_top;
2332 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2336 if (++cur_str_idx > max)
2337 return NULL;
2338 re_string_skip_bytes (&mctx->input, 1);
2340 while (mctx->state_log[cur_str_idx] == NULL);
2342 cur_state = merge_state_with_log (err, mctx, NULL);
2344 while (*err == REG_NOERROR && cur_state == NULL);
2345 return cur_state;
2348 /* Helper functions for transit_state. */
2350 /* From the node set CUR_NODES, pick up the nodes whose types are
2351 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2352 expression. And register them to use them later for evaluating the
2353 corresponding back references. */
2355 static reg_errcode_t
2356 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2357 Idx str_idx)
2359 const re_dfa_t *const dfa = mctx->dfa;
2360 Idx node_idx;
2361 reg_errcode_t err;
2363 /* TODO: This isn't efficient.
2364 Because there might be more than one nodes whose types are
2365 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2366 nodes.
2367 E.g. RE: (a){2} */
2368 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2370 Idx node = cur_nodes->elems[node_idx];
2371 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2372 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2373 && (dfa->used_bkref_map
2374 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2376 err = match_ctx_add_subtop (mctx, node, str_idx);
2377 if (__glibc_unlikely (err != REG_NOERROR))
2378 return err;
2381 return REG_NOERROR;
2384 #if 0
2385 /* Return the next state to which the current state STATE will transit by
2386 accepting the current input byte. Return NULL on failure. */
2388 static re_dfastate_t *
2389 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2390 re_dfastate_t *state)
2392 const re_dfa_t *const dfa = mctx->dfa;
2393 re_node_set next_nodes;
2394 re_dfastate_t *next_state;
2395 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2396 unsigned int context;
2398 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2399 if (__glibc_unlikely (*err != REG_NOERROR))
2400 return NULL;
2401 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2403 Idx cur_node = state->nodes.elems[node_cnt];
2404 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2406 *err = re_node_set_merge (&next_nodes,
2407 dfa->eclosures + dfa->nexts[cur_node]);
2408 if (__glibc_unlikely (*err != REG_NOERROR))
2410 re_node_set_free (&next_nodes);
2411 return NULL;
2415 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2416 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2417 /* We don't need to check errors here, since the return value of
2418 this function is next_state and ERR is already set. */
2420 re_node_set_free (&next_nodes);
2421 re_string_skip_bytes (&mctx->input, 1);
2422 return next_state;
2424 #endif
2426 static reg_errcode_t
2427 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2429 const re_dfa_t *const dfa = mctx->dfa;
2430 reg_errcode_t err;
2431 Idx i;
2433 for (i = 0; i < pstate->nodes.nelem; ++i)
2435 re_node_set dest_nodes, *new_nodes;
2436 Idx cur_node_idx = pstate->nodes.elems[i];
2437 int naccepted;
2438 Idx dest_idx;
2439 unsigned int context;
2440 re_dfastate_t *dest_state;
2442 if (!dfa->nodes[cur_node_idx].accept_mb)
2443 continue;
2445 if (dfa->nodes[cur_node_idx].constraint)
2447 context = re_string_context_at (&mctx->input,
2448 re_string_cur_idx (&mctx->input),
2449 mctx->eflags);
2450 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2451 context))
2452 continue;
2455 /* How many bytes the node can accept? */
2456 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2457 re_string_cur_idx (&mctx->input));
2458 if (naccepted == 0)
2459 continue;
2461 /* The node can accepts 'naccepted' bytes. */
2462 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2463 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2464 : mctx->max_mb_elem_len);
2465 err = clean_state_log_if_needed (mctx, dest_idx);
2466 if (__glibc_unlikely (err != REG_NOERROR))
2467 return err;
2468 DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2469 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2471 dest_state = mctx->state_log[dest_idx];
2472 if (dest_state == NULL)
2473 dest_nodes = *new_nodes;
2474 else
2476 err = re_node_set_init_union (&dest_nodes,
2477 dest_state->entrance_nodes, new_nodes);
2478 if (__glibc_unlikely (err != REG_NOERROR))
2479 return err;
2481 context = re_string_context_at (&mctx->input, dest_idx - 1,
2482 mctx->eflags);
2483 mctx->state_log[dest_idx]
2484 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2485 if (dest_state != NULL)
2486 re_node_set_free (&dest_nodes);
2487 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2488 && err != REG_NOERROR))
2489 return err;
2491 return REG_NOERROR;
2494 static reg_errcode_t
2495 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2497 const re_dfa_t *const dfa = mctx->dfa;
2498 reg_errcode_t err;
2499 Idx i;
2500 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2502 for (i = 0; i < nodes->nelem; ++i)
2504 Idx dest_str_idx, prev_nelem, bkc_idx;
2505 Idx node_idx = nodes->elems[i];
2506 unsigned int context;
2507 const re_token_t *node = dfa->nodes + node_idx;
2508 re_node_set *new_dest_nodes;
2510 /* Check whether 'node' is a backreference or not. */
2511 if (node->type != OP_BACK_REF)
2512 continue;
2514 if (node->constraint)
2516 context = re_string_context_at (&mctx->input, cur_str_idx,
2517 mctx->eflags);
2518 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2519 continue;
2522 /* 'node' is a backreference.
2523 Check the substring which the substring matched. */
2524 bkc_idx = mctx->nbkref_ents;
2525 err = get_subexp (mctx, node_idx, cur_str_idx);
2526 if (__glibc_unlikely (err != REG_NOERROR))
2527 goto free_return;
2529 /* And add the epsilon closures (which is 'new_dest_nodes') of
2530 the backreference to appropriate state_log. */
2531 DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2532 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2534 Idx subexp_len;
2535 re_dfastate_t *dest_state;
2536 struct re_backref_cache_entry *bkref_ent;
2537 bkref_ent = mctx->bkref_ents + bkc_idx;
2538 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2539 continue;
2540 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2541 new_dest_nodes = (subexp_len == 0
2542 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2543 : dfa->eclosures + dfa->nexts[node_idx]);
2544 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2545 - bkref_ent->subexp_from);
2546 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2547 mctx->eflags);
2548 dest_state = mctx->state_log[dest_str_idx];
2549 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2550 : mctx->state_log[cur_str_idx]->nodes.nelem);
2551 /* Add 'new_dest_node' to state_log. */
2552 if (dest_state == NULL)
2554 mctx->state_log[dest_str_idx]
2555 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2556 context);
2557 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2558 && err != REG_NOERROR))
2559 goto free_return;
2561 else
2563 re_node_set dest_nodes;
2564 err = re_node_set_init_union (&dest_nodes,
2565 dest_state->entrance_nodes,
2566 new_dest_nodes);
2567 if (__glibc_unlikely (err != REG_NOERROR))
2569 re_node_set_free (&dest_nodes);
2570 goto free_return;
2572 mctx->state_log[dest_str_idx]
2573 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2574 re_node_set_free (&dest_nodes);
2575 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2576 && err != REG_NOERROR))
2577 goto free_return;
2579 /* We need to check recursively if the backreference can epsilon
2580 transit. */
2581 if (subexp_len == 0
2582 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2584 err = check_subexp_matching_top (mctx, new_dest_nodes,
2585 cur_str_idx);
2586 if (__glibc_unlikely (err != REG_NOERROR))
2587 goto free_return;
2588 err = transit_state_bkref (mctx, new_dest_nodes);
2589 if (__glibc_unlikely (err != REG_NOERROR))
2590 goto free_return;
2594 err = REG_NOERROR;
2595 free_return:
2596 return err;
2599 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2600 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2601 Note that we might collect inappropriate candidates here.
2602 However, the cost of checking them strictly here is too high, then we
2603 delay these checking for prune_impossible_nodes(). */
2605 static reg_errcode_t
2606 __attribute_warn_unused_result__
2607 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2609 const re_dfa_t *const dfa = mctx->dfa;
2610 Idx subexp_num, sub_top_idx;
2611 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2612 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2613 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2614 if (cache_idx != -1)
2616 const struct re_backref_cache_entry *entry
2617 = mctx->bkref_ents + cache_idx;
2619 if (entry->node == bkref_node)
2620 return REG_NOERROR; /* We already checked it. */
2621 while (entry++->more);
2624 subexp_num = dfa->nodes[bkref_node].opr.idx;
2626 /* For each sub expression */
2627 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2629 reg_errcode_t err;
2630 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2631 re_sub_match_last_t *sub_last;
2632 Idx sub_last_idx, sl_str, bkref_str_off;
2634 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2635 continue; /* It isn't related. */
2637 sl_str = sub_top->str_idx;
2638 bkref_str_off = bkref_str_idx;
2639 /* At first, check the last node of sub expressions we already
2640 evaluated. */
2641 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2643 regoff_t sl_str_diff;
2644 sub_last = sub_top->lasts[sub_last_idx];
2645 sl_str_diff = sub_last->str_idx - sl_str;
2646 /* The matched string by the sub expression match with the substring
2647 at the back reference? */
2648 if (sl_str_diff > 0)
2650 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2651 > mctx->input.valid_len))
2653 /* Not enough chars for a successful match. */
2654 if (bkref_str_off + sl_str_diff > mctx->input.len)
2655 break;
2657 err = clean_state_log_if_needed (mctx,
2658 bkref_str_off
2659 + sl_str_diff);
2660 if (__glibc_unlikely (err != REG_NOERROR))
2661 return err;
2662 buf = (const char *) re_string_get_buffer (&mctx->input);
2664 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2665 /* We don't need to search this sub expression any more. */
2666 break;
2668 bkref_str_off += sl_str_diff;
2669 sl_str += sl_str_diff;
2670 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2671 bkref_str_idx);
2673 /* Reload buf, since the preceding call might have reallocated
2674 the buffer. */
2675 buf = (const char *) re_string_get_buffer (&mctx->input);
2677 if (err == REG_NOMATCH)
2678 continue;
2679 if (__glibc_unlikely (err != REG_NOERROR))
2680 return err;
2683 if (sub_last_idx < sub_top->nlasts)
2684 continue;
2685 if (sub_last_idx > 0)
2686 ++sl_str;
2687 /* Then, search for the other last nodes of the sub expression. */
2688 for (; sl_str <= bkref_str_idx; ++sl_str)
2690 Idx cls_node;
2691 regoff_t sl_str_off;
2692 const re_node_set *nodes;
2693 sl_str_off = sl_str - sub_top->str_idx;
2694 /* The matched string by the sub expression match with the substring
2695 at the back reference? */
2696 if (sl_str_off > 0)
2698 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2700 /* If we are at the end of the input, we cannot match. */
2701 if (bkref_str_off >= mctx->input.len)
2702 break;
2704 err = extend_buffers (mctx, bkref_str_off + 1);
2705 if (__glibc_unlikely (err != REG_NOERROR))
2706 return err;
2708 buf = (const char *) re_string_get_buffer (&mctx->input);
2710 if (buf [bkref_str_off++] != buf[sl_str - 1])
2711 break; /* We don't need to search this sub expression
2712 any more. */
2714 if (mctx->state_log[sl_str] == NULL)
2715 continue;
2716 /* Does this state have a ')' of the sub expression? */
2717 nodes = &mctx->state_log[sl_str]->nodes;
2718 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2719 OP_CLOSE_SUBEXP);
2720 if (cls_node == -1)
2721 continue; /* No. */
2722 if (sub_top->path == NULL)
2724 sub_top->path = calloc (sizeof (state_array_t),
2725 sl_str - sub_top->str_idx + 1);
2726 if (sub_top->path == NULL)
2727 return REG_ESPACE;
2729 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2730 in the current context? */
2731 err = check_arrival (mctx, sub_top->path, sub_top->node,
2732 sub_top->str_idx, cls_node, sl_str,
2733 OP_CLOSE_SUBEXP);
2734 if (err == REG_NOMATCH)
2735 continue;
2736 if (__glibc_unlikely (err != REG_NOERROR))
2737 return err;
2738 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2739 if (__glibc_unlikely (sub_last == NULL))
2740 return REG_ESPACE;
2741 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2742 bkref_str_idx);
2743 buf = (const char *) re_string_get_buffer (&mctx->input);
2744 if (err == REG_NOMATCH)
2745 continue;
2746 if (__glibc_unlikely (err != REG_NOERROR))
2747 return err;
2750 return REG_NOERROR;
2753 /* Helper functions for get_subexp(). */
2755 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2756 If it can arrive, register the sub expression expressed with SUB_TOP
2757 and SUB_LAST. */
2759 static reg_errcode_t
2760 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2761 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2763 reg_errcode_t err;
2764 Idx to_idx;
2765 /* Can the subexpression arrive the back reference? */
2766 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2767 sub_last->str_idx, bkref_node, bkref_str,
2768 OP_OPEN_SUBEXP);
2769 if (err != REG_NOERROR)
2770 return err;
2771 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2772 sub_last->str_idx);
2773 if (__glibc_unlikely (err != REG_NOERROR))
2774 return err;
2775 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2776 return clean_state_log_if_needed (mctx, to_idx);
2779 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2780 Search '(' if FL_OPEN, or search ')' otherwise.
2781 TODO: This function isn't efficient...
2782 Because there might be more than one nodes whose types are
2783 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2784 nodes.
2785 E.g. RE: (a){2} */
2787 static Idx
2788 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2789 Idx subexp_idx, int type)
2791 Idx cls_idx;
2792 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2794 Idx cls_node = nodes->elems[cls_idx];
2795 const re_token_t *node = dfa->nodes + cls_node;
2796 if (node->type == type
2797 && node->opr.idx == subexp_idx)
2798 return cls_node;
2800 return -1;
2803 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2804 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2805 heavily reused.
2806 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2807 REG_ESPACE if memory is exhausted. */
2809 static reg_errcode_t
2810 __attribute_warn_unused_result__
2811 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2812 Idx top_str, Idx last_node, Idx last_str, int type)
2814 const re_dfa_t *const dfa = mctx->dfa;
2815 reg_errcode_t err = REG_NOERROR;
2816 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2817 re_dfastate_t *cur_state = NULL;
2818 re_node_set *cur_nodes, next_nodes;
2819 re_dfastate_t **backup_state_log;
2820 unsigned int context;
2822 subexp_num = dfa->nodes[top_node].opr.idx;
2823 /* Extend the buffer if we need. */
2824 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2826 re_dfastate_t **new_array;
2827 Idx old_alloc = path->alloc;
2828 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2829 Idx new_alloc;
2830 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2831 return REG_ESPACE;
2832 new_alloc = old_alloc + incr_alloc;
2833 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2834 return REG_ESPACE;
2835 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2836 if (__glibc_unlikely (new_array == NULL))
2837 return REG_ESPACE;
2838 path->array = new_array;
2839 path->alloc = new_alloc;
2840 memset (new_array + old_alloc, '\0',
2841 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2844 str_idx = path->next_idx ? path->next_idx : top_str;
2846 /* Temporary modify MCTX. */
2847 backup_state_log = mctx->state_log;
2848 backup_cur_idx = mctx->input.cur_idx;
2849 mctx->state_log = path->array;
2850 mctx->input.cur_idx = str_idx;
2852 /* Setup initial node set. */
2853 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2854 if (str_idx == top_str)
2856 err = re_node_set_init_1 (&next_nodes, top_node);
2857 if (__glibc_unlikely (err != REG_NOERROR))
2858 return err;
2859 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2860 if (__glibc_unlikely (err != REG_NOERROR))
2862 re_node_set_free (&next_nodes);
2863 return err;
2866 else
2868 cur_state = mctx->state_log[str_idx];
2869 if (cur_state && cur_state->has_backref)
2871 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2872 if (__glibc_unlikely (err != REG_NOERROR))
2873 return err;
2875 else
2876 re_node_set_init_empty (&next_nodes);
2878 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2880 if (next_nodes.nelem)
2882 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2883 subexp_num, type);
2884 if (__glibc_unlikely (err != REG_NOERROR))
2886 re_node_set_free (&next_nodes);
2887 return err;
2890 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2891 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2893 re_node_set_free (&next_nodes);
2894 return err;
2896 mctx->state_log[str_idx] = cur_state;
2899 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2901 re_node_set_empty (&next_nodes);
2902 if (mctx->state_log[str_idx + 1])
2904 err = re_node_set_merge (&next_nodes,
2905 &mctx->state_log[str_idx + 1]->nodes);
2906 if (__glibc_unlikely (err != REG_NOERROR))
2908 re_node_set_free (&next_nodes);
2909 return err;
2912 if (cur_state)
2914 err = check_arrival_add_next_nodes (mctx, str_idx,
2915 &cur_state->non_eps_nodes,
2916 &next_nodes);
2917 if (__glibc_unlikely (err != REG_NOERROR))
2919 re_node_set_free (&next_nodes);
2920 return err;
2923 ++str_idx;
2924 if (next_nodes.nelem)
2926 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2927 if (__glibc_unlikely (err != REG_NOERROR))
2929 re_node_set_free (&next_nodes);
2930 return err;
2932 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2933 subexp_num, type);
2934 if (__glibc_unlikely (err != REG_NOERROR))
2936 re_node_set_free (&next_nodes);
2937 return err;
2940 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2942 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2944 re_node_set_free (&next_nodes);
2945 return err;
2947 mctx->state_log[str_idx] = cur_state;
2948 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2950 re_node_set_free (&next_nodes);
2951 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2952 : &mctx->state_log[last_str]->nodes);
2953 path->next_idx = str_idx;
2955 /* Fix MCTX. */
2956 mctx->state_log = backup_state_log;
2957 mctx->input.cur_idx = backup_cur_idx;
2959 /* Then check the current node set has the node LAST_NODE. */
2960 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2961 return REG_NOERROR;
2963 return REG_NOMATCH;
2966 /* Helper functions for check_arrival. */
2968 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2969 to NEXT_NODES.
2970 TODO: This function is similar to the functions transit_state*(),
2971 however this function has many additional works.
2972 Can't we unify them? */
2974 static reg_errcode_t
2975 __attribute_warn_unused_result__
2976 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2977 re_node_set *cur_nodes, re_node_set *next_nodes)
2979 const re_dfa_t *const dfa = mctx->dfa;
2980 bool ok;
2981 Idx cur_idx;
2982 reg_errcode_t err = REG_NOERROR;
2983 re_node_set union_set;
2984 re_node_set_init_empty (&union_set);
2985 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2987 int naccepted = 0;
2988 Idx cur_node = cur_nodes->elems[cur_idx];
2989 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2991 /* If the node may accept "multi byte". */
2992 if (dfa->nodes[cur_node].accept_mb)
2994 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2995 str_idx);
2996 if (naccepted > 1)
2998 re_dfastate_t *dest_state;
2999 Idx next_node = dfa->nexts[cur_node];
3000 Idx next_idx = str_idx + naccepted;
3001 dest_state = mctx->state_log[next_idx];
3002 re_node_set_empty (&union_set);
3003 if (dest_state)
3005 err = re_node_set_merge (&union_set, &dest_state->nodes);
3006 if (__glibc_unlikely (err != REG_NOERROR))
3008 re_node_set_free (&union_set);
3009 return err;
3012 ok = re_node_set_insert (&union_set, next_node);
3013 if (__glibc_unlikely (! ok))
3015 re_node_set_free (&union_set);
3016 return REG_ESPACE;
3018 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3019 &union_set);
3020 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3021 && err != REG_NOERROR))
3023 re_node_set_free (&union_set);
3024 return err;
3029 if (naccepted
3030 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3032 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3033 if (__glibc_unlikely (! ok))
3035 re_node_set_free (&union_set);
3036 return REG_ESPACE;
3040 re_node_set_free (&union_set);
3041 return REG_NOERROR;
3044 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3045 CUR_NODES, however exclude the nodes which are:
3046 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3047 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3050 static reg_errcode_t
3051 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3052 Idx ex_subexp, int type)
3054 reg_errcode_t err;
3055 Idx idx, outside_node;
3056 re_node_set new_nodes;
3057 DEBUG_ASSERT (cur_nodes->nelem);
3058 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3059 if (__glibc_unlikely (err != REG_NOERROR))
3060 return err;
3061 /* Create a new node set NEW_NODES with the nodes which are epsilon
3062 closures of the node in CUR_NODES. */
3064 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3066 Idx cur_node = cur_nodes->elems[idx];
3067 const re_node_set *eclosure = dfa->eclosures + cur_node;
3068 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3069 if (outside_node == -1)
3071 /* There are no problematic nodes, just merge them. */
3072 err = re_node_set_merge (&new_nodes, eclosure);
3073 if (__glibc_unlikely (err != REG_NOERROR))
3075 re_node_set_free (&new_nodes);
3076 return err;
3079 else
3081 /* There are problematic nodes, re-calculate incrementally. */
3082 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3083 ex_subexp, type);
3084 if (__glibc_unlikely (err != REG_NOERROR))
3086 re_node_set_free (&new_nodes);
3087 return err;
3091 re_node_set_free (cur_nodes);
3092 *cur_nodes = new_nodes;
3093 return REG_NOERROR;
3096 /* Helper function for check_arrival_expand_ecl.
3097 Check incrementally the epsilon closure of TARGET, and if it isn't
3098 problematic append it to DST_NODES. */
3100 static reg_errcode_t
3101 __attribute_warn_unused_result__
3102 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3103 Idx target, Idx ex_subexp, int type)
3105 Idx cur_node;
3106 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3108 bool ok;
3110 if (dfa->nodes[cur_node].type == type
3111 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3113 if (type == OP_CLOSE_SUBEXP)
3115 ok = re_node_set_insert (dst_nodes, cur_node);
3116 if (__glibc_unlikely (! ok))
3117 return REG_ESPACE;
3119 break;
3121 ok = re_node_set_insert (dst_nodes, cur_node);
3122 if (__glibc_unlikely (! ok))
3123 return REG_ESPACE;
3124 if (dfa->edests[cur_node].nelem == 0)
3125 break;
3126 if (dfa->edests[cur_node].nelem == 2)
3128 reg_errcode_t err;
3129 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3130 dfa->edests[cur_node].elems[1],
3131 ex_subexp, type);
3132 if (__glibc_unlikely (err != REG_NOERROR))
3133 return err;
3135 cur_node = dfa->edests[cur_node].elems[0];
3137 return REG_NOERROR;
3141 /* For all the back references in the current state, calculate the
3142 destination of the back references by the appropriate entry
3143 in MCTX->BKREF_ENTS. */
3145 static reg_errcode_t
3146 __attribute_warn_unused_result__
3147 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3148 Idx cur_str, Idx subexp_num, int type)
3150 const re_dfa_t *const dfa = mctx->dfa;
3151 reg_errcode_t err;
3152 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3153 struct re_backref_cache_entry *ent;
3155 if (cache_idx_start == -1)
3156 return REG_NOERROR;
3158 restart:
3159 ent = mctx->bkref_ents + cache_idx_start;
3162 Idx to_idx, next_node;
3164 /* Is this entry ENT is appropriate? */
3165 if (!re_node_set_contains (cur_nodes, ent->node))
3166 continue; /* No. */
3168 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3169 /* Calculate the destination of the back reference, and append it
3170 to MCTX->STATE_LOG. */
3171 if (to_idx == cur_str)
3173 /* The backreference did epsilon transit, we must re-check all the
3174 node in the current state. */
3175 re_node_set new_dests;
3176 reg_errcode_t err2, err3;
3177 next_node = dfa->edests[ent->node].elems[0];
3178 if (re_node_set_contains (cur_nodes, next_node))
3179 continue;
3180 err = re_node_set_init_1 (&new_dests, next_node);
3181 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3182 err3 = re_node_set_merge (cur_nodes, &new_dests);
3183 re_node_set_free (&new_dests);
3184 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3185 || err3 != REG_NOERROR))
3187 err = (err != REG_NOERROR ? err
3188 : (err2 != REG_NOERROR ? err2 : err3));
3189 return err;
3191 /* TODO: It is still inefficient... */
3192 goto restart;
3194 else
3196 re_node_set union_set;
3197 next_node = dfa->nexts[ent->node];
3198 if (mctx->state_log[to_idx])
3200 bool ok;
3201 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3202 next_node))
3203 continue;
3204 err = re_node_set_init_copy (&union_set,
3205 &mctx->state_log[to_idx]->nodes);
3206 ok = re_node_set_insert (&union_set, next_node);
3207 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3209 re_node_set_free (&union_set);
3210 err = err != REG_NOERROR ? err : REG_ESPACE;
3211 return err;
3214 else
3216 err = re_node_set_init_1 (&union_set, next_node);
3217 if (__glibc_unlikely (err != REG_NOERROR))
3218 return err;
3220 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3221 re_node_set_free (&union_set);
3222 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3223 && err != REG_NOERROR))
3224 return err;
3227 while (ent++->more);
3228 return REG_NOERROR;
3231 /* Build transition table for the state.
3232 Return true if successful. */
3234 static bool __attribute_noinline__
3235 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3237 reg_errcode_t err;
3238 Idx i, j;
3239 int ch;
3240 bool need_word_trtable = false;
3241 bitset_word_t elem, mask;
3242 Idx ndests; /* Number of the destination states from 'state'. */
3243 re_dfastate_t **trtable;
3244 re_dfastate_t *dest_states[SBC_MAX];
3245 re_dfastate_t *dest_states_word[SBC_MAX];
3246 re_dfastate_t *dest_states_nl[SBC_MAX];
3247 re_node_set follows;
3248 bitset_t acceptable;
3250 /* We build DFA states which corresponds to the destination nodes
3251 from 'state'. 'dests_node[i]' represents the nodes which i-th
3252 destination state contains, and 'dests_ch[i]' represents the
3253 characters which i-th destination state accepts. */
3254 re_node_set dests_node[SBC_MAX];
3255 bitset_t dests_ch[SBC_MAX];
3257 /* Initialize transition table. */
3258 state->word_trtable = state->trtable = NULL;
3260 /* At first, group all nodes belonging to 'state' into several
3261 destinations. */
3262 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3263 if (__glibc_unlikely (ndests <= 0))
3265 /* Return false in case of an error, true otherwise. */
3266 if (ndests == 0)
3268 state->trtable = (re_dfastate_t **)
3269 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3270 if (__glibc_unlikely (state->trtable == NULL))
3271 return false;
3272 return true;
3274 return false;
3277 err = re_node_set_alloc (&follows, ndests + 1);
3278 if (__glibc_unlikely (err != REG_NOERROR))
3280 out_free:
3281 re_node_set_free (&follows);
3282 for (i = 0; i < ndests; ++i)
3283 re_node_set_free (dests_node + i);
3284 return false;
3287 bitset_empty (acceptable);
3289 /* Then build the states for all destinations. */
3290 for (i = 0; i < ndests; ++i)
3292 Idx next_node;
3293 re_node_set_empty (&follows);
3294 /* Merge the follows of this destination states. */
3295 for (j = 0; j < dests_node[i].nelem; ++j)
3297 next_node = dfa->nexts[dests_node[i].elems[j]];
3298 if (next_node != -1)
3300 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3301 if (__glibc_unlikely (err != REG_NOERROR))
3302 goto out_free;
3305 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3306 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3307 goto out_free;
3308 /* If the new state has context constraint,
3309 build appropriate states for these contexts. */
3310 if (dest_states[i]->has_constraint)
3312 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3313 CONTEXT_WORD);
3314 if (__glibc_unlikely (dest_states_word[i] == NULL
3315 && err != REG_NOERROR))
3316 goto out_free;
3318 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3319 need_word_trtable = true;
3321 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3322 CONTEXT_NEWLINE);
3323 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3324 goto out_free;
3326 else
3328 dest_states_word[i] = dest_states[i];
3329 dest_states_nl[i] = dest_states[i];
3331 bitset_merge (acceptable, dests_ch[i]);
3334 if (!__glibc_unlikely (need_word_trtable))
3336 /* We don't care about whether the following character is a word
3337 character, or we are in a single-byte character set so we can
3338 discern by looking at the character code: allocate a
3339 256-entry transition table. */
3340 trtable = state->trtable =
3341 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3342 if (__glibc_unlikely (trtable == NULL))
3343 goto out_free;
3345 /* For all characters ch...: */
3346 for (i = 0; i < BITSET_WORDS; ++i)
3347 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3348 elem;
3349 mask <<= 1, elem >>= 1, ++ch)
3350 if (__glibc_unlikely (elem & 1))
3352 /* There must be exactly one destination which accepts
3353 character ch. See group_nodes_into_DFAstates. */
3354 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3357 /* j-th destination accepts the word character ch. */
3358 if (dfa->word_char[i] & mask)
3359 trtable[ch] = dest_states_word[j];
3360 else
3361 trtable[ch] = dest_states[j];
3364 else
3366 /* We care about whether the following character is a word
3367 character, and we are in a multi-byte character set: discern
3368 by looking at the character code: build two 256-entry
3369 transition tables, one starting at trtable[0] and one
3370 starting at trtable[SBC_MAX]. */
3371 trtable = state->word_trtable =
3372 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3373 if (__glibc_unlikely (trtable == NULL))
3374 goto out_free;
3376 /* For all characters ch...: */
3377 for (i = 0; i < BITSET_WORDS; ++i)
3378 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3379 elem;
3380 mask <<= 1, elem >>= 1, ++ch)
3381 if (__glibc_unlikely (elem & 1))
3383 /* There must be exactly one destination which accepts
3384 character ch. See group_nodes_into_DFAstates. */
3385 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3388 /* j-th destination accepts the word character ch. */
3389 trtable[ch] = dest_states[j];
3390 trtable[ch + SBC_MAX] = dest_states_word[j];
3394 /* new line */
3395 if (bitset_contain (acceptable, NEWLINE_CHAR))
3397 /* The current state accepts newline character. */
3398 for (j = 0; j < ndests; ++j)
3399 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3401 /* k-th destination accepts newline character. */
3402 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3403 if (need_word_trtable)
3404 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3405 /* There must be only one destination which accepts
3406 newline. See group_nodes_into_DFAstates. */
3407 break;
3411 re_node_set_free (&follows);
3412 for (i = 0; i < ndests; ++i)
3413 re_node_set_free (dests_node + i);
3414 return true;
3417 /* Group all nodes belonging to STATE into several destinations.
3418 Then for all destinations, set the nodes belonging to the destination
3419 to DESTS_NODE[i] and set the characters accepted by the destination
3420 to DEST_CH[i]. Return the number of destinations if successful,
3421 -1 on internal error. */
3423 static Idx
3424 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3425 re_node_set *dests_node, bitset_t *dests_ch)
3427 reg_errcode_t err;
3428 bool ok;
3429 Idx i, j, k;
3430 Idx ndests; /* Number of the destinations from 'state'. */
3431 bitset_t accepts; /* Characters a node can accept. */
3432 const re_node_set *cur_nodes = &state->nodes;
3433 bitset_empty (accepts);
3434 ndests = 0;
3436 /* For all the nodes belonging to 'state', */
3437 for (i = 0; i < cur_nodes->nelem; ++i)
3439 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3440 re_token_type_t type = node->type;
3441 unsigned int constraint = node->constraint;
3443 /* Enumerate all single byte character this node can accept. */
3444 if (type == CHARACTER)
3445 bitset_set (accepts, node->opr.c);
3446 else if (type == SIMPLE_BRACKET)
3448 bitset_merge (accepts, node->opr.sbcset);
3450 else if (type == OP_PERIOD)
3452 if (dfa->mb_cur_max > 1)
3453 bitset_merge (accepts, dfa->sb_char);
3454 else
3455 bitset_set_all (accepts);
3456 if (!(dfa->syntax & RE_DOT_NEWLINE))
3457 bitset_clear (accepts, '\n');
3458 if (dfa->syntax & RE_DOT_NOT_NULL)
3459 bitset_clear (accepts, '\0');
3461 else if (type == OP_UTF8_PERIOD)
3463 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3464 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3465 else
3466 bitset_merge (accepts, utf8_sb_map);
3467 if (!(dfa->syntax & RE_DOT_NEWLINE))
3468 bitset_clear (accepts, '\n');
3469 if (dfa->syntax & RE_DOT_NOT_NULL)
3470 bitset_clear (accepts, '\0');
3472 else
3473 continue;
3475 /* Check the 'accepts' and sift the characters which are not
3476 match it the context. */
3477 if (constraint)
3479 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3481 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3482 bitset_empty (accepts);
3483 if (accepts_newline)
3484 bitset_set (accepts, NEWLINE_CHAR);
3485 else
3486 continue;
3488 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3490 bitset_empty (accepts);
3491 continue;
3494 if (constraint & NEXT_WORD_CONSTRAINT)
3496 bitset_word_t any_set = 0;
3497 if (type == CHARACTER && !node->word_char)
3499 bitset_empty (accepts);
3500 continue;
3502 if (dfa->mb_cur_max > 1)
3503 for (j = 0; j < BITSET_WORDS; ++j)
3504 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3505 else
3506 for (j = 0; j < BITSET_WORDS; ++j)
3507 any_set |= (accepts[j] &= dfa->word_char[j]);
3508 if (!any_set)
3509 continue;
3511 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3513 bitset_word_t any_set = 0;
3514 if (type == CHARACTER && node->word_char)
3516 bitset_empty (accepts);
3517 continue;
3519 if (dfa->mb_cur_max > 1)
3520 for (j = 0; j < BITSET_WORDS; ++j)
3521 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3522 else
3523 for (j = 0; j < BITSET_WORDS; ++j)
3524 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3525 if (!any_set)
3526 continue;
3530 /* Then divide 'accepts' into DFA states, or create a new
3531 state. Above, we make sure that accepts is not empty. */
3532 for (j = 0; j < ndests; ++j)
3534 bitset_t intersec; /* Intersection sets, see below. */
3535 bitset_t remains;
3536 /* Flags, see below. */
3537 bitset_word_t has_intersec, not_subset, not_consumed;
3539 /* Optimization, skip if this state doesn't accept the character. */
3540 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3541 continue;
3543 /* Enumerate the intersection set of this state and 'accepts'. */
3544 has_intersec = 0;
3545 for (k = 0; k < BITSET_WORDS; ++k)
3546 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3547 /* And skip if the intersection set is empty. */
3548 if (!has_intersec)
3549 continue;
3551 /* Then check if this state is a subset of 'accepts'. */
3552 not_subset = not_consumed = 0;
3553 for (k = 0; k < BITSET_WORDS; ++k)
3555 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3556 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3559 /* If this state isn't a subset of 'accepts', create a
3560 new group state, which has the 'remains'. */
3561 if (not_subset)
3563 bitset_copy (dests_ch[ndests], remains);
3564 bitset_copy (dests_ch[j], intersec);
3565 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3566 if (__glibc_unlikely (err != REG_NOERROR))
3567 goto error_return;
3568 ++ndests;
3571 /* Put the position in the current group. */
3572 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3573 if (__glibc_unlikely (! ok))
3574 goto error_return;
3576 /* If all characters are consumed, go to next node. */
3577 if (!not_consumed)
3578 break;
3580 /* Some characters remain, create a new group. */
3581 if (j == ndests)
3583 bitset_copy (dests_ch[ndests], accepts);
3584 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3585 if (__glibc_unlikely (err != REG_NOERROR))
3586 goto error_return;
3587 ++ndests;
3588 bitset_empty (accepts);
3591 assume (ndests <= SBC_MAX);
3592 return ndests;
3593 error_return:
3594 for (j = 0; j < ndests; ++j)
3595 re_node_set_free (dests_node + j);
3596 return -1;
3599 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3600 Return the number of the bytes the node accepts.
3601 STR_IDX is the current index of the input string.
3603 This function handles the nodes which can accept one character, or
3604 one collating element like '.', '[a-z]', opposite to the other nodes
3605 can only accept one byte. */
3607 #ifdef _LIBC
3608 # include <locale/weight.h>
3609 #endif
3611 static int
3612 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3613 const re_string_t *input, Idx str_idx)
3615 const re_token_t *node = dfa->nodes + node_idx;
3616 int char_len, elem_len;
3617 Idx i;
3619 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3621 unsigned char c = re_string_byte_at (input, str_idx), d;
3622 if (__glibc_likely (c < 0xc2))
3623 return 0;
3625 if (str_idx + 2 > input->len)
3626 return 0;
3628 d = re_string_byte_at (input, str_idx + 1);
3629 if (c < 0xe0)
3630 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3631 else if (c < 0xf0)
3633 char_len = 3;
3634 if (c == 0xe0 && d < 0xa0)
3635 return 0;
3637 else if (c < 0xf8)
3639 char_len = 4;
3640 if (c == 0xf0 && d < 0x90)
3641 return 0;
3643 else if (c < 0xfc)
3645 char_len = 5;
3646 if (c == 0xf8 && d < 0x88)
3647 return 0;
3649 else if (c < 0xfe)
3651 char_len = 6;
3652 if (c == 0xfc && d < 0x84)
3653 return 0;
3655 else
3656 return 0;
3658 if (str_idx + char_len > input->len)
3659 return 0;
3661 for (i = 1; i < char_len; ++i)
3663 d = re_string_byte_at (input, str_idx + i);
3664 if (d < 0x80 || d > 0xbf)
3665 return 0;
3667 return char_len;
3670 char_len = re_string_char_size_at (input, str_idx);
3671 if (node->type == OP_PERIOD)
3673 if (char_len <= 1)
3674 return 0;
3675 /* FIXME: I don't think this if is needed, as both '\n'
3676 and '\0' are char_len == 1. */
3677 /* '.' accepts any one character except the following two cases. */
3678 if ((!(dfa->syntax & RE_DOT_NEWLINE)
3679 && re_string_byte_at (input, str_idx) == '\n')
3680 || ((dfa->syntax & RE_DOT_NOT_NULL)
3681 && re_string_byte_at (input, str_idx) == '\0'))
3682 return 0;
3683 return char_len;
3686 elem_len = re_string_elem_size_at (input, str_idx);
3687 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3688 return 0;
3690 if (node->type == COMPLEX_BRACKET)
3692 const re_charset_t *cset = node->opr.mbcset;
3693 #ifdef _LIBC
3694 const unsigned char *pin
3695 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3696 Idx j;
3697 uint32_t nrules;
3698 #endif
3699 int match_len = 0;
3700 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3701 ? re_string_wchar_at (input, str_idx) : 0);
3703 /* match with multibyte character? */
3704 for (i = 0; i < cset->nmbchars; ++i)
3705 if (wc == cset->mbchars[i])
3707 match_len = char_len;
3708 goto check_node_accept_bytes_match;
3710 /* match with character_class? */
3711 for (i = 0; i < cset->nchar_classes; ++i)
3713 wctype_t wt = cset->char_classes[i];
3714 if (__iswctype (wc, wt))
3716 match_len = char_len;
3717 goto check_node_accept_bytes_match;
3721 #ifdef _LIBC
3722 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3723 if (nrules != 0)
3725 unsigned int in_collseq = 0;
3726 const int32_t *table, *indirect;
3727 const unsigned char *weights, *extra;
3728 const char *collseqwc;
3730 /* match with collating_symbol? */
3731 if (cset->ncoll_syms)
3732 extra = (const unsigned char *)
3733 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3734 for (i = 0; i < cset->ncoll_syms; ++i)
3736 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3737 /* Compare the length of input collating element and
3738 the length of current collating element. */
3739 if (*coll_sym != elem_len)
3740 continue;
3741 /* Compare each bytes. */
3742 for (j = 0; j < *coll_sym; j++)
3743 if (pin[j] != coll_sym[1 + j])
3744 break;
3745 if (j == *coll_sym)
3747 /* Match if every bytes is equal. */
3748 match_len = j;
3749 goto check_node_accept_bytes_match;
3753 if (cset->nranges)
3755 if (elem_len <= char_len)
3757 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3758 in_collseq = __collseq_table_lookup (collseqwc, wc);
3760 else
3761 in_collseq = find_collation_sequence_value (pin, elem_len);
3763 /* match with range expression? */
3764 /* FIXME: Implement rational ranges here, too. */
3765 for (i = 0; i < cset->nranges; ++i)
3766 if (cset->range_starts[i] <= in_collseq
3767 && in_collseq <= cset->range_ends[i])
3769 match_len = elem_len;
3770 goto check_node_accept_bytes_match;
3773 /* match with equivalence_class? */
3774 if (cset->nequiv_classes)
3776 const unsigned char *cp = pin;
3777 table = (const int32_t *)
3778 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3779 weights = (const unsigned char *)
3780 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3781 extra = (const unsigned char *)
3782 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3783 indirect = (const int32_t *)
3784 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3785 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3786 int32_t rule = idx >> 24;
3787 idx &= 0xffffff;
3788 if (idx > 0)
3790 size_t weight_len = weights[idx];
3791 for (i = 0; i < cset->nequiv_classes; ++i)
3793 int32_t equiv_class_idx = cset->equiv_classes[i];
3794 int32_t equiv_class_rule = equiv_class_idx >> 24;
3795 equiv_class_idx &= 0xffffff;
3796 if (weights[equiv_class_idx] == weight_len
3797 && equiv_class_rule == rule
3798 && memcmp (weights + idx + 1,
3799 weights + equiv_class_idx + 1,
3800 weight_len) == 0)
3802 match_len = elem_len;
3803 goto check_node_accept_bytes_match;
3809 else
3810 #endif /* _LIBC */
3812 /* match with range expression? */
3813 for (i = 0; i < cset->nranges; ++i)
3815 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3817 match_len = char_len;
3818 goto check_node_accept_bytes_match;
3822 check_node_accept_bytes_match:
3823 if (!cset->non_match)
3824 return match_len;
3825 else
3827 if (match_len > 0)
3828 return 0;
3829 else
3830 return (elem_len > char_len) ? elem_len : char_len;
3833 return 0;
3836 #ifdef _LIBC
3837 static unsigned int
3838 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3840 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3841 if (nrules == 0)
3843 if (mbs_len == 1)
3845 /* No valid character. Match it as a single byte character. */
3846 const unsigned char *collseq = (const unsigned char *)
3847 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3848 return collseq[mbs[0]];
3850 return UINT_MAX;
3852 else
3854 int32_t idx;
3855 const unsigned char *extra = (const unsigned char *)
3856 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3857 int32_t extrasize = (const unsigned char *)
3858 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3860 for (idx = 0; idx < extrasize;)
3862 int mbs_cnt;
3863 bool found = false;
3864 int32_t elem_mbs_len;
3865 /* Skip the name of collating element name. */
3866 idx = idx + extra[idx] + 1;
3867 elem_mbs_len = extra[idx++];
3868 if (mbs_len == elem_mbs_len)
3870 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3871 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3872 break;
3873 if (mbs_cnt == elem_mbs_len)
3874 /* Found the entry. */
3875 found = true;
3877 /* Skip the byte sequence of the collating element. */
3878 idx += elem_mbs_len;
3879 /* Adjust for the alignment. */
3880 idx = (idx + 3) & ~3;
3881 /* Skip the collation sequence value. */
3882 idx += sizeof (uint32_t);
3883 /* Skip the wide char sequence of the collating element. */
3884 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3885 /* If we found the entry, return the sequence value. */
3886 if (found)
3887 return *(uint32_t *) (extra + idx);
3888 /* Skip the collation sequence value. */
3889 idx += sizeof (uint32_t);
3891 return UINT_MAX;
3894 #endif /* _LIBC */
3896 /* Check whether the node accepts the byte which is IDX-th
3897 byte of the INPUT. */
3899 static bool
3900 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3901 Idx idx)
3903 unsigned char ch;
3904 ch = re_string_byte_at (&mctx->input, idx);
3905 switch (node->type)
3907 case CHARACTER:
3908 if (node->opr.c != ch)
3909 return false;
3910 break;
3912 case SIMPLE_BRACKET:
3913 if (!bitset_contain (node->opr.sbcset, ch))
3914 return false;
3915 break;
3917 case OP_UTF8_PERIOD:
3918 if (ch >= ASCII_CHARS)
3919 return false;
3920 FALLTHROUGH;
3921 case OP_PERIOD:
3922 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3923 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3924 return false;
3925 break;
3927 default:
3928 return false;
3931 if (node->constraint)
3933 /* The node has constraints. Check whether the current context
3934 satisfies the constraints. */
3935 unsigned int context = re_string_context_at (&mctx->input, idx,
3936 mctx->eflags);
3937 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3938 return false;
3941 return true;
3944 /* Extend the buffers, if the buffers have run out. */
3946 static reg_errcode_t
3947 __attribute_warn_unused_result__
3948 extend_buffers (re_match_context_t *mctx, int min_len)
3950 reg_errcode_t ret;
3951 re_string_t *pstr = &mctx->input;
3953 /* Avoid overflow. */
3954 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3955 <= pstr->bufs_len))
3956 return REG_ESPACE;
3958 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3959 ret = re_string_realloc_buffers (pstr,
3960 MAX (min_len,
3961 MIN (pstr->len, pstr->bufs_len * 2)));
3962 if (__glibc_unlikely (ret != REG_NOERROR))
3963 return ret;
3965 if (mctx->state_log != NULL)
3967 /* And double the length of state_log. */
3968 /* XXX We have no indication of the size of this buffer. If this
3969 allocation fail we have no indication that the state_log array
3970 does not have the right size. */
3971 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3972 pstr->bufs_len + 1);
3973 if (__glibc_unlikely (new_array == NULL))
3974 return REG_ESPACE;
3975 mctx->state_log = new_array;
3978 /* Then reconstruct the buffers. */
3979 if (pstr->icase)
3981 if (pstr->mb_cur_max > 1)
3983 ret = build_wcs_upper_buffer (pstr);
3984 if (__glibc_unlikely (ret != REG_NOERROR))
3985 return ret;
3987 else
3988 build_upper_buffer (pstr);
3990 else
3992 if (pstr->mb_cur_max > 1)
3993 build_wcs_buffer (pstr);
3994 else
3996 if (pstr->trans != NULL)
3997 re_string_translate_buffer (pstr);
4000 return REG_NOERROR;
4004 /* Functions for matching context. */
4006 /* Initialize MCTX. */
4008 static reg_errcode_t
4009 __attribute_warn_unused_result__
4010 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4012 mctx->eflags = eflags;
4013 mctx->match_last = -1;
4014 if (n > 0)
4016 /* Avoid overflow. */
4017 size_t max_object_size =
4018 MAX (sizeof (struct re_backref_cache_entry),
4019 sizeof (re_sub_match_top_t *));
4020 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4021 return REG_ESPACE;
4023 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4024 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4025 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4026 return REG_ESPACE;
4028 /* Already zero-ed by the caller.
4029 else
4030 mctx->bkref_ents = NULL;
4031 mctx->nbkref_ents = 0;
4032 mctx->nsub_tops = 0; */
4033 mctx->abkref_ents = n;
4034 mctx->max_mb_elem_len = 1;
4035 mctx->asub_tops = n;
4036 return REG_NOERROR;
4039 /* Clean the entries which depend on the current input in MCTX.
4040 This function must be invoked when the matcher changes the start index
4041 of the input, or changes the input string. */
4043 static void
4044 match_ctx_clean (re_match_context_t *mctx)
4046 Idx st_idx;
4047 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4049 Idx sl_idx;
4050 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4051 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4053 re_sub_match_last_t *last = top->lasts[sl_idx];
4054 re_free (last->path.array);
4055 re_free (last);
4057 re_free (top->lasts);
4058 if (top->path)
4060 re_free (top->path->array);
4061 re_free (top->path);
4063 re_free (top);
4066 mctx->nsub_tops = 0;
4067 mctx->nbkref_ents = 0;
4070 /* Free all the memory associated with MCTX. */
4072 static void
4073 match_ctx_free (re_match_context_t *mctx)
4075 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4076 match_ctx_clean (mctx);
4077 re_free (mctx->sub_tops);
4078 re_free (mctx->bkref_ents);
4081 /* Add a new backreference entry to MCTX.
4082 Note that we assume that caller never call this function with duplicate
4083 entry, and call with STR_IDX which isn't smaller than any existing entry.
4086 static reg_errcode_t
4087 __attribute_warn_unused_result__
4088 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4089 Idx to)
4091 if (mctx->nbkref_ents >= mctx->abkref_ents)
4093 struct re_backref_cache_entry* new_entry;
4094 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4095 mctx->abkref_ents * 2);
4096 if (__glibc_unlikely (new_entry == NULL))
4098 re_free (mctx->bkref_ents);
4099 return REG_ESPACE;
4101 mctx->bkref_ents = new_entry;
4102 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4103 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4104 mctx->abkref_ents *= 2;
4106 if (mctx->nbkref_ents > 0
4107 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4108 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4110 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4111 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4112 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4113 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4115 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4116 If bit N is clear, means that this entry won't epsilon-transition to
4117 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4118 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4119 such node.
4121 A backreference does not epsilon-transition unless it is empty, so set
4122 to all zeros if FROM != TO. */
4123 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4124 = (from == to ? -1 : 0);
4126 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4127 if (mctx->max_mb_elem_len < to - from)
4128 mctx->max_mb_elem_len = to - from;
4129 return REG_NOERROR;
4132 /* Return the first entry with the same str_idx, or -1 if none is
4133 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4135 static Idx
4136 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4138 Idx left, right, mid, last;
4139 last = right = mctx->nbkref_ents;
4140 for (left = 0; left < right;)
4142 mid = (left + right) / 2;
4143 if (mctx->bkref_ents[mid].str_idx < str_idx)
4144 left = mid + 1;
4145 else
4146 right = mid;
4148 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4149 return left;
4150 else
4151 return -1;
4154 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4155 at STR_IDX. */
4157 static reg_errcode_t
4158 __attribute_warn_unused_result__
4159 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4161 DEBUG_ASSERT (mctx->sub_tops != NULL);
4162 DEBUG_ASSERT (mctx->asub_tops > 0);
4163 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4165 Idx new_asub_tops = mctx->asub_tops * 2;
4166 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4167 re_sub_match_top_t *,
4168 new_asub_tops);
4169 if (__glibc_unlikely (new_array == NULL))
4170 return REG_ESPACE;
4171 mctx->sub_tops = new_array;
4172 mctx->asub_tops = new_asub_tops;
4174 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4175 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4176 return REG_ESPACE;
4177 mctx->sub_tops[mctx->nsub_tops]->node = node;
4178 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4179 return REG_NOERROR;
4182 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4183 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4184 Return the new entry if successful, NULL if memory is exhausted. */
4186 static re_sub_match_last_t *
4187 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4189 re_sub_match_last_t *new_entry;
4190 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4192 Idx new_alasts = 2 * subtop->alasts + 1;
4193 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4194 re_sub_match_last_t *,
4195 new_alasts);
4196 if (__glibc_unlikely (new_array == NULL))
4197 return NULL;
4198 subtop->lasts = new_array;
4199 subtop->alasts = new_alasts;
4201 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4202 if (__glibc_likely (new_entry != NULL))
4204 subtop->lasts[subtop->nlasts] = new_entry;
4205 new_entry->node = node;
4206 new_entry->str_idx = str_idx;
4207 ++subtop->nlasts;
4209 return new_entry;
4212 static void
4213 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4214 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4216 sctx->sifted_states = sifted_sts;
4217 sctx->limited_states = limited_sts;
4218 sctx->last_node = last_node;
4219 sctx->last_str_idx = last_str_idx;
4220 re_node_set_init_empty (&sctx->limits);