Revert "Optimize 32bit memset/memcpy with SSE2/SSSE3."
[glibc.git] / posix / regexec.c
blobb8db74062bc966ac412cf1522659b1fa64014d8c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
77 #ifdef RE_ENABLE_I18N
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99 static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142 #endif
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188 # ifdef _LIBC
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192 # endif /* _LIBC */
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198 static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
207 string STRING.
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg, string, nmatch, pmatch, eflags)
222 const regex_t *__restrict preg;
223 const char *__restrict string;
224 size_t nmatch;
225 regmatch_t pmatch[];
226 int eflags;
228 reg_errcode_t err;
229 int start, length;
230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
233 return REG_BADPAT;
235 if (eflags & REG_STARTEND)
237 start = pmatch[0].rm_so;
238 length = pmatch[0].rm_eo;
240 else
242 start = 0;
243 length = strlen (string);
246 __libc_lock_lock (dfa->lock);
247 if (preg->no_sub)
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, 0, NULL, eflags);
250 else
251 err = re_search_internal (preg, string, length, start, length - start,
252 length, nmatch, pmatch, eflags);
253 __libc_lock_unlock (dfa->lock);
254 return err != REG_NOERROR;
257 #ifdef _LIBC
258 # include <shlib-compat.h>
259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec) __compat_regexec;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t *__restrict preg,
267 const char *__restrict string, size_t nmatch,
268 regmatch_t pmatch[], int eflags)
270 return regexec (preg, string, nmatch, pmatch,
271 eflags & (REG_NOTBOL | REG_NOTEOL));
273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
274 # endif
275 #endif
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
291 way as re_match().)
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
295 concerned.
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
300 strings.)
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp, string, length, start, regs)
308 struct re_pattern_buffer *bufp;
309 const char *string;
310 int length, start;
311 struct re_registers *regs;
313 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
315 #ifdef _LIBC
316 weak_alias (__re_match, re_match)
317 #endif
320 re_search (bufp, string, length, start, range, regs)
321 struct re_pattern_buffer *bufp;
322 const char *string;
323 int length, start, range;
324 struct re_registers *regs;
326 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
328 #ifdef _LIBC
329 weak_alias (__re_search, re_search)
330 #endif
333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
334 struct re_pattern_buffer *bufp;
335 const char *string1, *string2;
336 int length1, length2, start, stop;
337 struct re_registers *regs;
339 return re_search_2_stub (bufp, string1, length1, string2, length2,
340 start, 0, regs, stop, 1);
342 #ifdef _LIBC
343 weak_alias (__re_match_2, re_match_2)
344 #endif
347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
348 struct re_pattern_buffer *bufp;
349 const char *string1, *string2;
350 int length1, length2, start, range, stop;
351 struct re_registers *regs;
353 return re_search_2_stub (bufp, string1, length1, string2, length2,
354 start, range, regs, stop, 0);
356 #ifdef _LIBC
357 weak_alias (__re_search_2, re_search_2)
358 #endif
360 static int
361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
362 stop, ret_len)
363 struct re_pattern_buffer *bufp;
364 const char *string1, *string2;
365 int length1, length2, start, range, stop, ret_len;
366 struct re_registers *regs;
368 const char *str;
369 int rval;
370 int len = length1 + length2;
371 int free_str = 0;
373 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
374 return -2;
376 /* Concatenate the strings. */
377 if (length2 > 0)
378 if (length1 > 0)
380 char *s = re_malloc (char, len);
382 if (BE (s == NULL, 0))
383 return -2;
384 #ifdef _LIBC
385 memcpy (__mempcpy (s, string1, length1), string2, length2);
386 #else
387 memcpy (s, string1, length1);
388 memcpy (s + length1, string2, length2);
389 #endif
390 str = s;
391 free_str = 1;
393 else
394 str = string2;
395 else
396 str = string1;
398 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
399 ret_len);
400 if (free_str)
401 re_free ((char *) str);
402 return rval;
405 /* The parameters have the same meaning as those of re_search.
406 Additional parameters:
407 If RET_LEN is nonzero the length of the match is returned (re_match style);
408 otherwise the position of the match is returned. */
410 static int
411 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
412 struct re_pattern_buffer *bufp;
413 const char *string;
414 int length, start, range, stop, ret_len;
415 struct re_registers *regs;
417 reg_errcode_t result;
418 regmatch_t *pmatch;
419 int nregs, rval;
420 int eflags = 0;
421 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
423 /* Check for out-of-range. */
424 if (BE (start < 0 || start > length, 0))
425 return -1;
426 if (BE (start + range > length, 0))
427 range = length - start;
428 else if (BE (start + range < 0, 0))
429 range = -start;
431 __libc_lock_lock (dfa->lock);
433 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
434 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
436 /* Compile fastmap if we haven't yet. */
437 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
438 re_compile_fastmap (bufp);
440 if (BE (bufp->no_sub, 0))
441 regs = NULL;
443 /* We need at least 1 register. */
444 if (regs == NULL)
445 nregs = 1;
446 else if (BE (bufp->regs_allocated == REGS_FIXED &&
447 regs->num_regs < bufp->re_nsub + 1, 0))
449 nregs = regs->num_regs;
450 if (BE (nregs < 1, 0))
452 /* Nothing can be copied to regs. */
453 regs = NULL;
454 nregs = 1;
457 else
458 nregs = bufp->re_nsub + 1;
459 pmatch = re_malloc (regmatch_t, nregs);
460 if (BE (pmatch == NULL, 0))
462 rval = -2;
463 goto out;
466 result = re_search_internal (bufp, string, length, start, range, stop,
467 nregs, pmatch, eflags);
469 rval = 0;
471 /* I hope we needn't fill ther regs with -1's when no match was found. */
472 if (result != REG_NOERROR)
473 rval = -1;
474 else if (regs != NULL)
476 /* If caller wants register contents data back, copy them. */
477 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
478 bufp->regs_allocated);
479 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
480 rval = -2;
483 if (BE (rval == 0, 1))
485 if (ret_len)
487 assert (pmatch[0].rm_so == start);
488 rval = pmatch[0].rm_eo - start;
490 else
491 rval = pmatch[0].rm_so;
493 re_free (pmatch);
494 out:
495 __libc_lock_unlock (dfa->lock);
496 return rval;
499 static unsigned
500 re_copy_regs (regs, pmatch, nregs, regs_allocated)
501 struct re_registers *regs;
502 regmatch_t *pmatch;
503 int nregs, regs_allocated;
505 int rval = REGS_REALLOCATE;
506 int i;
507 int need_regs = nregs + 1;
508 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
509 uses. */
511 /* Have the register data arrays been allocated? */
512 if (regs_allocated == REGS_UNALLOCATED)
513 { /* No. So allocate them with malloc. */
514 regs->start = re_malloc (regoff_t, need_regs);
515 regs->end = re_malloc (regoff_t, need_regs);
516 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
517 return REGS_UNALLOCATED;
518 regs->num_regs = need_regs;
520 else if (regs_allocated == REGS_REALLOCATE)
521 { /* Yes. If we need more elements than were already
522 allocated, reallocate them. If we need fewer, just
523 leave it alone. */
524 if (BE (need_regs > regs->num_regs, 0))
526 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
527 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
528 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
529 return REGS_UNALLOCATED;
530 regs->start = new_start;
531 regs->end = new_end;
532 regs->num_regs = need_regs;
535 else
537 assert (regs_allocated == REGS_FIXED);
538 /* This function may not be called with REGS_FIXED and nregs too big. */
539 assert (regs->num_regs >= nregs);
540 rval = REGS_FIXED;
543 /* Copy the regs. */
544 for (i = 0; i < nregs; ++i)
546 regs->start[i] = pmatch[i].rm_so;
547 regs->end[i] = pmatch[i].rm_eo;
549 for ( ; i < regs->num_regs; ++i)
550 regs->start[i] = regs->end[i] = -1;
552 return rval;
555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
556 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
557 this memory for recording register information. STARTS and ENDS
558 must be allocated using the malloc library routine, and must each
559 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561 If NUM_REGS == 0, then subsequent matches should allocate their own
562 register data.
564 Unless this function is called, the first search or match using
565 PATTERN_BUFFER will allocate its own register data, without
566 freeing the old data. */
568 void
569 re_set_registers (bufp, regs, num_regs, starts, ends)
570 struct re_pattern_buffer *bufp;
571 struct re_registers *regs;
572 unsigned num_regs;
573 regoff_t *starts, *ends;
575 if (num_regs)
577 bufp->regs_allocated = REGS_REALLOCATE;
578 regs->num_regs = num_regs;
579 regs->start = starts;
580 regs->end = ends;
582 else
584 bufp->regs_allocated = REGS_UNALLOCATED;
585 regs->num_regs = 0;
586 regs->start = regs->end = (regoff_t *) 0;
589 #ifdef _LIBC
590 weak_alias (__re_set_registers, re_set_registers)
591 #endif
593 /* Entry points compatible with 4.2 BSD regex library. We don't define
594 them unless specifically requested. */
596 #if defined _REGEX_RE_COMP || defined _LIBC
598 # ifdef _LIBC
599 weak_function
600 # endif
601 re_exec (s)
602 const char *s;
604 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
606 #endif /* _REGEX_RE_COMP */
608 /* Internal entry point. */
610 /* Searches for a compiled pattern PREG in the string STRING, whose
611 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
612 mingings with regexec. START, and RANGE have the same meanings
613 with re_search.
614 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
615 otherwise return the error code.
616 Note: We assume front end functions already check ranges.
617 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
619 static reg_errcode_t
620 __attribute_warn_unused_result__
621 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
622 eflags)
623 const regex_t *preg;
624 const char *string;
625 int length, start, range, stop, eflags;
626 size_t nmatch;
627 regmatch_t pmatch[];
629 reg_errcode_t err;
630 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
631 int left_lim, right_lim, incr;
632 int fl_longest_match, match_first, match_kind, match_last = -1;
633 int extra_nmatch;
634 int sb, ch;
635 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
636 re_match_context_t mctx = { .dfa = dfa };
637 #else
638 re_match_context_t mctx;
639 #endif
640 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
641 && range && !preg->can_be_null) ? preg->fastmap : NULL;
642 RE_TRANSLATE_TYPE t = preg->translate;
644 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
645 memset (&mctx, '\0', sizeof (re_match_context_t));
646 mctx.dfa = dfa;
647 #endif
649 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
650 nmatch -= extra_nmatch;
652 /* Check if the DFA haven't been compiled. */
653 if (BE (preg->used == 0 || dfa->init_state == NULL
654 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
655 || dfa->init_state_begbuf == NULL, 0))
656 return REG_NOMATCH;
658 #ifdef DEBUG
659 /* We assume front-end functions already check them. */
660 assert (start + range >= 0 && start + range <= length);
661 #endif
663 /* If initial states with non-begbuf contexts have no elements,
664 the regex must be anchored. If preg->newline_anchor is set,
665 we'll never use init_state_nl, so do not check it. */
666 if (dfa->init_state->nodes.nelem == 0
667 && dfa->init_state_word->nodes.nelem == 0
668 && (dfa->init_state_nl->nodes.nelem == 0
669 || !preg->newline_anchor))
671 if (start != 0 && start + range != 0)
672 return REG_NOMATCH;
673 start = range = 0;
676 /* We must check the longest matching, if nmatch > 0. */
677 fl_longest_match = (nmatch != 0 || dfa->nbackref);
679 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
680 preg->translate, preg->syntax & RE_ICASE, dfa);
681 if (BE (err != REG_NOERROR, 0))
682 goto free_return;
683 mctx.input.stop = stop;
684 mctx.input.raw_stop = stop;
685 mctx.input.newline_anchor = preg->newline_anchor;
687 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
688 if (BE (err != REG_NOERROR, 0))
689 goto free_return;
691 /* We will log all the DFA states through which the dfa pass,
692 if nmatch > 1, or this dfa has "multibyte node", which is a
693 back-reference or a node which can accept multibyte character or
694 multi character collating element. */
695 if (nmatch > 1 || dfa->has_mb_node)
697 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
698 if (BE (mctx.state_log == NULL, 0))
700 err = REG_ESPACE;
701 goto free_return;
704 else
705 mctx.state_log = NULL;
707 match_first = start;
708 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
709 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
711 /* Check incrementally whether of not the input string match. */
712 incr = (range < 0) ? -1 : 1;
713 left_lim = (range < 0) ? start + range : start;
714 right_lim = (range < 0) ? start : start + range;
715 sb = dfa->mb_cur_max == 1;
716 match_kind =
717 (fastmap
718 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
719 | (range >= 0 ? 2 : 0)
720 | (t != NULL ? 1 : 0))
721 : 8);
723 for (;; match_first += incr)
725 err = REG_NOMATCH;
726 if (match_first < left_lim || right_lim < match_first)
727 goto free_return;
729 /* Advance as rapidly as possible through the string, until we
730 find a plausible place to start matching. This may be done
731 with varying efficiency, so there are various possibilities:
732 only the most common of them are specialized, in order to
733 save on code size. We use a switch statement for speed. */
734 switch (match_kind)
736 case 8:
737 /* No fastmap. */
738 break;
740 case 7:
741 /* Fastmap with single-byte translation, match forward. */
742 while (BE (match_first < right_lim, 1)
743 && !fastmap[t[(unsigned char) string[match_first]]])
744 ++match_first;
745 goto forward_match_found_start_or_reached_end;
747 case 6:
748 /* Fastmap without translation, match forward. */
749 while (BE (match_first < right_lim, 1)
750 && !fastmap[(unsigned char) string[match_first]])
751 ++match_first;
753 forward_match_found_start_or_reached_end:
754 if (BE (match_first == right_lim, 0))
756 ch = match_first >= length
757 ? 0 : (unsigned char) string[match_first];
758 if (!fastmap[t ? t[ch] : ch])
759 goto free_return;
761 break;
763 case 4:
764 case 5:
765 /* Fastmap without multi-byte translation, match backwards. */
766 while (match_first >= left_lim)
768 ch = match_first >= length
769 ? 0 : (unsigned char) string[match_first];
770 if (fastmap[t ? t[ch] : ch])
771 break;
772 --match_first;
774 if (match_first < left_lim)
775 goto free_return;
776 break;
778 default:
779 /* In this case, we can't determine easily the current byte,
780 since it might be a component byte of a multibyte
781 character. Then we use the constructed buffer instead. */
782 for (;;)
784 /* If MATCH_FIRST is out of the valid range, reconstruct the
785 buffers. */
786 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
787 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
789 err = re_string_reconstruct (&mctx.input, match_first,
790 eflags);
791 if (BE (err != REG_NOERROR, 0))
792 goto free_return;
794 offset = match_first - mctx.input.raw_mbs_idx;
796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 Note that MATCH_FIRST must not be smaller than 0. */
798 ch = (match_first >= length
799 ? 0 : re_string_byte_at (&mctx.input, offset));
800 if (fastmap[ch])
801 break;
802 match_first += incr;
803 if (match_first < left_lim || match_first > right_lim)
805 err = REG_NOMATCH;
806 goto free_return;
809 break;
812 /* Reconstruct the buffers so that the matcher can assume that
813 the matching starts from the beginning of the buffer. */
814 err = re_string_reconstruct (&mctx.input, match_first, eflags);
815 if (BE (err != REG_NOERROR, 0))
816 goto free_return;
818 #ifdef RE_ENABLE_I18N
819 /* Don't consider this char as a possible match start if it part,
820 yet isn't the head, of a multibyte character. */
821 if (!sb && !re_string_first_byte (&mctx.input, 0))
822 continue;
823 #endif
825 /* It seems to be appropriate one, then use the matcher. */
826 /* We assume that the matching starts from 0. */
827 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
828 match_last = check_matching (&mctx, fl_longest_match,
829 range >= 0 ? &match_first : NULL);
830 if (match_last != -1)
832 if (BE (match_last == -2, 0))
834 err = REG_ESPACE;
835 goto free_return;
837 else
839 mctx.match_last = match_last;
840 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
842 re_dfastate_t *pstate = mctx.state_log[match_last];
843 mctx.last_node = check_halt_state_context (&mctx, pstate,
844 match_last);
846 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
847 || dfa->nbackref)
849 err = prune_impossible_nodes (&mctx);
850 if (err == REG_NOERROR)
851 break;
852 if (BE (err != REG_NOMATCH, 0))
853 goto free_return;
854 match_last = -1;
856 else
857 break; /* We found a match. */
861 match_ctx_clean (&mctx);
864 #ifdef DEBUG
865 assert (match_last != -1);
866 assert (err == REG_NOERROR);
867 #endif
869 /* Set pmatch[] if we need. */
870 if (nmatch > 0)
872 int reg_idx;
874 /* Initialize registers. */
875 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
876 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
878 /* Set the points where matching start/end. */
879 pmatch[0].rm_so = 0;
880 pmatch[0].rm_eo = mctx.match_last;
882 if (!preg->no_sub && nmatch > 1)
884 err = set_regs (preg, &mctx, nmatch, pmatch,
885 dfa->has_plural_match && dfa->nbackref > 0);
886 if (BE (err != REG_NOERROR, 0))
887 goto free_return;
890 /* At last, add the offset to the each registers, since we slided
891 the buffers so that we could assume that the matching starts
892 from 0. */
893 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
894 if (pmatch[reg_idx].rm_so != -1)
896 #ifdef RE_ENABLE_I18N
897 if (BE (mctx.input.offsets_needed != 0, 0))
899 pmatch[reg_idx].rm_so =
900 (pmatch[reg_idx].rm_so == mctx.input.valid_len
901 ? mctx.input.valid_raw_len
902 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
903 pmatch[reg_idx].rm_eo =
904 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
905 ? mctx.input.valid_raw_len
906 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
908 #else
909 assert (mctx.input.offsets_needed == 0);
910 #endif
911 pmatch[reg_idx].rm_so += match_first;
912 pmatch[reg_idx].rm_eo += match_first;
914 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
916 pmatch[nmatch + reg_idx].rm_so = -1;
917 pmatch[nmatch + reg_idx].rm_eo = -1;
920 if (dfa->subexp_map)
921 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
922 if (dfa->subexp_map[reg_idx] != reg_idx)
924 pmatch[reg_idx + 1].rm_so
925 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
926 pmatch[reg_idx + 1].rm_eo
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
931 free_return:
932 re_free (mctx.state_log);
933 if (dfa->nbackref)
934 match_ctx_free (&mctx);
935 re_string_destruct (&mctx.input);
936 return err;
939 static reg_errcode_t
940 __attribute_warn_unused_result__
941 prune_impossible_nodes (mctx)
942 re_match_context_t *mctx;
944 const re_dfa_t *const dfa = mctx->dfa;
945 int halt_node, match_last;
946 reg_errcode_t ret;
947 re_dfastate_t **sifted_states;
948 re_dfastate_t **lim_states = NULL;
949 re_sift_context_t sctx;
950 #ifdef DEBUG
951 assert (mctx->state_log != NULL);
952 #endif
953 match_last = mctx->match_last;
954 halt_node = mctx->last_node;
955 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
956 if (BE (sifted_states == NULL, 0))
958 ret = REG_ESPACE;
959 goto free_return;
961 if (dfa->nbackref)
963 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
964 if (BE (lim_states == NULL, 0))
966 ret = REG_ESPACE;
967 goto free_return;
969 while (1)
971 memset (lim_states, '\0',
972 sizeof (re_dfastate_t *) * (match_last + 1));
973 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
974 match_last);
975 ret = sift_states_backward (mctx, &sctx);
976 re_node_set_free (&sctx.limits);
977 if (BE (ret != REG_NOERROR, 0))
978 goto free_return;
979 if (sifted_states[0] != NULL || lim_states[0] != NULL)
980 break;
983 --match_last;
984 if (match_last < 0)
986 ret = REG_NOMATCH;
987 goto free_return;
989 } while (mctx->state_log[match_last] == NULL
990 || !mctx->state_log[match_last]->halt);
991 halt_node = check_halt_state_context (mctx,
992 mctx->state_log[match_last],
993 match_last);
995 ret = merge_state_array (dfa, sifted_states, lim_states,
996 match_last + 1);
997 re_free (lim_states);
998 lim_states = NULL;
999 if (BE (ret != REG_NOERROR, 0))
1000 goto free_return;
1002 else
1004 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1005 ret = sift_states_backward (mctx, &sctx);
1006 re_node_set_free (&sctx.limits);
1007 if (BE (ret != REG_NOERROR, 0))
1008 goto free_return;
1009 if (sifted_states[0] == NULL)
1011 ret = REG_NOMATCH;
1012 goto free_return;
1015 re_free (mctx->state_log);
1016 mctx->state_log = sifted_states;
1017 sifted_states = NULL;
1018 mctx->last_node = halt_node;
1019 mctx->match_last = match_last;
1020 ret = REG_NOERROR;
1021 free_return:
1022 re_free (sifted_states);
1023 re_free (lim_states);
1024 return ret;
1027 /* Acquire an initial state and return it.
1028 We must select appropriate initial state depending on the context,
1029 since initial states may have constraints like "\<", "^", etc.. */
1031 static inline re_dfastate_t *
1032 __attribute ((always_inline)) internal_function
1033 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1034 int idx)
1036 const re_dfa_t *const dfa = mctx->dfa;
1037 if (dfa->init_state->has_constraint)
1039 unsigned int context;
1040 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1041 if (IS_WORD_CONTEXT (context))
1042 return dfa->init_state_word;
1043 else if (IS_ORDINARY_CONTEXT (context))
1044 return dfa->init_state;
1045 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1046 return dfa->init_state_begbuf;
1047 else if (IS_NEWLINE_CONTEXT (context))
1048 return dfa->init_state_nl;
1049 else if (IS_BEGBUF_CONTEXT (context))
1051 /* It is relatively rare case, then calculate on demand. */
1052 return re_acquire_state_context (err, dfa,
1053 dfa->init_state->entrance_nodes,
1054 context);
1056 else
1057 /* Must not happen? */
1058 return dfa->init_state;
1060 else
1061 return dfa->init_state;
1064 /* Check whether the regular expression match input string INPUT or not,
1065 and return the index where the matching end, return -1 if not match,
1066 or return -2 in case of an error.
1067 FL_LONGEST_MATCH means we want the POSIX longest matching.
1068 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1069 next place where we may want to try matching.
1070 Note that the matcher assume that the maching starts from the current
1071 index of the buffer. */
1073 static int
1074 internal_function __attribute_warn_unused_result__
1075 check_matching (re_match_context_t *mctx, int fl_longest_match,
1076 int *p_match_first)
1078 const re_dfa_t *const dfa = mctx->dfa;
1079 reg_errcode_t err;
1080 int match = 0;
1081 int match_last = -1;
1082 int cur_str_idx = re_string_cur_idx (&mctx->input);
1083 re_dfastate_t *cur_state;
1084 int at_init_state = p_match_first != NULL;
1085 int next_start_idx = cur_str_idx;
1087 err = REG_NOERROR;
1088 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1089 /* An initial state must not be NULL (invalid). */
1090 if (BE (cur_state == NULL, 0))
1092 assert (err == REG_ESPACE);
1093 return -2;
1096 if (mctx->state_log != NULL)
1098 mctx->state_log[cur_str_idx] = cur_state;
1100 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1101 later. E.g. Processing back references. */
1102 if (BE (dfa->nbackref, 0))
1104 at_init_state = 0;
1105 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1106 if (BE (err != REG_NOERROR, 0))
1107 return err;
1109 if (cur_state->has_backref)
1111 err = transit_state_bkref (mctx, &cur_state->nodes);
1112 if (BE (err != REG_NOERROR, 0))
1113 return err;
1118 /* If the RE accepts NULL string. */
1119 if (BE (cur_state->halt, 0))
1121 if (!cur_state->has_constraint
1122 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1124 if (!fl_longest_match)
1125 return cur_str_idx;
1126 else
1128 match_last = cur_str_idx;
1129 match = 1;
1134 while (!re_string_eoi (&mctx->input))
1136 re_dfastate_t *old_state = cur_state;
1137 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1139 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1140 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1141 && mctx->input.valid_len < mctx->input.len))
1143 err = extend_buffers (mctx);
1144 if (BE (err != REG_NOERROR, 0))
1146 assert (err == REG_ESPACE);
1147 return -2;
1151 cur_state = transit_state (&err, mctx, cur_state);
1152 if (mctx->state_log != NULL)
1153 cur_state = merge_state_with_log (&err, mctx, cur_state);
1155 if (cur_state == NULL)
1157 /* Reached the invalid state or an error. Try to recover a valid
1158 state using the state log, if available and if we have not
1159 already found a valid (even if not the longest) match. */
1160 if (BE (err != REG_NOERROR, 0))
1161 return -2;
1163 if (mctx->state_log == NULL
1164 || (match && !fl_longest_match)
1165 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1166 break;
1169 if (BE (at_init_state, 0))
1171 if (old_state == cur_state)
1172 next_start_idx = next_char_idx;
1173 else
1174 at_init_state = 0;
1177 if (cur_state->halt)
1179 /* Reached a halt state.
1180 Check the halt state can satisfy the current context. */
1181 if (!cur_state->has_constraint
1182 || check_halt_state_context (mctx, cur_state,
1183 re_string_cur_idx (&mctx->input)))
1185 /* We found an appropriate halt state. */
1186 match_last = re_string_cur_idx (&mctx->input);
1187 match = 1;
1189 /* We found a match, do not modify match_first below. */
1190 p_match_first = NULL;
1191 if (!fl_longest_match)
1192 break;
1197 if (p_match_first)
1198 *p_match_first += next_start_idx;
1200 return match_last;
1203 /* Check NODE match the current context. */
1205 static int
1206 internal_function
1207 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1209 re_token_type_t type = dfa->nodes[node].type;
1210 unsigned int constraint = dfa->nodes[node].constraint;
1211 if (type != END_OF_RE)
1212 return 0;
1213 if (!constraint)
1214 return 1;
1215 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1216 return 0;
1217 return 1;
1220 /* Check the halt state STATE match the current context.
1221 Return 0 if not match, if the node, STATE has, is a halt node and
1222 match the context, return the node. */
1224 static int
1225 internal_function
1226 check_halt_state_context (const re_match_context_t *mctx,
1227 const re_dfastate_t *state, int idx)
1229 int i;
1230 unsigned int context;
1231 #ifdef DEBUG
1232 assert (state->halt);
1233 #endif
1234 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1235 for (i = 0; i < state->nodes.nelem; ++i)
1236 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1237 return state->nodes.elems[i];
1238 return 0;
1241 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1242 corresponding to the DFA).
1243 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1244 of errors. */
1246 static int
1247 internal_function
1248 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1249 int *pidx, int node, re_node_set *eps_via_nodes,
1250 struct re_fail_stack_t *fs)
1252 const re_dfa_t *const dfa = mctx->dfa;
1253 int i, err;
1254 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1256 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1257 re_node_set *edests = &dfa->edests[node];
1258 int dest_node;
1259 err = re_node_set_insert (eps_via_nodes, node);
1260 if (BE (err < 0, 0))
1261 return -2;
1262 /* Pick up a valid destination, or return -1 if none is found. */
1263 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1265 int candidate = edests->elems[i];
1266 if (!re_node_set_contains (cur_nodes, candidate))
1267 continue;
1268 if (dest_node == -1)
1269 dest_node = candidate;
1271 else
1273 /* In order to avoid infinite loop like "(a*)*", return the second
1274 epsilon-transition if the first was already considered. */
1275 if (re_node_set_contains (eps_via_nodes, dest_node))
1276 return candidate;
1278 /* Otherwise, push the second epsilon-transition on the fail stack. */
1279 else if (fs != NULL
1280 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1281 eps_via_nodes))
1282 return -2;
1284 /* We know we are going to exit. */
1285 break;
1288 return dest_node;
1290 else
1292 int naccepted = 0;
1293 re_token_type_t type = dfa->nodes[node].type;
1295 #ifdef RE_ENABLE_I18N
1296 if (dfa->nodes[node].accept_mb)
1297 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1298 else
1299 #endif /* RE_ENABLE_I18N */
1300 if (type == OP_BACK_REF)
1302 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1303 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1304 if (fs != NULL)
1306 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1307 return -1;
1308 else if (naccepted)
1310 char *buf = (char *) re_string_get_buffer (&mctx->input);
1311 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1312 naccepted) != 0)
1313 return -1;
1317 if (naccepted == 0)
1319 int dest_node;
1320 err = re_node_set_insert (eps_via_nodes, node);
1321 if (BE (err < 0, 0))
1322 return -2;
1323 dest_node = dfa->edests[node].elems[0];
1324 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1325 dest_node))
1326 return dest_node;
1330 if (naccepted != 0
1331 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1333 int dest_node = dfa->nexts[node];
1334 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1335 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1336 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1337 dest_node)))
1338 return -1;
1339 re_node_set_empty (eps_via_nodes);
1340 return dest_node;
1343 return -1;
1346 static reg_errcode_t
1347 internal_function __attribute_warn_unused_result__
1348 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1349 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1351 reg_errcode_t err;
1352 int num = fs->num++;
1353 if (fs->num == fs->alloc)
1355 struct re_fail_stack_ent_t *new_array;
1356 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1357 * fs->alloc * 2));
1358 if (new_array == NULL)
1359 return REG_ESPACE;
1360 fs->alloc *= 2;
1361 fs->stack = new_array;
1363 fs->stack[num].idx = str_idx;
1364 fs->stack[num].node = dest_node;
1365 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1366 if (fs->stack[num].regs == NULL)
1367 return REG_ESPACE;
1368 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1369 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1370 return err;
1373 static int
1374 internal_function
1375 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1376 regmatch_t *regs, re_node_set *eps_via_nodes)
1378 int num = --fs->num;
1379 assert (num >= 0);
1380 *pidx = fs->stack[num].idx;
1381 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1382 re_node_set_free (eps_via_nodes);
1383 re_free (fs->stack[num].regs);
1384 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1385 return fs->stack[num].node;
1388 /* Set the positions where the subexpressions are starts/ends to registers
1389 PMATCH.
1390 Note: We assume that pmatch[0] is already set, and
1391 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1393 static reg_errcode_t
1394 internal_function __attribute_warn_unused_result__
1395 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1396 regmatch_t *pmatch, int fl_backtrack)
1398 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1399 int idx, cur_node;
1400 re_node_set eps_via_nodes;
1401 struct re_fail_stack_t *fs;
1402 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1403 regmatch_t *prev_idx_match;
1404 int prev_idx_match_malloced = 0;
1406 #ifdef DEBUG
1407 assert (nmatch > 1);
1408 assert (mctx->state_log != NULL);
1409 #endif
1410 if (fl_backtrack)
1412 fs = &fs_body;
1413 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1414 if (fs->stack == NULL)
1415 return REG_ESPACE;
1417 else
1418 fs = NULL;
1420 cur_node = dfa->init_node;
1421 re_node_set_init_empty (&eps_via_nodes);
1423 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1424 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1425 else
1427 prev_idx_match = re_malloc (regmatch_t, nmatch);
1428 if (prev_idx_match == NULL)
1430 free_fail_stack_return (fs);
1431 return REG_ESPACE;
1433 prev_idx_match_malloced = 1;
1435 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1437 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1439 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1441 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1443 int reg_idx;
1444 if (fs)
1446 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1447 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1448 break;
1449 if (reg_idx == nmatch)
1451 re_node_set_free (&eps_via_nodes);
1452 if (prev_idx_match_malloced)
1453 re_free (prev_idx_match);
1454 return free_fail_stack_return (fs);
1456 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1457 &eps_via_nodes);
1459 else
1461 re_node_set_free (&eps_via_nodes);
1462 if (prev_idx_match_malloced)
1463 re_free (prev_idx_match);
1464 return REG_NOERROR;
1468 /* Proceed to next node. */
1469 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1470 &eps_via_nodes, fs);
1472 if (BE (cur_node < 0, 0))
1474 if (BE (cur_node == -2, 0))
1476 re_node_set_free (&eps_via_nodes);
1477 if (prev_idx_match_malloced)
1478 re_free (prev_idx_match);
1479 free_fail_stack_return (fs);
1480 return REG_ESPACE;
1482 if (fs)
1483 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1484 &eps_via_nodes);
1485 else
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 return REG_NOMATCH;
1494 re_node_set_free (&eps_via_nodes);
1495 if (prev_idx_match_malloced)
1496 re_free (prev_idx_match);
1497 return free_fail_stack_return (fs);
1500 static reg_errcode_t
1501 internal_function
1502 free_fail_stack_return (struct re_fail_stack_t *fs)
1504 if (fs)
1506 int fs_idx;
1507 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1509 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1510 re_free (fs->stack[fs_idx].regs);
1512 re_free (fs->stack);
1514 return REG_NOERROR;
1517 static void
1518 internal_function
1519 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1520 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1522 int type = dfa->nodes[cur_node].type;
1523 if (type == OP_OPEN_SUBEXP)
1525 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1527 /* We are at the first node of this sub expression. */
1528 if (reg_num < nmatch)
1530 pmatch[reg_num].rm_so = cur_idx;
1531 pmatch[reg_num].rm_eo = -1;
1534 else if (type == OP_CLOSE_SUBEXP)
1536 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1537 if (reg_num < nmatch)
1539 /* We are at the last node of this sub expression. */
1540 if (pmatch[reg_num].rm_so < cur_idx)
1542 pmatch[reg_num].rm_eo = cur_idx;
1543 /* This is a non-empty match or we are not inside an optional
1544 subexpression. Accept this right away. */
1545 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1547 else
1549 if (dfa->nodes[cur_node].opt_subexp
1550 && prev_idx_match[reg_num].rm_so != -1)
1551 /* We transited through an empty match for an optional
1552 subexpression, like (a?)*, and this is not the subexp's
1553 first match. Copy back the old content of the registers
1554 so that matches of an inner subexpression are undone as
1555 well, like in ((a?))*. */
1556 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1557 else
1558 /* We completed a subexpression, but it may be part of
1559 an optional one, so do not update PREV_IDX_MATCH. */
1560 pmatch[reg_num].rm_eo = cur_idx;
1566 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1567 and sift the nodes in each states according to the following rules.
1568 Updated state_log will be wrote to STATE_LOG.
1570 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1571 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1572 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1573 the LAST_NODE, we throw away the node `a'.
1574 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1575 string `s' and transit to `b':
1576 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1577 away the node `a'.
1578 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1579 thrown away, we throw away the node `a'.
1580 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1581 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1582 node `a'.
1583 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1584 we throw away the node `a'. */
1586 #define STATE_NODE_CONTAINS(state,node) \
1587 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1589 static reg_errcode_t
1590 internal_function
1591 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1593 reg_errcode_t err;
1594 int null_cnt = 0;
1595 int str_idx = sctx->last_str_idx;
1596 re_node_set cur_dest;
1598 #ifdef DEBUG
1599 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1600 #endif
1602 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1603 transit to the last_node and the last_node itself. */
1604 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1605 if (BE (err != REG_NOERROR, 0))
1606 return err;
1607 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1608 if (BE (err != REG_NOERROR, 0))
1609 goto free_return;
1611 /* Then check each states in the state_log. */
1612 while (str_idx > 0)
1614 /* Update counters. */
1615 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1616 if (null_cnt > mctx->max_mb_elem_len)
1618 memset (sctx->sifted_states, '\0',
1619 sizeof (re_dfastate_t *) * str_idx);
1620 re_node_set_free (&cur_dest);
1621 return REG_NOERROR;
1623 re_node_set_empty (&cur_dest);
1624 --str_idx;
1626 if (mctx->state_log[str_idx])
1628 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1629 if (BE (err != REG_NOERROR, 0))
1630 goto free_return;
1633 /* Add all the nodes which satisfy the following conditions:
1634 - It can epsilon transit to a node in CUR_DEST.
1635 - It is in CUR_SRC.
1636 And update state_log. */
1637 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1638 if (BE (err != REG_NOERROR, 0))
1639 goto free_return;
1641 err = REG_NOERROR;
1642 free_return:
1643 re_node_set_free (&cur_dest);
1644 return err;
1647 static reg_errcode_t
1648 internal_function __attribute_warn_unused_result__
1649 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1650 int str_idx, re_node_set *cur_dest)
1652 const re_dfa_t *const dfa = mctx->dfa;
1653 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1654 int i;
1656 /* Then build the next sifted state.
1657 We build the next sifted state on `cur_dest', and update
1658 `sifted_states[str_idx]' with `cur_dest'.
1659 Note:
1660 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1661 `cur_src' points the node_set of the old `state_log[str_idx]'
1662 (with the epsilon nodes pre-filtered out). */
1663 for (i = 0; i < cur_src->nelem; i++)
1665 int prev_node = cur_src->elems[i];
1666 int naccepted = 0;
1667 int ret;
1669 #ifdef DEBUG
1670 re_token_type_t type = dfa->nodes[prev_node].type;
1671 assert (!IS_EPSILON_NODE (type));
1672 #endif
1673 #ifdef RE_ENABLE_I18N
1674 /* If the node may accept `multi byte'. */
1675 if (dfa->nodes[prev_node].accept_mb)
1676 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1677 str_idx, sctx->last_str_idx);
1678 #endif /* RE_ENABLE_I18N */
1680 /* We don't check backreferences here.
1681 See update_cur_sifted_state(). */
1682 if (!naccepted
1683 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1684 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1685 dfa->nexts[prev_node]))
1686 naccepted = 1;
1688 if (naccepted == 0)
1689 continue;
1691 if (sctx->limits.nelem)
1693 int to_idx = str_idx + naccepted;
1694 if (check_dst_limits (mctx, &sctx->limits,
1695 dfa->nexts[prev_node], to_idx,
1696 prev_node, str_idx))
1697 continue;
1699 ret = re_node_set_insert (cur_dest, prev_node);
1700 if (BE (ret == -1, 0))
1701 return REG_ESPACE;
1704 return REG_NOERROR;
1707 /* Helper functions. */
1709 static reg_errcode_t
1710 internal_function
1711 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1713 int top = mctx->state_log_top;
1715 if (next_state_log_idx >= mctx->input.bufs_len
1716 || (next_state_log_idx >= mctx->input.valid_len
1717 && mctx->input.valid_len < mctx->input.len))
1719 reg_errcode_t err;
1720 err = extend_buffers (mctx);
1721 if (BE (err != REG_NOERROR, 0))
1722 return err;
1725 if (top < next_state_log_idx)
1727 memset (mctx->state_log + top + 1, '\0',
1728 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1729 mctx->state_log_top = next_state_log_idx;
1731 return REG_NOERROR;
1734 static reg_errcode_t
1735 internal_function
1736 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1737 re_dfastate_t **src, int num)
1739 int st_idx;
1740 reg_errcode_t err;
1741 for (st_idx = 0; st_idx < num; ++st_idx)
1743 if (dst[st_idx] == NULL)
1744 dst[st_idx] = src[st_idx];
1745 else if (src[st_idx] != NULL)
1747 re_node_set merged_set;
1748 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1749 &src[st_idx]->nodes);
1750 if (BE (err != REG_NOERROR, 0))
1751 return err;
1752 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1753 re_node_set_free (&merged_set);
1754 if (BE (err != REG_NOERROR, 0))
1755 return err;
1758 return REG_NOERROR;
1761 static reg_errcode_t
1762 internal_function
1763 update_cur_sifted_state (const re_match_context_t *mctx,
1764 re_sift_context_t *sctx, int str_idx,
1765 re_node_set *dest_nodes)
1767 const re_dfa_t *const dfa = mctx->dfa;
1768 reg_errcode_t err = REG_NOERROR;
1769 const re_node_set *candidates;
1770 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1771 : &mctx->state_log[str_idx]->nodes);
1773 if (dest_nodes->nelem == 0)
1774 sctx->sifted_states[str_idx] = NULL;
1775 else
1777 if (candidates)
1779 /* At first, add the nodes which can epsilon transit to a node in
1780 DEST_NODE. */
1781 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1782 if (BE (err != REG_NOERROR, 0))
1783 return err;
1785 /* Then, check the limitations in the current sift_context. */
1786 if (sctx->limits.nelem)
1788 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1789 mctx->bkref_ents, str_idx);
1790 if (BE (err != REG_NOERROR, 0))
1791 return err;
1795 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1796 if (BE (err != REG_NOERROR, 0))
1797 return err;
1800 if (candidates && mctx->state_log[str_idx]->has_backref)
1802 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1803 if (BE (err != REG_NOERROR, 0))
1804 return err;
1806 return REG_NOERROR;
1809 static reg_errcode_t
1810 internal_function __attribute_warn_unused_result__
1811 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1812 const re_node_set *candidates)
1814 reg_errcode_t err = REG_NOERROR;
1815 int i;
1817 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1818 if (BE (err != REG_NOERROR, 0))
1819 return err;
1821 if (!state->inveclosure.alloc)
1823 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1824 if (BE (err != REG_NOERROR, 0))
1825 return REG_ESPACE;
1826 for (i = 0; i < dest_nodes->nelem; i++)
1828 err = re_node_set_merge (&state->inveclosure,
1829 dfa->inveclosures + dest_nodes->elems[i]);
1830 if (BE (err != REG_NOERROR, 0))
1831 return REG_ESPACE;
1834 return re_node_set_add_intersect (dest_nodes, candidates,
1835 &state->inveclosure);
1838 static reg_errcode_t
1839 internal_function
1840 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1841 const re_node_set *candidates)
1843 int ecl_idx;
1844 reg_errcode_t err;
1845 re_node_set *inv_eclosure = dfa->inveclosures + node;
1846 re_node_set except_nodes;
1847 re_node_set_init_empty (&except_nodes);
1848 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1850 int cur_node = inv_eclosure->elems[ecl_idx];
1851 if (cur_node == node)
1852 continue;
1853 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1855 int edst1 = dfa->edests[cur_node].elems[0];
1856 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1857 ? dfa->edests[cur_node].elems[1] : -1);
1858 if ((!re_node_set_contains (inv_eclosure, edst1)
1859 && re_node_set_contains (dest_nodes, edst1))
1860 || (edst2 > 0
1861 && !re_node_set_contains (inv_eclosure, edst2)
1862 && re_node_set_contains (dest_nodes, edst2)))
1864 err = re_node_set_add_intersect (&except_nodes, candidates,
1865 dfa->inveclosures + cur_node);
1866 if (BE (err != REG_NOERROR, 0))
1868 re_node_set_free (&except_nodes);
1869 return err;
1874 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1876 int cur_node = inv_eclosure->elems[ecl_idx];
1877 if (!re_node_set_contains (&except_nodes, cur_node))
1879 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1880 re_node_set_remove_at (dest_nodes, idx);
1883 re_node_set_free (&except_nodes);
1884 return REG_NOERROR;
1887 static int
1888 internal_function
1889 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1890 int dst_node, int dst_idx, int src_node, int src_idx)
1892 const re_dfa_t *const dfa = mctx->dfa;
1893 int lim_idx, src_pos, dst_pos;
1895 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1896 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1897 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1899 int subexp_idx;
1900 struct re_backref_cache_entry *ent;
1901 ent = mctx->bkref_ents + limits->elems[lim_idx];
1902 subexp_idx = dfa->nodes[ent->node].opr.idx;
1904 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1905 subexp_idx, dst_node, dst_idx,
1906 dst_bkref_idx);
1907 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1908 subexp_idx, src_node, src_idx,
1909 src_bkref_idx);
1911 /* In case of:
1912 <src> <dst> ( <subexp> )
1913 ( <subexp> ) <src> <dst>
1914 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1915 if (src_pos == dst_pos)
1916 continue; /* This is unrelated limitation. */
1917 else
1918 return 1;
1920 return 0;
1923 static int
1924 internal_function
1925 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1926 int subexp_idx, int from_node, int bkref_idx)
1928 const re_dfa_t *const dfa = mctx->dfa;
1929 const re_node_set *eclosures = dfa->eclosures + from_node;
1930 int node_idx;
1932 /* Else, we are on the boundary: examine the nodes on the epsilon
1933 closure. */
1934 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1936 int node = eclosures->elems[node_idx];
1937 switch (dfa->nodes[node].type)
1939 case OP_BACK_REF:
1940 if (bkref_idx != -1)
1942 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1945 int dst, cpos;
1947 if (ent->node != node)
1948 continue;
1950 if (subexp_idx < BITSET_WORD_BITS
1951 && !(ent->eps_reachable_subexps_map
1952 & ((bitset_word_t) 1 << subexp_idx)))
1953 continue;
1955 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1956 OP_CLOSE_SUBEXP cases below. But, if the
1957 destination node is the same node as the source
1958 node, don't recurse because it would cause an
1959 infinite loop: a regex that exhibits this behavior
1960 is ()\1*\1* */
1961 dst = dfa->edests[node].elems[0];
1962 if (dst == from_node)
1964 if (boundaries & 1)
1965 return -1;
1966 else /* if (boundaries & 2) */
1967 return 0;
1970 cpos =
1971 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1972 dst, bkref_idx);
1973 if (cpos == -1 /* && (boundaries & 1) */)
1974 return -1;
1975 if (cpos == 0 && (boundaries & 2))
1976 return 0;
1978 if (subexp_idx < BITSET_WORD_BITS)
1979 ent->eps_reachable_subexps_map
1980 &= ~((bitset_word_t) 1 << subexp_idx);
1982 while (ent++->more);
1984 break;
1986 case OP_OPEN_SUBEXP:
1987 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1988 return -1;
1989 break;
1991 case OP_CLOSE_SUBEXP:
1992 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1993 return 0;
1994 break;
1996 default:
1997 break;
2001 return (boundaries & 2) ? 1 : 0;
2004 static int
2005 internal_function
2006 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2007 int subexp_idx, int from_node, int str_idx,
2008 int bkref_idx)
2010 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2011 int boundaries;
2013 /* If we are outside the range of the subexpression, return -1 or 1. */
2014 if (str_idx < lim->subexp_from)
2015 return -1;
2017 if (lim->subexp_to < str_idx)
2018 return 1;
2020 /* If we are within the subexpression, return 0. */
2021 boundaries = (str_idx == lim->subexp_from);
2022 boundaries |= (str_idx == lim->subexp_to) << 1;
2023 if (boundaries == 0)
2024 return 0;
2026 /* Else, examine epsilon closure. */
2027 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2028 from_node, bkref_idx);
2031 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2032 which are against limitations from DEST_NODES. */
2034 static reg_errcode_t
2035 internal_function
2036 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2037 const re_node_set *candidates, re_node_set *limits,
2038 struct re_backref_cache_entry *bkref_ents, int str_idx)
2040 reg_errcode_t err;
2041 int node_idx, lim_idx;
2043 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2045 int subexp_idx;
2046 struct re_backref_cache_entry *ent;
2047 ent = bkref_ents + limits->elems[lim_idx];
2049 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2050 continue; /* This is unrelated limitation. */
2052 subexp_idx = dfa->nodes[ent->node].opr.idx;
2053 if (ent->subexp_to == str_idx)
2055 int ops_node = -1;
2056 int cls_node = -1;
2057 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2059 int node = dest_nodes->elems[node_idx];
2060 re_token_type_t type = dfa->nodes[node].type;
2061 if (type == OP_OPEN_SUBEXP
2062 && subexp_idx == dfa->nodes[node].opr.idx)
2063 ops_node = node;
2064 else if (type == OP_CLOSE_SUBEXP
2065 && subexp_idx == dfa->nodes[node].opr.idx)
2066 cls_node = node;
2069 /* Check the limitation of the open subexpression. */
2070 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2071 if (ops_node >= 0)
2073 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2074 candidates);
2075 if (BE (err != REG_NOERROR, 0))
2076 return err;
2079 /* Check the limitation of the close subexpression. */
2080 if (cls_node >= 0)
2081 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2083 int node = dest_nodes->elems[node_idx];
2084 if (!re_node_set_contains (dfa->inveclosures + node,
2085 cls_node)
2086 && !re_node_set_contains (dfa->eclosures + node,
2087 cls_node))
2089 /* It is against this limitation.
2090 Remove it form the current sifted state. */
2091 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2092 candidates);
2093 if (BE (err != REG_NOERROR, 0))
2094 return err;
2095 --node_idx;
2099 else /* (ent->subexp_to != str_idx) */
2101 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2103 int node = dest_nodes->elems[node_idx];
2104 re_token_type_t type = dfa->nodes[node].type;
2105 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2107 if (subexp_idx != dfa->nodes[node].opr.idx)
2108 continue;
2109 /* It is against this limitation.
2110 Remove it form the current sifted state. */
2111 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2112 candidates);
2113 if (BE (err != REG_NOERROR, 0))
2114 return err;
2119 return REG_NOERROR;
2122 static reg_errcode_t
2123 internal_function __attribute_warn_unused_result__
2124 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2125 int str_idx, const re_node_set *candidates)
2127 const re_dfa_t *const dfa = mctx->dfa;
2128 reg_errcode_t err;
2129 int node_idx, node;
2130 re_sift_context_t local_sctx;
2131 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2133 if (first_idx == -1)
2134 return REG_NOERROR;
2136 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2138 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2140 int enabled_idx;
2141 re_token_type_t type;
2142 struct re_backref_cache_entry *entry;
2143 node = candidates->elems[node_idx];
2144 type = dfa->nodes[node].type;
2145 /* Avoid infinite loop for the REs like "()\1+". */
2146 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2147 continue;
2148 if (type != OP_BACK_REF)
2149 continue;
2151 entry = mctx->bkref_ents + first_idx;
2152 enabled_idx = first_idx;
2155 int subexp_len;
2156 int to_idx;
2157 int dst_node;
2158 int ret;
2159 re_dfastate_t *cur_state;
2161 if (entry->node != node)
2162 continue;
2163 subexp_len = entry->subexp_to - entry->subexp_from;
2164 to_idx = str_idx + subexp_len;
2165 dst_node = (subexp_len ? dfa->nexts[node]
2166 : dfa->edests[node].elems[0]);
2168 if (to_idx > sctx->last_str_idx
2169 || sctx->sifted_states[to_idx] == NULL
2170 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2171 || check_dst_limits (mctx, &sctx->limits, node,
2172 str_idx, dst_node, to_idx))
2173 continue;
2175 if (local_sctx.sifted_states == NULL)
2177 local_sctx = *sctx;
2178 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2179 if (BE (err != REG_NOERROR, 0))
2180 goto free_return;
2182 local_sctx.last_node = node;
2183 local_sctx.last_str_idx = str_idx;
2184 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2185 if (BE (ret < 0, 0))
2187 err = REG_ESPACE;
2188 goto free_return;
2190 cur_state = local_sctx.sifted_states[str_idx];
2191 err = sift_states_backward (mctx, &local_sctx);
2192 if (BE (err != REG_NOERROR, 0))
2193 goto free_return;
2194 if (sctx->limited_states != NULL)
2196 err = merge_state_array (dfa, sctx->limited_states,
2197 local_sctx.sifted_states,
2198 str_idx + 1);
2199 if (BE (err != REG_NOERROR, 0))
2200 goto free_return;
2202 local_sctx.sifted_states[str_idx] = cur_state;
2203 re_node_set_remove (&local_sctx.limits, enabled_idx);
2205 /* mctx->bkref_ents may have changed, reload the pointer. */
2206 entry = mctx->bkref_ents + enabled_idx;
2208 while (enabled_idx++, entry++->more);
2210 err = REG_NOERROR;
2211 free_return:
2212 if (local_sctx.sifted_states != NULL)
2214 re_node_set_free (&local_sctx.limits);
2217 return err;
2221 #ifdef RE_ENABLE_I18N
2222 static int
2223 internal_function
2224 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2225 int node_idx, int str_idx, int max_str_idx)
2227 const re_dfa_t *const dfa = mctx->dfa;
2228 int naccepted;
2229 /* Check the node can accept `multi byte'. */
2230 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2231 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2232 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2233 dfa->nexts[node_idx]))
2234 /* The node can't accept the `multi byte', or the
2235 destination was already thrown away, then the node
2236 could't accept the current input `multi byte'. */
2237 naccepted = 0;
2238 /* Otherwise, it is sure that the node could accept
2239 `naccepted' bytes input. */
2240 return naccepted;
2242 #endif /* RE_ENABLE_I18N */
2245 /* Functions for state transition. */
2247 /* Return the next state to which the current state STATE will transit by
2248 accepting the current input byte, and update STATE_LOG if necessary.
2249 If STATE can accept a multibyte char/collating element/back reference
2250 update the destination of STATE_LOG. */
2252 static re_dfastate_t *
2253 internal_function __attribute_warn_unused_result__
2254 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2255 re_dfastate_t *state)
2257 re_dfastate_t **trtable;
2258 unsigned char ch;
2260 #ifdef RE_ENABLE_I18N
2261 /* If the current state can accept multibyte. */
2262 if (BE (state->accept_mb, 0))
2264 *err = transit_state_mb (mctx, state);
2265 if (BE (*err != REG_NOERROR, 0))
2266 return NULL;
2268 #endif /* RE_ENABLE_I18N */
2270 /* Then decide the next state with the single byte. */
2271 #if 0
2272 if (0)
2273 /* don't use transition table */
2274 return transit_state_sb (err, mctx, state);
2275 #endif
2277 /* Use transition table */
2278 ch = re_string_fetch_byte (&mctx->input);
2279 for (;;)
2281 trtable = state->trtable;
2282 if (BE (trtable != NULL, 1))
2283 return trtable[ch];
2285 trtable = state->word_trtable;
2286 if (BE (trtable != NULL, 1))
2288 unsigned int context;
2289 context
2290 = re_string_context_at (&mctx->input,
2291 re_string_cur_idx (&mctx->input) - 1,
2292 mctx->eflags);
2293 if (IS_WORD_CONTEXT (context))
2294 return trtable[ch + SBC_MAX];
2295 else
2296 return trtable[ch];
2299 if (!build_trtable (mctx->dfa, state))
2301 *err = REG_ESPACE;
2302 return NULL;
2305 /* Retry, we now have a transition table. */
2309 /* Update the state_log if we need */
2310 re_dfastate_t *
2311 internal_function
2312 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2313 re_dfastate_t *next_state)
2315 const re_dfa_t *const dfa = mctx->dfa;
2316 int cur_idx = re_string_cur_idx (&mctx->input);
2318 if (cur_idx > mctx->state_log_top)
2320 mctx->state_log[cur_idx] = next_state;
2321 mctx->state_log_top = cur_idx;
2323 else if (mctx->state_log[cur_idx] == 0)
2325 mctx->state_log[cur_idx] = next_state;
2327 else
2329 re_dfastate_t *pstate;
2330 unsigned int context;
2331 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2332 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2333 the destination of a multibyte char/collating element/
2334 back reference. Then the next state is the union set of
2335 these destinations and the results of the transition table. */
2336 pstate = mctx->state_log[cur_idx];
2337 log_nodes = pstate->entrance_nodes;
2338 if (next_state != NULL)
2340 table_nodes = next_state->entrance_nodes;
2341 *err = re_node_set_init_union (&next_nodes, table_nodes,
2342 log_nodes);
2343 if (BE (*err != REG_NOERROR, 0))
2344 return NULL;
2346 else
2347 next_nodes = *log_nodes;
2348 /* Note: We already add the nodes of the initial state,
2349 then we don't need to add them here. */
2351 context = re_string_context_at (&mctx->input,
2352 re_string_cur_idx (&mctx->input) - 1,
2353 mctx->eflags);
2354 next_state = mctx->state_log[cur_idx]
2355 = re_acquire_state_context (err, dfa, &next_nodes, context);
2356 /* We don't need to check errors here, since the return value of
2357 this function is next_state and ERR is already set. */
2359 if (table_nodes != NULL)
2360 re_node_set_free (&next_nodes);
2363 if (BE (dfa->nbackref, 0) && next_state != NULL)
2365 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2366 later. We must check them here, since the back references in the
2367 next state might use them. */
2368 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2369 cur_idx);
2370 if (BE (*err != REG_NOERROR, 0))
2371 return NULL;
2373 /* If the next state has back references. */
2374 if (next_state->has_backref)
2376 *err = transit_state_bkref (mctx, &next_state->nodes);
2377 if (BE (*err != REG_NOERROR, 0))
2378 return NULL;
2379 next_state = mctx->state_log[cur_idx];
2383 return next_state;
2386 /* Skip bytes in the input that correspond to part of a
2387 multi-byte match, then look in the log for a state
2388 from which to restart matching. */
2389 re_dfastate_t *
2390 internal_function
2391 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2393 re_dfastate_t *cur_state;
2396 int max = mctx->state_log_top;
2397 int cur_str_idx = re_string_cur_idx (&mctx->input);
2401 if (++cur_str_idx > max)
2402 return NULL;
2403 re_string_skip_bytes (&mctx->input, 1);
2405 while (mctx->state_log[cur_str_idx] == NULL);
2407 cur_state = merge_state_with_log (err, mctx, NULL);
2409 while (*err == REG_NOERROR && cur_state == NULL);
2410 return cur_state;
2413 /* Helper functions for transit_state. */
2415 /* From the node set CUR_NODES, pick up the nodes whose types are
2416 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2417 expression. And register them to use them later for evaluating the
2418 correspoding back references. */
2420 static reg_errcode_t
2421 internal_function
2422 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2423 int str_idx)
2425 const re_dfa_t *const dfa = mctx->dfa;
2426 int node_idx;
2427 reg_errcode_t err;
2429 /* TODO: This isn't efficient.
2430 Because there might be more than one nodes whose types are
2431 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2432 nodes.
2433 E.g. RE: (a){2} */
2434 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2436 int node = cur_nodes->elems[node_idx];
2437 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2438 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2439 && (dfa->used_bkref_map
2440 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2442 err = match_ctx_add_subtop (mctx, node, str_idx);
2443 if (BE (err != REG_NOERROR, 0))
2444 return err;
2447 return REG_NOERROR;
2450 #if 0
2451 /* Return the next state to which the current state STATE will transit by
2452 accepting the current input byte. */
2454 static re_dfastate_t *
2455 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2456 re_dfastate_t *state)
2458 const re_dfa_t *const dfa = mctx->dfa;
2459 re_node_set next_nodes;
2460 re_dfastate_t *next_state;
2461 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2462 unsigned int context;
2464 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2465 if (BE (*err != REG_NOERROR, 0))
2466 return NULL;
2467 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2469 int cur_node = state->nodes.elems[node_cnt];
2470 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2472 *err = re_node_set_merge (&next_nodes,
2473 dfa->eclosures + dfa->nexts[cur_node]);
2474 if (BE (*err != REG_NOERROR, 0))
2476 re_node_set_free (&next_nodes);
2477 return NULL;
2481 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2482 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2483 /* We don't need to check errors here, since the return value of
2484 this function is next_state and ERR is already set. */
2486 re_node_set_free (&next_nodes);
2487 re_string_skip_bytes (&mctx->input, 1);
2488 return next_state;
2490 #endif
2492 #ifdef RE_ENABLE_I18N
2493 static reg_errcode_t
2494 internal_function
2495 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2497 const re_dfa_t *const dfa = mctx->dfa;
2498 reg_errcode_t err;
2499 int i;
2501 for (i = 0; i < pstate->nodes.nelem; ++i)
2503 re_node_set dest_nodes, *new_nodes;
2504 int cur_node_idx = pstate->nodes.elems[i];
2505 int naccepted, dest_idx;
2506 unsigned int context;
2507 re_dfastate_t *dest_state;
2509 if (!dfa->nodes[cur_node_idx].accept_mb)
2510 continue;
2512 if (dfa->nodes[cur_node_idx].constraint)
2514 context = re_string_context_at (&mctx->input,
2515 re_string_cur_idx (&mctx->input),
2516 mctx->eflags);
2517 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2518 context))
2519 continue;
2522 /* How many bytes the node can accept? */
2523 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2524 re_string_cur_idx (&mctx->input));
2525 if (naccepted == 0)
2526 continue;
2528 /* The node can accepts `naccepted' bytes. */
2529 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2530 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2531 : mctx->max_mb_elem_len);
2532 err = clean_state_log_if_needed (mctx, dest_idx);
2533 if (BE (err != REG_NOERROR, 0))
2534 return err;
2535 #ifdef DEBUG
2536 assert (dfa->nexts[cur_node_idx] != -1);
2537 #endif
2538 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2540 dest_state = mctx->state_log[dest_idx];
2541 if (dest_state == NULL)
2542 dest_nodes = *new_nodes;
2543 else
2545 err = re_node_set_init_union (&dest_nodes,
2546 dest_state->entrance_nodes, new_nodes);
2547 if (BE (err != REG_NOERROR, 0))
2548 return err;
2550 context = re_string_context_at (&mctx->input, dest_idx - 1,
2551 mctx->eflags);
2552 mctx->state_log[dest_idx]
2553 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2554 if (dest_state != NULL)
2555 re_node_set_free (&dest_nodes);
2556 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2557 return err;
2559 return REG_NOERROR;
2561 #endif /* RE_ENABLE_I18N */
2563 static reg_errcode_t
2564 internal_function
2565 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2567 const re_dfa_t *const dfa = mctx->dfa;
2568 reg_errcode_t err;
2569 int i;
2570 int cur_str_idx = re_string_cur_idx (&mctx->input);
2572 for (i = 0; i < nodes->nelem; ++i)
2574 int dest_str_idx, prev_nelem, bkc_idx;
2575 int node_idx = nodes->elems[i];
2576 unsigned int context;
2577 const re_token_t *node = dfa->nodes + node_idx;
2578 re_node_set *new_dest_nodes;
2580 /* Check whether `node' is a backreference or not. */
2581 if (node->type != OP_BACK_REF)
2582 continue;
2584 if (node->constraint)
2586 context = re_string_context_at (&mctx->input, cur_str_idx,
2587 mctx->eflags);
2588 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2589 continue;
2592 /* `node' is a backreference.
2593 Check the substring which the substring matched. */
2594 bkc_idx = mctx->nbkref_ents;
2595 err = get_subexp (mctx, node_idx, cur_str_idx);
2596 if (BE (err != REG_NOERROR, 0))
2597 goto free_return;
2599 /* And add the epsilon closures (which is `new_dest_nodes') of
2600 the backreference to appropriate state_log. */
2601 #ifdef DEBUG
2602 assert (dfa->nexts[node_idx] != -1);
2603 #endif
2604 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2606 int subexp_len;
2607 re_dfastate_t *dest_state;
2608 struct re_backref_cache_entry *bkref_ent;
2609 bkref_ent = mctx->bkref_ents + bkc_idx;
2610 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2611 continue;
2612 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2613 new_dest_nodes = (subexp_len == 0
2614 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2615 : dfa->eclosures + dfa->nexts[node_idx]);
2616 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2617 - bkref_ent->subexp_from);
2618 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2619 mctx->eflags);
2620 dest_state = mctx->state_log[dest_str_idx];
2621 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2622 : mctx->state_log[cur_str_idx]->nodes.nelem);
2623 /* Add `new_dest_node' to state_log. */
2624 if (dest_state == NULL)
2626 mctx->state_log[dest_str_idx]
2627 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2628 context);
2629 if (BE (mctx->state_log[dest_str_idx] == NULL
2630 && err != REG_NOERROR, 0))
2631 goto free_return;
2633 else
2635 re_node_set dest_nodes;
2636 err = re_node_set_init_union (&dest_nodes,
2637 dest_state->entrance_nodes,
2638 new_dest_nodes);
2639 if (BE (err != REG_NOERROR, 0))
2641 re_node_set_free (&dest_nodes);
2642 goto free_return;
2644 mctx->state_log[dest_str_idx]
2645 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2646 re_node_set_free (&dest_nodes);
2647 if (BE (mctx->state_log[dest_str_idx] == NULL
2648 && err != REG_NOERROR, 0))
2649 goto free_return;
2651 /* We need to check recursively if the backreference can epsilon
2652 transit. */
2653 if (subexp_len == 0
2654 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2656 err = check_subexp_matching_top (mctx, new_dest_nodes,
2657 cur_str_idx);
2658 if (BE (err != REG_NOERROR, 0))
2659 goto free_return;
2660 err = transit_state_bkref (mctx, new_dest_nodes);
2661 if (BE (err != REG_NOERROR, 0))
2662 goto free_return;
2666 err = REG_NOERROR;
2667 free_return:
2668 return err;
2671 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2672 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2673 Note that we might collect inappropriate candidates here.
2674 However, the cost of checking them strictly here is too high, then we
2675 delay these checking for prune_impossible_nodes(). */
2677 static reg_errcode_t
2678 internal_function __attribute_warn_unused_result__
2679 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2681 const re_dfa_t *const dfa = mctx->dfa;
2682 int subexp_num, sub_top_idx;
2683 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2684 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2685 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2686 if (cache_idx != -1)
2688 const struct re_backref_cache_entry *entry
2689 = mctx->bkref_ents + cache_idx;
2691 if (entry->node == bkref_node)
2692 return REG_NOERROR; /* We already checked it. */
2693 while (entry++->more);
2696 subexp_num = dfa->nodes[bkref_node].opr.idx;
2698 /* For each sub expression */
2699 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2701 reg_errcode_t err;
2702 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2703 re_sub_match_last_t *sub_last;
2704 int sub_last_idx, sl_str, bkref_str_off;
2706 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2707 continue; /* It isn't related. */
2709 sl_str = sub_top->str_idx;
2710 bkref_str_off = bkref_str_idx;
2711 /* At first, check the last node of sub expressions we already
2712 evaluated. */
2713 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2715 int sl_str_diff;
2716 sub_last = sub_top->lasts[sub_last_idx];
2717 sl_str_diff = sub_last->str_idx - sl_str;
2718 /* The matched string by the sub expression match with the substring
2719 at the back reference? */
2720 if (sl_str_diff > 0)
2722 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2724 /* Not enough chars for a successful match. */
2725 if (bkref_str_off + sl_str_diff > mctx->input.len)
2726 break;
2728 err = clean_state_log_if_needed (mctx,
2729 bkref_str_off
2730 + sl_str_diff);
2731 if (BE (err != REG_NOERROR, 0))
2732 return err;
2733 buf = (const char *) re_string_get_buffer (&mctx->input);
2735 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2736 /* We don't need to search this sub expression any more. */
2737 break;
2739 bkref_str_off += sl_str_diff;
2740 sl_str += sl_str_diff;
2741 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2742 bkref_str_idx);
2744 /* Reload buf, since the preceding call might have reallocated
2745 the buffer. */
2746 buf = (const char *) re_string_get_buffer (&mctx->input);
2748 if (err == REG_NOMATCH)
2749 continue;
2750 if (BE (err != REG_NOERROR, 0))
2751 return err;
2754 if (sub_last_idx < sub_top->nlasts)
2755 continue;
2756 if (sub_last_idx > 0)
2757 ++sl_str;
2758 /* Then, search for the other last nodes of the sub expression. */
2759 for (; sl_str <= bkref_str_idx; ++sl_str)
2761 int cls_node, sl_str_off;
2762 const re_node_set *nodes;
2763 sl_str_off = sl_str - sub_top->str_idx;
2764 /* The matched string by the sub expression match with the substring
2765 at the back reference? */
2766 if (sl_str_off > 0)
2768 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2770 /* If we are at the end of the input, we cannot match. */
2771 if (bkref_str_off >= mctx->input.len)
2772 break;
2774 err = extend_buffers (mctx);
2775 if (BE (err != REG_NOERROR, 0))
2776 return err;
2778 buf = (const char *) re_string_get_buffer (&mctx->input);
2780 if (buf [bkref_str_off++] != buf[sl_str - 1])
2781 break; /* We don't need to search this sub expression
2782 any more. */
2784 if (mctx->state_log[sl_str] == NULL)
2785 continue;
2786 /* Does this state have a ')' of the sub expression? */
2787 nodes = &mctx->state_log[sl_str]->nodes;
2788 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2789 OP_CLOSE_SUBEXP);
2790 if (cls_node == -1)
2791 continue; /* No. */
2792 if (sub_top->path == NULL)
2794 sub_top->path = calloc (sizeof (state_array_t),
2795 sl_str - sub_top->str_idx + 1);
2796 if (sub_top->path == NULL)
2797 return REG_ESPACE;
2799 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2800 in the current context? */
2801 err = check_arrival (mctx, sub_top->path, sub_top->node,
2802 sub_top->str_idx, cls_node, sl_str,
2803 OP_CLOSE_SUBEXP);
2804 if (err == REG_NOMATCH)
2805 continue;
2806 if (BE (err != REG_NOERROR, 0))
2807 return err;
2808 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2809 if (BE (sub_last == NULL, 0))
2810 return REG_ESPACE;
2811 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2812 bkref_str_idx);
2813 if (err == REG_NOMATCH)
2814 continue;
2817 return REG_NOERROR;
2820 /* Helper functions for get_subexp(). */
2822 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2823 If it can arrive, register the sub expression expressed with SUB_TOP
2824 and SUB_LAST. */
2826 static reg_errcode_t
2827 internal_function
2828 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2829 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2831 reg_errcode_t err;
2832 int to_idx;
2833 /* Can the subexpression arrive the back reference? */
2834 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2835 sub_last->str_idx, bkref_node, bkref_str,
2836 OP_OPEN_SUBEXP);
2837 if (err != REG_NOERROR)
2838 return err;
2839 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2840 sub_last->str_idx);
2841 if (BE (err != REG_NOERROR, 0))
2842 return err;
2843 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2844 return clean_state_log_if_needed (mctx, to_idx);
2847 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2848 Search '(' if FL_OPEN, or search ')' otherwise.
2849 TODO: This function isn't efficient...
2850 Because there might be more than one nodes whose types are
2851 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2852 nodes.
2853 E.g. RE: (a){2} */
2855 static int
2856 internal_function
2857 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2858 int subexp_idx, int type)
2860 int cls_idx;
2861 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2863 int cls_node = nodes->elems[cls_idx];
2864 const re_token_t *node = dfa->nodes + cls_node;
2865 if (node->type == type
2866 && node->opr.idx == subexp_idx)
2867 return cls_node;
2869 return -1;
2872 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2873 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2874 heavily reused.
2875 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2877 static reg_errcode_t
2878 internal_function __attribute_warn_unused_result__
2879 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2880 int top_str, int last_node, int last_str, int type)
2882 const re_dfa_t *const dfa = mctx->dfa;
2883 reg_errcode_t err = REG_NOERROR;
2884 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2885 re_dfastate_t *cur_state = NULL;
2886 re_node_set *cur_nodes, next_nodes;
2887 re_dfastate_t **backup_state_log;
2888 unsigned int context;
2890 subexp_num = dfa->nodes[top_node].opr.idx;
2891 /* Extend the buffer if we need. */
2892 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2894 re_dfastate_t **new_array;
2895 int old_alloc = path->alloc;
2896 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2897 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2898 if (BE (new_array == NULL, 0))
2900 path->alloc = old_alloc;
2901 return REG_ESPACE;
2903 path->array = new_array;
2904 memset (new_array + old_alloc, '\0',
2905 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2908 str_idx = path->next_idx ?: top_str;
2910 /* Temporary modify MCTX. */
2911 backup_state_log = mctx->state_log;
2912 backup_cur_idx = mctx->input.cur_idx;
2913 mctx->state_log = path->array;
2914 mctx->input.cur_idx = str_idx;
2916 /* Setup initial node set. */
2917 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2918 if (str_idx == top_str)
2920 err = re_node_set_init_1 (&next_nodes, top_node);
2921 if (BE (err != REG_NOERROR, 0))
2922 return err;
2923 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2924 if (BE (err != REG_NOERROR, 0))
2926 re_node_set_free (&next_nodes);
2927 return err;
2930 else
2932 cur_state = mctx->state_log[str_idx];
2933 if (cur_state && cur_state->has_backref)
2935 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2936 if (BE (err != REG_NOERROR, 0))
2937 return err;
2939 else
2940 re_node_set_init_empty (&next_nodes);
2942 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2944 if (next_nodes.nelem)
2946 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2947 subexp_num, type);
2948 if (BE (err != REG_NOERROR, 0))
2950 re_node_set_free (&next_nodes);
2951 return err;
2954 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2955 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2957 re_node_set_free (&next_nodes);
2958 return err;
2960 mctx->state_log[str_idx] = cur_state;
2963 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2965 re_node_set_empty (&next_nodes);
2966 if (mctx->state_log[str_idx + 1])
2968 err = re_node_set_merge (&next_nodes,
2969 &mctx->state_log[str_idx + 1]->nodes);
2970 if (BE (err != REG_NOERROR, 0))
2972 re_node_set_free (&next_nodes);
2973 return err;
2976 if (cur_state)
2978 err = check_arrival_add_next_nodes (mctx, str_idx,
2979 &cur_state->non_eps_nodes,
2980 &next_nodes);
2981 if (BE (err != REG_NOERROR, 0))
2983 re_node_set_free (&next_nodes);
2984 return err;
2987 ++str_idx;
2988 if (next_nodes.nelem)
2990 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2991 if (BE (err != REG_NOERROR, 0))
2993 re_node_set_free (&next_nodes);
2994 return err;
2996 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2997 subexp_num, type);
2998 if (BE (err != REG_NOERROR, 0))
3000 re_node_set_free (&next_nodes);
3001 return err;
3004 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3005 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3006 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3008 re_node_set_free (&next_nodes);
3009 return err;
3011 mctx->state_log[str_idx] = cur_state;
3012 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3014 re_node_set_free (&next_nodes);
3015 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3016 : &mctx->state_log[last_str]->nodes);
3017 path->next_idx = str_idx;
3019 /* Fix MCTX. */
3020 mctx->state_log = backup_state_log;
3021 mctx->input.cur_idx = backup_cur_idx;
3023 /* Then check the current node set has the node LAST_NODE. */
3024 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3025 return REG_NOERROR;
3027 return REG_NOMATCH;
3030 /* Helper functions for check_arrival. */
3032 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3033 to NEXT_NODES.
3034 TODO: This function is similar to the functions transit_state*(),
3035 however this function has many additional works.
3036 Can't we unify them? */
3038 static reg_errcode_t
3039 internal_function __attribute_warn_unused_result__
3040 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3041 re_node_set *cur_nodes, re_node_set *next_nodes)
3043 const re_dfa_t *const dfa = mctx->dfa;
3044 int result;
3045 int cur_idx;
3046 reg_errcode_t err = REG_NOERROR;
3047 re_node_set union_set;
3048 re_node_set_init_empty (&union_set);
3049 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3051 int naccepted = 0;
3052 int cur_node = cur_nodes->elems[cur_idx];
3053 #ifdef DEBUG
3054 re_token_type_t type = dfa->nodes[cur_node].type;
3055 assert (!IS_EPSILON_NODE (type));
3056 #endif
3057 #ifdef RE_ENABLE_I18N
3058 /* If the node may accept `multi byte'. */
3059 if (dfa->nodes[cur_node].accept_mb)
3061 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3062 str_idx);
3063 if (naccepted > 1)
3065 re_dfastate_t *dest_state;
3066 int next_node = dfa->nexts[cur_node];
3067 int next_idx = str_idx + naccepted;
3068 dest_state = mctx->state_log[next_idx];
3069 re_node_set_empty (&union_set);
3070 if (dest_state)
3072 err = re_node_set_merge (&union_set, &dest_state->nodes);
3073 if (BE (err != REG_NOERROR, 0))
3075 re_node_set_free (&union_set);
3076 return err;
3079 result = re_node_set_insert (&union_set, next_node);
3080 if (BE (result < 0, 0))
3082 re_node_set_free (&union_set);
3083 return REG_ESPACE;
3085 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3086 &union_set);
3087 if (BE (mctx->state_log[next_idx] == NULL
3088 && err != REG_NOERROR, 0))
3090 re_node_set_free (&union_set);
3091 return err;
3095 #endif /* RE_ENABLE_I18N */
3096 if (naccepted
3097 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3099 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3100 if (BE (result < 0, 0))
3102 re_node_set_free (&union_set);
3103 return REG_ESPACE;
3107 re_node_set_free (&union_set);
3108 return REG_NOERROR;
3111 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3112 CUR_NODES, however exclude the nodes which are:
3113 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3114 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3117 static reg_errcode_t
3118 internal_function
3119 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3120 int ex_subexp, int type)
3122 reg_errcode_t err;
3123 int idx, outside_node;
3124 re_node_set new_nodes;
3125 #ifdef DEBUG
3126 assert (cur_nodes->nelem);
3127 #endif
3128 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3129 if (BE (err != REG_NOERROR, 0))
3130 return err;
3131 /* Create a new node set NEW_NODES with the nodes which are epsilon
3132 closures of the node in CUR_NODES. */
3134 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3136 int cur_node = cur_nodes->elems[idx];
3137 const re_node_set *eclosure = dfa->eclosures + cur_node;
3138 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3139 if (outside_node == -1)
3141 /* There are no problematic nodes, just merge them. */
3142 err = re_node_set_merge (&new_nodes, eclosure);
3143 if (BE (err != REG_NOERROR, 0))
3145 re_node_set_free (&new_nodes);
3146 return err;
3149 else
3151 /* There are problematic nodes, re-calculate incrementally. */
3152 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3153 ex_subexp, type);
3154 if (BE (err != REG_NOERROR, 0))
3156 re_node_set_free (&new_nodes);
3157 return err;
3161 re_node_set_free (cur_nodes);
3162 *cur_nodes = new_nodes;
3163 return REG_NOERROR;
3166 /* Helper function for check_arrival_expand_ecl.
3167 Check incrementally the epsilon closure of TARGET, and if it isn't
3168 problematic append it to DST_NODES. */
3170 static reg_errcode_t
3171 internal_function __attribute_warn_unused_result__
3172 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3173 int target, int ex_subexp, int type)
3175 int cur_node;
3176 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3178 int err;
3180 if (dfa->nodes[cur_node].type == type
3181 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3183 if (type == OP_CLOSE_SUBEXP)
3185 err = re_node_set_insert (dst_nodes, cur_node);
3186 if (BE (err == -1, 0))
3187 return REG_ESPACE;
3189 break;
3191 err = re_node_set_insert (dst_nodes, cur_node);
3192 if (BE (err == -1, 0))
3193 return REG_ESPACE;
3194 if (dfa->edests[cur_node].nelem == 0)
3195 break;
3196 if (dfa->edests[cur_node].nelem == 2)
3198 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3199 dfa->edests[cur_node].elems[1],
3200 ex_subexp, type);
3201 if (BE (err != REG_NOERROR, 0))
3202 return err;
3204 cur_node = dfa->edests[cur_node].elems[0];
3206 return REG_NOERROR;
3210 /* For all the back references in the current state, calculate the
3211 destination of the back references by the appropriate entry
3212 in MCTX->BKREF_ENTS. */
3214 static reg_errcode_t
3215 internal_function __attribute_warn_unused_result__
3216 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3217 int cur_str, int subexp_num, int type)
3219 const re_dfa_t *const dfa = mctx->dfa;
3220 reg_errcode_t err;
3221 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3222 struct re_backref_cache_entry *ent;
3224 if (cache_idx_start == -1)
3225 return REG_NOERROR;
3227 restart:
3228 ent = mctx->bkref_ents + cache_idx_start;
3231 int to_idx, next_node;
3233 /* Is this entry ENT is appropriate? */
3234 if (!re_node_set_contains (cur_nodes, ent->node))
3235 continue; /* No. */
3237 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3238 /* Calculate the destination of the back reference, and append it
3239 to MCTX->STATE_LOG. */
3240 if (to_idx == cur_str)
3242 /* The backreference did epsilon transit, we must re-check all the
3243 node in the current state. */
3244 re_node_set new_dests;
3245 reg_errcode_t err2, err3;
3246 next_node = dfa->edests[ent->node].elems[0];
3247 if (re_node_set_contains (cur_nodes, next_node))
3248 continue;
3249 err = re_node_set_init_1 (&new_dests, next_node);
3250 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3251 err3 = re_node_set_merge (cur_nodes, &new_dests);
3252 re_node_set_free (&new_dests);
3253 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3254 || err3 != REG_NOERROR, 0))
3256 err = (err != REG_NOERROR ? err
3257 : (err2 != REG_NOERROR ? err2 : err3));
3258 return err;
3260 /* TODO: It is still inefficient... */
3261 goto restart;
3263 else
3265 re_node_set union_set;
3266 next_node = dfa->nexts[ent->node];
3267 if (mctx->state_log[to_idx])
3269 int ret;
3270 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3271 next_node))
3272 continue;
3273 err = re_node_set_init_copy (&union_set,
3274 &mctx->state_log[to_idx]->nodes);
3275 ret = re_node_set_insert (&union_set, next_node);
3276 if (BE (err != REG_NOERROR || ret < 0, 0))
3278 re_node_set_free (&union_set);
3279 err = err != REG_NOERROR ? err : REG_ESPACE;
3280 return err;
3283 else
3285 err = re_node_set_init_1 (&union_set, next_node);
3286 if (BE (err != REG_NOERROR, 0))
3287 return err;
3289 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3290 re_node_set_free (&union_set);
3291 if (BE (mctx->state_log[to_idx] == NULL
3292 && err != REG_NOERROR, 0))
3293 return err;
3296 while (ent++->more);
3297 return REG_NOERROR;
3300 /* Build transition table for the state.
3301 Return 1 if succeeded, otherwise return NULL. */
3303 static int
3304 internal_function
3305 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3307 reg_errcode_t err;
3308 int i, j, ch, need_word_trtable = 0;
3309 bitset_word_t elem, mask;
3310 bool dests_node_malloced = false;
3311 bool dest_states_malloced = false;
3312 int ndests; /* Number of the destination states from `state'. */
3313 re_dfastate_t **trtable;
3314 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3315 re_node_set follows, *dests_node;
3316 bitset_t *dests_ch;
3317 bitset_t acceptable;
3319 struct dests_alloc
3321 re_node_set dests_node[SBC_MAX];
3322 bitset_t dests_ch[SBC_MAX];
3323 } *dests_alloc;
3325 /* We build DFA states which corresponds to the destination nodes
3326 from `state'. `dests_node[i]' represents the nodes which i-th
3327 destination state contains, and `dests_ch[i]' represents the
3328 characters which i-th destination state accepts. */
3329 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3330 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3331 else
3333 dests_alloc = re_malloc (struct dests_alloc, 1);
3334 if (BE (dests_alloc == NULL, 0))
3335 return 0;
3336 dests_node_malloced = true;
3338 dests_node = dests_alloc->dests_node;
3339 dests_ch = dests_alloc->dests_ch;
3341 /* Initialize transiton table. */
3342 state->word_trtable = state->trtable = NULL;
3344 /* At first, group all nodes belonging to `state' into several
3345 destinations. */
3346 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3347 if (BE (ndests <= 0, 0))
3349 if (dests_node_malloced)
3350 free (dests_alloc);
3351 /* Return 0 in case of an error, 1 otherwise. */
3352 if (ndests == 0)
3354 state->trtable = (re_dfastate_t **)
3355 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3356 return 1;
3358 return 0;
3361 err = re_node_set_alloc (&follows, ndests + 1);
3362 if (BE (err != REG_NOERROR, 0))
3363 goto out_free;
3365 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3366 + ndests * 3 * sizeof (re_dfastate_t *)))
3367 dest_states = (re_dfastate_t **)
3368 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3369 else
3371 dest_states = (re_dfastate_t **)
3372 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3373 if (BE (dest_states == NULL, 0))
3375 out_free:
3376 if (dest_states_malloced)
3377 free (dest_states);
3378 re_node_set_free (&follows);
3379 for (i = 0; i < ndests; ++i)
3380 re_node_set_free (dests_node + i);
3381 if (dests_node_malloced)
3382 free (dests_alloc);
3383 return 0;
3385 dest_states_malloced = true;
3387 dest_states_word = dest_states + ndests;
3388 dest_states_nl = dest_states_word + ndests;
3389 bitset_empty (acceptable);
3391 /* Then build the states for all destinations. */
3392 for (i = 0; i < ndests; ++i)
3394 int next_node;
3395 re_node_set_empty (&follows);
3396 /* Merge the follows of this destination states. */
3397 for (j = 0; j < dests_node[i].nelem; ++j)
3399 next_node = dfa->nexts[dests_node[i].elems[j]];
3400 if (next_node != -1)
3402 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3403 if (BE (err != REG_NOERROR, 0))
3404 goto out_free;
3407 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3408 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3409 goto out_free;
3410 /* If the new state has context constraint,
3411 build appropriate states for these contexts. */
3412 if (dest_states[i]->has_constraint)
3414 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3415 CONTEXT_WORD);
3416 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3417 goto out_free;
3419 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3420 need_word_trtable = 1;
3422 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3423 CONTEXT_NEWLINE);
3424 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3425 goto out_free;
3427 else
3429 dest_states_word[i] = dest_states[i];
3430 dest_states_nl[i] = dest_states[i];
3432 bitset_merge (acceptable, dests_ch[i]);
3435 if (!BE (need_word_trtable, 0))
3437 /* We don't care about whether the following character is a word
3438 character, or we are in a single-byte character set so we can
3439 discern by looking at the character code: allocate a
3440 256-entry transition table. */
3441 trtable = state->trtable =
3442 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3443 if (BE (trtable == NULL, 0))
3444 goto out_free;
3446 /* For all characters ch...: */
3447 for (i = 0; i < BITSET_WORDS; ++i)
3448 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3449 elem;
3450 mask <<= 1, elem >>= 1, ++ch)
3451 if (BE (elem & 1, 0))
3453 /* There must be exactly one destination which accepts
3454 character ch. See group_nodes_into_DFAstates. */
3455 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3458 /* j-th destination accepts the word character ch. */
3459 if (dfa->word_char[i] & mask)
3460 trtable[ch] = dest_states_word[j];
3461 else
3462 trtable[ch] = dest_states[j];
3465 else
3467 /* We care about whether the following character is a word
3468 character, and we are in a multi-byte character set: discern
3469 by looking at the character code: build two 256-entry
3470 transition tables, one starting at trtable[0] and one
3471 starting at trtable[SBC_MAX]. */
3472 trtable = state->word_trtable =
3473 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3474 if (BE (trtable == NULL, 0))
3475 goto out_free;
3477 /* For all characters ch...: */
3478 for (i = 0; i < BITSET_WORDS; ++i)
3479 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3480 elem;
3481 mask <<= 1, elem >>= 1, ++ch)
3482 if (BE (elem & 1, 0))
3484 /* There must be exactly one destination which accepts
3485 character ch. See group_nodes_into_DFAstates. */
3486 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3489 /* j-th destination accepts the word character ch. */
3490 trtable[ch] = dest_states[j];
3491 trtable[ch + SBC_MAX] = dest_states_word[j];
3495 /* new line */
3496 if (bitset_contain (acceptable, NEWLINE_CHAR))
3498 /* The current state accepts newline character. */
3499 for (j = 0; j < ndests; ++j)
3500 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3502 /* k-th destination accepts newline character. */
3503 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3504 if (need_word_trtable)
3505 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3506 /* There must be only one destination which accepts
3507 newline. See group_nodes_into_DFAstates. */
3508 break;
3512 if (dest_states_malloced)
3513 free (dest_states);
3515 re_node_set_free (&follows);
3516 for (i = 0; i < ndests; ++i)
3517 re_node_set_free (dests_node + i);
3519 if (dests_node_malloced)
3520 free (dests_alloc);
3522 return 1;
3525 /* Group all nodes belonging to STATE into several destinations.
3526 Then for all destinations, set the nodes belonging to the destination
3527 to DESTS_NODE[i] and set the characters accepted by the destination
3528 to DEST_CH[i]. This function return the number of destinations. */
3530 static int
3531 internal_function
3532 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3533 re_node_set *dests_node, bitset_t *dests_ch)
3535 reg_errcode_t err;
3536 int result;
3537 int i, j, k;
3538 int ndests; /* Number of the destinations from `state'. */
3539 bitset_t accepts; /* Characters a node can accept. */
3540 const re_node_set *cur_nodes = &state->nodes;
3541 bitset_empty (accepts);
3542 ndests = 0;
3544 /* For all the nodes belonging to `state', */
3545 for (i = 0; i < cur_nodes->nelem; ++i)
3547 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3548 re_token_type_t type = node->type;
3549 unsigned int constraint = node->constraint;
3551 /* Enumerate all single byte character this node can accept. */
3552 if (type == CHARACTER)
3553 bitset_set (accepts, node->opr.c);
3554 else if (type == SIMPLE_BRACKET)
3556 bitset_merge (accepts, node->opr.sbcset);
3558 else if (type == OP_PERIOD)
3560 #ifdef RE_ENABLE_I18N
3561 if (dfa->mb_cur_max > 1)
3562 bitset_merge (accepts, dfa->sb_char);
3563 else
3564 #endif
3565 bitset_set_all (accepts);
3566 if (!(dfa->syntax & RE_DOT_NEWLINE))
3567 bitset_clear (accepts, '\n');
3568 if (dfa->syntax & RE_DOT_NOT_NULL)
3569 bitset_clear (accepts, '\0');
3571 #ifdef RE_ENABLE_I18N
3572 else if (type == OP_UTF8_PERIOD)
3574 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3575 if (!(dfa->syntax & RE_DOT_NEWLINE))
3576 bitset_clear (accepts, '\n');
3577 if (dfa->syntax & RE_DOT_NOT_NULL)
3578 bitset_clear (accepts, '\0');
3580 #endif
3581 else
3582 continue;
3584 /* Check the `accepts' and sift the characters which are not
3585 match it the context. */
3586 if (constraint)
3588 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3590 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3591 bitset_empty (accepts);
3592 if (accepts_newline)
3593 bitset_set (accepts, NEWLINE_CHAR);
3594 else
3595 continue;
3597 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3599 bitset_empty (accepts);
3600 continue;
3603 if (constraint & NEXT_WORD_CONSTRAINT)
3605 bitset_word_t any_set = 0;
3606 if (type == CHARACTER && !node->word_char)
3608 bitset_empty (accepts);
3609 continue;
3611 #ifdef RE_ENABLE_I18N
3612 if (dfa->mb_cur_max > 1)
3613 for (j = 0; j < BITSET_WORDS; ++j)
3614 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3615 else
3616 #endif
3617 for (j = 0; j < BITSET_WORDS; ++j)
3618 any_set |= (accepts[j] &= dfa->word_char[j]);
3619 if (!any_set)
3620 continue;
3622 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3624 bitset_word_t any_set = 0;
3625 if (type == CHARACTER && node->word_char)
3627 bitset_empty (accepts);
3628 continue;
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa->mb_cur_max > 1)
3632 for (j = 0; j < BITSET_WORDS; ++j)
3633 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3634 else
3635 #endif
3636 for (j = 0; j < BITSET_WORDS; ++j)
3637 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3638 if (!any_set)
3639 continue;
3643 /* Then divide `accepts' into DFA states, or create a new
3644 state. Above, we make sure that accepts is not empty. */
3645 for (j = 0; j < ndests; ++j)
3647 bitset_t intersec; /* Intersection sets, see below. */
3648 bitset_t remains;
3649 /* Flags, see below. */
3650 bitset_word_t has_intersec, not_subset, not_consumed;
3652 /* Optimization, skip if this state doesn't accept the character. */
3653 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3654 continue;
3656 /* Enumerate the intersection set of this state and `accepts'. */
3657 has_intersec = 0;
3658 for (k = 0; k < BITSET_WORDS; ++k)
3659 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3660 /* And skip if the intersection set is empty. */
3661 if (!has_intersec)
3662 continue;
3664 /* Then check if this state is a subset of `accepts'. */
3665 not_subset = not_consumed = 0;
3666 for (k = 0; k < BITSET_WORDS; ++k)
3668 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3669 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3672 /* If this state isn't a subset of `accepts', create a
3673 new group state, which has the `remains'. */
3674 if (not_subset)
3676 bitset_copy (dests_ch[ndests], remains);
3677 bitset_copy (dests_ch[j], intersec);
3678 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3679 if (BE (err != REG_NOERROR, 0))
3680 goto error_return;
3681 ++ndests;
3684 /* Put the position in the current group. */
3685 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3686 if (BE (result < 0, 0))
3687 goto error_return;
3689 /* If all characters are consumed, go to next node. */
3690 if (!not_consumed)
3691 break;
3693 /* Some characters remain, create a new group. */
3694 if (j == ndests)
3696 bitset_copy (dests_ch[ndests], accepts);
3697 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3698 if (BE (err != REG_NOERROR, 0))
3699 goto error_return;
3700 ++ndests;
3701 bitset_empty (accepts);
3704 return ndests;
3705 error_return:
3706 for (j = 0; j < ndests; ++j)
3707 re_node_set_free (dests_node + j);
3708 return -1;
3711 #ifdef RE_ENABLE_I18N
3712 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3713 Return the number of the bytes the node accepts.
3714 STR_IDX is the current index of the input string.
3716 This function handles the nodes which can accept one character, or
3717 one collating element like '.', '[a-z]', opposite to the other nodes
3718 can only accept one byte. */
3720 static int
3721 internal_function
3722 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3723 const re_string_t *input, int str_idx)
3725 const re_token_t *node = dfa->nodes + node_idx;
3726 int char_len, elem_len;
3727 int i;
3729 if (BE (node->type == OP_UTF8_PERIOD, 0))
3731 unsigned char c = re_string_byte_at (input, str_idx), d;
3732 if (BE (c < 0xc2, 1))
3733 return 0;
3735 if (str_idx + 2 > input->len)
3736 return 0;
3738 d = re_string_byte_at (input, str_idx + 1);
3739 if (c < 0xe0)
3740 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3741 else if (c < 0xf0)
3743 char_len = 3;
3744 if (c == 0xe0 && d < 0xa0)
3745 return 0;
3747 else if (c < 0xf8)
3749 char_len = 4;
3750 if (c == 0xf0 && d < 0x90)
3751 return 0;
3753 else if (c < 0xfc)
3755 char_len = 5;
3756 if (c == 0xf8 && d < 0x88)
3757 return 0;
3759 else if (c < 0xfe)
3761 char_len = 6;
3762 if (c == 0xfc && d < 0x84)
3763 return 0;
3765 else
3766 return 0;
3768 if (str_idx + char_len > input->len)
3769 return 0;
3771 for (i = 1; i < char_len; ++i)
3773 d = re_string_byte_at (input, str_idx + i);
3774 if (d < 0x80 || d > 0xbf)
3775 return 0;
3777 return char_len;
3780 char_len = re_string_char_size_at (input, str_idx);
3781 if (node->type == OP_PERIOD)
3783 if (char_len <= 1)
3784 return 0;
3785 /* FIXME: I don't think this if is needed, as both '\n'
3786 and '\0' are char_len == 1. */
3787 /* '.' accepts any one character except the following two cases. */
3788 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3789 re_string_byte_at (input, str_idx) == '\n') ||
3790 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3791 re_string_byte_at (input, str_idx) == '\0'))
3792 return 0;
3793 return char_len;
3796 elem_len = re_string_elem_size_at (input, str_idx);
3797 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3798 return 0;
3800 if (node->type == COMPLEX_BRACKET)
3802 const re_charset_t *cset = node->opr.mbcset;
3803 # ifdef _LIBC
3804 const unsigned char *pin
3805 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3806 int j;
3807 uint32_t nrules;
3808 # endif /* _LIBC */
3809 int match_len = 0;
3810 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3811 ? re_string_wchar_at (input, str_idx) : 0);
3813 /* match with multibyte character? */
3814 for (i = 0; i < cset->nmbchars; ++i)
3815 if (wc == cset->mbchars[i])
3817 match_len = char_len;
3818 goto check_node_accept_bytes_match;
3820 /* match with character_class? */
3821 for (i = 0; i < cset->nchar_classes; ++i)
3823 wctype_t wt = cset->char_classes[i];
3824 if (__iswctype (wc, wt))
3826 match_len = char_len;
3827 goto check_node_accept_bytes_match;
3831 # ifdef _LIBC
3832 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3833 if (nrules != 0)
3835 unsigned int in_collseq = 0;
3836 const int32_t *table, *indirect;
3837 const unsigned char *weights, *extra;
3838 const char *collseqwc;
3839 /* This #include defines a local function! */
3840 # include <locale/weight.h>
3842 /* match with collating_symbol? */
3843 if (cset->ncoll_syms)
3844 extra = (const unsigned char *)
3845 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3846 for (i = 0; i < cset->ncoll_syms; ++i)
3848 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3849 /* Compare the length of input collating element and
3850 the length of current collating element. */
3851 if (*coll_sym != elem_len)
3852 continue;
3853 /* Compare each bytes. */
3854 for (j = 0; j < *coll_sym; j++)
3855 if (pin[j] != coll_sym[1 + j])
3856 break;
3857 if (j == *coll_sym)
3859 /* Match if every bytes is equal. */
3860 match_len = j;
3861 goto check_node_accept_bytes_match;
3865 if (cset->nranges)
3867 if (elem_len <= char_len)
3869 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3870 in_collseq = __collseq_table_lookup (collseqwc, wc);
3872 else
3873 in_collseq = find_collation_sequence_value (pin, elem_len);
3875 /* match with range expression? */
3876 for (i = 0; i < cset->nranges; ++i)
3877 if (cset->range_starts[i] <= in_collseq
3878 && in_collseq <= cset->range_ends[i])
3880 match_len = elem_len;
3881 goto check_node_accept_bytes_match;
3884 /* match with equivalence_class? */
3885 if (cset->nequiv_classes)
3887 const unsigned char *cp = pin;
3888 table = (const int32_t *)
3889 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3890 weights = (const unsigned char *)
3891 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3892 extra = (const unsigned char *)
3893 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3894 indirect = (const int32_t *)
3895 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3896 int32_t idx = findidx (&cp);
3897 if (idx > 0)
3898 for (i = 0; i < cset->nequiv_classes; ++i)
3900 int32_t equiv_class_idx = cset->equiv_classes[i];
3901 size_t weight_len = weights[idx & 0xffffff];
3902 if (weight_len == weights[equiv_class_idx & 0xffffff]
3903 && (idx >> 24) == (equiv_class_idx >> 24))
3905 int cnt = 0;
3907 idx &= 0xffffff;
3908 equiv_class_idx &= 0xffffff;
3910 while (cnt <= weight_len
3911 && (weights[equiv_class_idx + 1 + cnt]
3912 == weights[idx + 1 + cnt]))
3913 ++cnt;
3914 if (cnt > weight_len)
3916 match_len = elem_len;
3917 goto check_node_accept_bytes_match;
3923 else
3924 # endif /* _LIBC */
3926 /* match with range expression? */
3927 #if __GNUC__ >= 2
3928 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3929 #else
3930 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3931 cmp_buf[2] = wc;
3932 #endif
3933 for (i = 0; i < cset->nranges; ++i)
3935 cmp_buf[0] = cset->range_starts[i];
3936 cmp_buf[4] = cset->range_ends[i];
3937 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3938 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3940 match_len = char_len;
3941 goto check_node_accept_bytes_match;
3945 check_node_accept_bytes_match:
3946 if (!cset->non_match)
3947 return match_len;
3948 else
3950 if (match_len > 0)
3951 return 0;
3952 else
3953 return (elem_len > char_len) ? elem_len : char_len;
3956 return 0;
3959 # ifdef _LIBC
3960 static unsigned int
3961 internal_function
3962 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3964 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3965 if (nrules == 0)
3967 if (mbs_len == 1)
3969 /* No valid character. Match it as a single byte character. */
3970 const unsigned char *collseq = (const unsigned char *)
3971 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3972 return collseq[mbs[0]];
3974 return UINT_MAX;
3976 else
3978 int32_t idx;
3979 const unsigned char *extra = (const unsigned char *)
3980 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3981 int32_t extrasize = (const unsigned char *)
3982 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3984 for (idx = 0; idx < extrasize;)
3986 int mbs_cnt, found = 0;
3987 int32_t elem_mbs_len;
3988 /* Skip the name of collating element name. */
3989 idx = idx + extra[idx] + 1;
3990 elem_mbs_len = extra[idx++];
3991 if (mbs_len == elem_mbs_len)
3993 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3994 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3995 break;
3996 if (mbs_cnt == elem_mbs_len)
3997 /* Found the entry. */
3998 found = 1;
4000 /* Skip the byte sequence of the collating element. */
4001 idx += elem_mbs_len;
4002 /* Adjust for the alignment. */
4003 idx = (idx + 3) & ~3;
4004 /* Skip the collation sequence value. */
4005 idx += sizeof (uint32_t);
4006 /* Skip the wide char sequence of the collating element. */
4007 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4008 /* If we found the entry, return the sequence value. */
4009 if (found)
4010 return *(uint32_t *) (extra + idx);
4011 /* Skip the collation sequence value. */
4012 idx += sizeof (uint32_t);
4014 return UINT_MAX;
4017 # endif /* _LIBC */
4018 #endif /* RE_ENABLE_I18N */
4020 /* Check whether the node accepts the byte which is IDX-th
4021 byte of the INPUT. */
4023 static int
4024 internal_function
4025 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4026 int idx)
4028 unsigned char ch;
4029 ch = re_string_byte_at (&mctx->input, idx);
4030 switch (node->type)
4032 case CHARACTER:
4033 if (node->opr.c != ch)
4034 return 0;
4035 break;
4037 case SIMPLE_BRACKET:
4038 if (!bitset_contain (node->opr.sbcset, ch))
4039 return 0;
4040 break;
4042 #ifdef RE_ENABLE_I18N
4043 case OP_UTF8_PERIOD:
4044 if (ch >= 0x80)
4045 return 0;
4046 /* FALLTHROUGH */
4047 #endif
4048 case OP_PERIOD:
4049 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4050 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4051 return 0;
4052 break;
4054 default:
4055 return 0;
4058 if (node->constraint)
4060 /* The node has constraints. Check whether the current context
4061 satisfies the constraints. */
4062 unsigned int context = re_string_context_at (&mctx->input, idx,
4063 mctx->eflags);
4064 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4065 return 0;
4068 return 1;
4071 /* Extend the buffers, if the buffers have run out. */
4073 static reg_errcode_t
4074 internal_function __attribute_warn_unused_result__
4075 extend_buffers (re_match_context_t *mctx)
4077 reg_errcode_t ret;
4078 re_string_t *pstr = &mctx->input;
4080 /* Double the lengthes of the buffers. */
4081 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4082 if (BE (ret != REG_NOERROR, 0))
4083 return ret;
4085 if (mctx->state_log != NULL)
4087 /* And double the length of state_log. */
4088 /* XXX We have no indication of the size of this buffer. If this
4089 allocation fail we have no indication that the state_log array
4090 does not have the right size. */
4091 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4092 pstr->bufs_len + 1);
4093 if (BE (new_array == NULL, 0))
4094 return REG_ESPACE;
4095 mctx->state_log = new_array;
4098 /* Then reconstruct the buffers. */
4099 if (pstr->icase)
4101 #ifdef RE_ENABLE_I18N
4102 if (pstr->mb_cur_max > 1)
4104 ret = build_wcs_upper_buffer (pstr);
4105 if (BE (ret != REG_NOERROR, 0))
4106 return ret;
4108 else
4109 #endif /* RE_ENABLE_I18N */
4110 build_upper_buffer (pstr);
4112 else
4114 #ifdef RE_ENABLE_I18N
4115 if (pstr->mb_cur_max > 1)
4116 build_wcs_buffer (pstr);
4117 else
4118 #endif /* RE_ENABLE_I18N */
4120 if (pstr->trans != NULL)
4121 re_string_translate_buffer (pstr);
4124 return REG_NOERROR;
4128 /* Functions for matching context. */
4130 /* Initialize MCTX. */
4132 static reg_errcode_t
4133 internal_function __attribute_warn_unused_result__
4134 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4136 mctx->eflags = eflags;
4137 mctx->match_last = -1;
4138 if (n > 0)
4140 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4141 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4142 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4143 return REG_ESPACE;
4145 /* Already zero-ed by the caller.
4146 else
4147 mctx->bkref_ents = NULL;
4148 mctx->nbkref_ents = 0;
4149 mctx->nsub_tops = 0; */
4150 mctx->abkref_ents = n;
4151 mctx->max_mb_elem_len = 1;
4152 mctx->asub_tops = n;
4153 return REG_NOERROR;
4156 /* Clean the entries which depend on the current input in MCTX.
4157 This function must be invoked when the matcher changes the start index
4158 of the input, or changes the input string. */
4160 static void
4161 internal_function
4162 match_ctx_clean (re_match_context_t *mctx)
4164 int st_idx;
4165 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4167 int sl_idx;
4168 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4169 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4171 re_sub_match_last_t *last = top->lasts[sl_idx];
4172 re_free (last->path.array);
4173 re_free (last);
4175 re_free (top->lasts);
4176 if (top->path)
4178 re_free (top->path->array);
4179 re_free (top->path);
4181 free (top);
4184 mctx->nsub_tops = 0;
4185 mctx->nbkref_ents = 0;
4188 /* Free all the memory associated with MCTX. */
4190 static void
4191 internal_function
4192 match_ctx_free (re_match_context_t *mctx)
4194 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4195 match_ctx_clean (mctx);
4196 re_free (mctx->sub_tops);
4197 re_free (mctx->bkref_ents);
4200 /* Add a new backreference entry to MCTX.
4201 Note that we assume that caller never call this function with duplicate
4202 entry, and call with STR_IDX which isn't smaller than any existing entry.
4205 static reg_errcode_t
4206 internal_function __attribute_warn_unused_result__
4207 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4208 int to)
4210 if (mctx->nbkref_ents >= mctx->abkref_ents)
4212 struct re_backref_cache_entry* new_entry;
4213 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4214 mctx->abkref_ents * 2);
4215 if (BE (new_entry == NULL, 0))
4217 re_free (mctx->bkref_ents);
4218 return REG_ESPACE;
4220 mctx->bkref_ents = new_entry;
4221 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4222 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4223 mctx->abkref_ents *= 2;
4225 if (mctx->nbkref_ents > 0
4226 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4227 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4229 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4230 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4231 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4232 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4234 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4235 If bit N is clear, means that this entry won't epsilon-transition to
4236 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4237 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4238 such node.
4240 A backreference does not epsilon-transition unless it is empty, so set
4241 to all zeros if FROM != TO. */
4242 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4243 = (from == to ? ~0 : 0);
4245 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4246 if (mctx->max_mb_elem_len < to - from)
4247 mctx->max_mb_elem_len = to - from;
4248 return REG_NOERROR;
4251 /* Search for the first entry which has the same str_idx, or -1 if none is
4252 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4254 static int
4255 internal_function
4256 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4258 int left, right, mid, last;
4259 last = right = mctx->nbkref_ents;
4260 for (left = 0; left < right;)
4262 mid = (left + right) / 2;
4263 if (mctx->bkref_ents[mid].str_idx < str_idx)
4264 left = mid + 1;
4265 else
4266 right = mid;
4268 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4269 return left;
4270 else
4271 return -1;
4274 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4275 at STR_IDX. */
4277 static reg_errcode_t
4278 internal_function __attribute_warn_unused_result__
4279 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4281 #ifdef DEBUG
4282 assert (mctx->sub_tops != NULL);
4283 assert (mctx->asub_tops > 0);
4284 #endif
4285 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4287 int new_asub_tops = mctx->asub_tops * 2;
4288 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4289 re_sub_match_top_t *,
4290 new_asub_tops);
4291 if (BE (new_array == NULL, 0))
4292 return REG_ESPACE;
4293 mctx->sub_tops = new_array;
4294 mctx->asub_tops = new_asub_tops;
4296 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4297 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4298 return REG_ESPACE;
4299 mctx->sub_tops[mctx->nsub_tops]->node = node;
4300 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4301 return REG_NOERROR;
4304 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4305 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4307 static re_sub_match_last_t *
4308 internal_function
4309 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4311 re_sub_match_last_t *new_entry;
4312 if (BE (subtop->nlasts == subtop->alasts, 0))
4314 int new_alasts = 2 * subtop->alasts + 1;
4315 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4316 re_sub_match_last_t *,
4317 new_alasts);
4318 if (BE (new_array == NULL, 0))
4319 return NULL;
4320 subtop->lasts = new_array;
4321 subtop->alasts = new_alasts;
4323 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4324 if (BE (new_entry != NULL, 1))
4326 subtop->lasts[subtop->nlasts] = new_entry;
4327 new_entry->node = node;
4328 new_entry->str_idx = str_idx;
4329 ++subtop->nlasts;
4331 return new_entry;
4334 static void
4335 internal_function
4336 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4337 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4339 sctx->sifted_states = sifted_sts;
4340 sctx->limited_states = limited_sts;
4341 sctx->last_node = last_node;
4342 sctx->last_str_idx = last_str_idx;
4343 re_node_set_init_empty (&sctx->limits);