Fix typo in locale source file.
[glibc.git] / posix / regexec.c
blobf87701672b12b99ff4a478e5cc5b126f13b78545
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
77 #ifdef RE_ENABLE_I18N
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99 static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142 #endif
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188 # ifdef _LIBC
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192 # endif /* _LIBC */
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198 static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
207 string STRING.
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg, string, nmatch, pmatch, eflags)
222 const regex_t *__restrict preg;
223 const char *__restrict string;
224 size_t nmatch;
225 regmatch_t pmatch[];
226 int eflags;
228 reg_errcode_t err;
229 int start, length;
230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
233 return REG_BADPAT;
235 if (eflags & REG_STARTEND)
237 start = pmatch[0].rm_so;
238 length = pmatch[0].rm_eo;
240 else
242 start = 0;
243 length = strlen (string);
246 __libc_lock_lock (dfa->lock);
247 if (preg->no_sub)
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, 0, NULL, eflags);
250 else
251 err = re_search_internal (preg, string, length, start, length - start,
252 length, nmatch, pmatch, eflags);
253 __libc_lock_unlock (dfa->lock);
254 return err != REG_NOERROR;
257 #ifdef _LIBC
258 # include <shlib-compat.h>
259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec) __compat_regexec;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t *__restrict preg,
267 const char *__restrict string, size_t nmatch,
268 regmatch_t pmatch[], int eflags)
270 return regexec (preg, string, nmatch, pmatch,
271 eflags & (REG_NOTBOL | REG_NOTEOL));
273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
274 # endif
275 #endif
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
291 way as re_match().)
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
295 concerned.
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
300 strings.)
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp, string, length, start, regs)
308 struct re_pattern_buffer *bufp;
309 const char *string;
310 int length, start;
311 struct re_registers *regs;
313 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
315 #ifdef _LIBC
316 weak_alias (__re_match, re_match)
317 #endif
320 re_search (bufp, string, length, start, range, regs)
321 struct re_pattern_buffer *bufp;
322 const char *string;
323 int length, start, range;
324 struct re_registers *regs;
326 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
328 #ifdef _LIBC
329 weak_alias (__re_search, re_search)
330 #endif
333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
334 struct re_pattern_buffer *bufp;
335 const char *string1, *string2;
336 int length1, length2, start, stop;
337 struct re_registers *regs;
339 return re_search_2_stub (bufp, string1, length1, string2, length2,
340 start, 0, regs, stop, 1);
342 #ifdef _LIBC
343 weak_alias (__re_match_2, re_match_2)
344 #endif
347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
348 struct re_pattern_buffer *bufp;
349 const char *string1, *string2;
350 int length1, length2, start, range, stop;
351 struct re_registers *regs;
353 return re_search_2_stub (bufp, string1, length1, string2, length2,
354 start, range, regs, stop, 0);
356 #ifdef _LIBC
357 weak_alias (__re_search_2, re_search_2)
358 #endif
360 static int
361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
362 stop, ret_len)
363 struct re_pattern_buffer *bufp;
364 const char *string1, *string2;
365 int length1, length2, start, range, stop, ret_len;
366 struct re_registers *regs;
368 const char *str;
369 int rval;
370 int len = length1 + length2;
371 char *s = NULL;
373 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
374 return -2;
376 /* Concatenate the strings. */
377 if (length2 > 0)
378 if (length1 > 0)
380 s = re_malloc (char, len);
382 if (BE (s == NULL, 0))
383 return -2;
384 #ifdef _LIBC
385 memcpy (__mempcpy (s, string1, length1), string2, length2);
386 #else
387 memcpy (s, string1, length1);
388 memcpy (s + length1, string2, length2);
389 #endif
390 str = s;
392 else
393 str = string2;
394 else
395 str = string1;
397 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
398 re_free (s);
399 return rval;
402 /* The parameters have the same meaning as those of re_search.
403 Additional parameters:
404 If RET_LEN is nonzero the length of the match is returned (re_match style);
405 otherwise the position of the match is returned. */
407 static int
408 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
409 struct re_pattern_buffer *bufp;
410 const char *string;
411 int length, start, range, stop, ret_len;
412 struct re_registers *regs;
414 reg_errcode_t result;
415 regmatch_t *pmatch;
416 int nregs, rval;
417 int eflags = 0;
418 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
420 /* Check for out-of-range. */
421 if (BE (start < 0 || start > length, 0))
422 return -1;
423 if (BE (start + range > length, 0))
424 range = length - start;
425 else if (BE (start + range < 0, 0))
426 range = -start;
428 __libc_lock_lock (dfa->lock);
430 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
431 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
433 /* Compile fastmap if we haven't yet. */
434 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
435 re_compile_fastmap (bufp);
437 if (BE (bufp->no_sub, 0))
438 regs = NULL;
440 /* We need at least 1 register. */
441 if (regs == NULL)
442 nregs = 1;
443 else if (BE (bufp->regs_allocated == REGS_FIXED &&
444 regs->num_regs < bufp->re_nsub + 1, 0))
446 nregs = regs->num_regs;
447 if (BE (nregs < 1, 0))
449 /* Nothing can be copied to regs. */
450 regs = NULL;
451 nregs = 1;
454 else
455 nregs = bufp->re_nsub + 1;
456 pmatch = re_malloc (regmatch_t, nregs);
457 if (BE (pmatch == NULL, 0))
459 rval = -2;
460 goto out;
463 result = re_search_internal (bufp, string, length, start, range, stop,
464 nregs, pmatch, eflags);
466 rval = 0;
468 /* I hope we needn't fill ther regs with -1's when no match was found. */
469 if (result != REG_NOERROR)
470 rval = -1;
471 else if (regs != NULL)
473 /* If caller wants register contents data back, copy them. */
474 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
475 bufp->regs_allocated);
476 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
477 rval = -2;
480 if (BE (rval == 0, 1))
482 if (ret_len)
484 assert (pmatch[0].rm_so == start);
485 rval = pmatch[0].rm_eo - start;
487 else
488 rval = pmatch[0].rm_so;
490 re_free (pmatch);
491 out:
492 __libc_lock_unlock (dfa->lock);
493 return rval;
496 static unsigned
497 re_copy_regs (regs, pmatch, nregs, regs_allocated)
498 struct re_registers *regs;
499 regmatch_t *pmatch;
500 int nregs, regs_allocated;
502 int rval = REGS_REALLOCATE;
503 int i;
504 int need_regs = nregs + 1;
505 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
506 uses. */
508 /* Have the register data arrays been allocated? */
509 if (regs_allocated == REGS_UNALLOCATED)
510 { /* No. So allocate them with malloc. */
511 regs->start = re_malloc (regoff_t, need_regs);
512 if (BE (regs->start == NULL, 0))
513 return REGS_UNALLOCATED;
514 regs->end = re_malloc (regoff_t, need_regs);
515 if (BE (regs->end == NULL, 0))
517 re_free (regs->start);
518 return REGS_UNALLOCATED;
520 regs->num_regs = need_regs;
522 else if (regs_allocated == REGS_REALLOCATE)
523 { /* Yes. If we need more elements than were already
524 allocated, reallocate them. If we need fewer, just
525 leave it alone. */
526 if (BE (need_regs > regs->num_regs, 0))
528 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
529 regoff_t *new_end;
530 if (BE (new_start == NULL, 0))
531 return REGS_UNALLOCATED;
532 new_end = re_realloc (regs->end, regoff_t, need_regs);
533 if (BE (new_end == NULL, 0))
535 re_free (new_start);
536 return REGS_UNALLOCATED;
538 regs->start = new_start;
539 regs->end = new_end;
540 regs->num_regs = need_regs;
543 else
545 assert (regs_allocated == REGS_FIXED);
546 /* This function may not be called with REGS_FIXED and nregs too big. */
547 assert (regs->num_regs >= nregs);
548 rval = REGS_FIXED;
551 /* Copy the regs. */
552 for (i = 0; i < nregs; ++i)
554 regs->start[i] = pmatch[i].rm_so;
555 regs->end[i] = pmatch[i].rm_eo;
557 for ( ; i < regs->num_regs; ++i)
558 regs->start[i] = regs->end[i] = -1;
560 return rval;
563 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
564 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
565 this memory for recording register information. STARTS and ENDS
566 must be allocated using the malloc library routine, and must each
567 be at least NUM_REGS * sizeof (regoff_t) bytes long.
569 If NUM_REGS == 0, then subsequent matches should allocate their own
570 register data.
572 Unless this function is called, the first search or match using
573 PATTERN_BUFFER will allocate its own register data, without
574 freeing the old data. */
576 void
577 re_set_registers (bufp, regs, num_regs, starts, ends)
578 struct re_pattern_buffer *bufp;
579 struct re_registers *regs;
580 unsigned num_regs;
581 regoff_t *starts, *ends;
583 if (num_regs)
585 bufp->regs_allocated = REGS_REALLOCATE;
586 regs->num_regs = num_regs;
587 regs->start = starts;
588 regs->end = ends;
590 else
592 bufp->regs_allocated = REGS_UNALLOCATED;
593 regs->num_regs = 0;
594 regs->start = regs->end = (regoff_t *) 0;
597 #ifdef _LIBC
598 weak_alias (__re_set_registers, re_set_registers)
599 #endif
601 /* Entry points compatible with 4.2 BSD regex library. We don't define
602 them unless specifically requested. */
604 #if defined _REGEX_RE_COMP || defined _LIBC
606 # ifdef _LIBC
607 weak_function
608 # endif
609 re_exec (s)
610 const char *s;
612 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
614 #endif /* _REGEX_RE_COMP */
616 /* Internal entry point. */
618 /* Searches for a compiled pattern PREG in the string STRING, whose
619 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
620 mingings with regexec. START, and RANGE have the same meanings
621 with re_search.
622 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
623 otherwise return the error code.
624 Note: We assume front end functions already check ranges.
625 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
627 static reg_errcode_t
628 __attribute_warn_unused_result__
629 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
630 eflags)
631 const regex_t *preg;
632 const char *string;
633 int length, start, range, stop, eflags;
634 size_t nmatch;
635 regmatch_t pmatch[];
637 reg_errcode_t err;
638 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
639 int left_lim, right_lim, incr;
640 int fl_longest_match, match_first, match_kind, match_last = -1;
641 int extra_nmatch;
642 int sb, ch;
643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644 re_match_context_t mctx = { .dfa = dfa };
645 #else
646 re_match_context_t mctx;
647 #endif
648 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
649 && range && !preg->can_be_null) ? preg->fastmap : NULL;
650 RE_TRANSLATE_TYPE t = preg->translate;
652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
653 memset (&mctx, '\0', sizeof (re_match_context_t));
654 mctx.dfa = dfa;
655 #endif
657 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
658 nmatch -= extra_nmatch;
660 /* Check if the DFA haven't been compiled. */
661 if (BE (preg->used == 0 || dfa->init_state == NULL
662 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
663 || dfa->init_state_begbuf == NULL, 0))
664 return REG_NOMATCH;
666 #ifdef DEBUG
667 /* We assume front-end functions already check them. */
668 assert (start + range >= 0 && start + range <= length);
669 #endif
671 /* If initial states with non-begbuf contexts have no elements,
672 the regex must be anchored. If preg->newline_anchor is set,
673 we'll never use init_state_nl, so do not check it. */
674 if (dfa->init_state->nodes.nelem == 0
675 && dfa->init_state_word->nodes.nelem == 0
676 && (dfa->init_state_nl->nodes.nelem == 0
677 || !preg->newline_anchor))
679 if (start != 0 && start + range != 0)
680 return REG_NOMATCH;
681 start = range = 0;
684 /* We must check the longest matching, if nmatch > 0. */
685 fl_longest_match = (nmatch != 0 || dfa->nbackref);
687 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
688 preg->translate, preg->syntax & RE_ICASE, dfa);
689 if (BE (err != REG_NOERROR, 0))
690 goto free_return;
691 mctx.input.stop = stop;
692 mctx.input.raw_stop = stop;
693 mctx.input.newline_anchor = preg->newline_anchor;
695 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
696 if (BE (err != REG_NOERROR, 0))
697 goto free_return;
699 /* We will log all the DFA states through which the dfa pass,
700 if nmatch > 1, or this dfa has "multibyte node", which is a
701 back-reference or a node which can accept multibyte character or
702 multi character collating element. */
703 if (nmatch > 1 || dfa->has_mb_node)
705 /* Avoid overflow. */
706 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
708 err = REG_ESPACE;
709 goto free_return;
712 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
713 if (BE (mctx.state_log == NULL, 0))
715 err = REG_ESPACE;
716 goto free_return;
719 else
720 mctx.state_log = NULL;
722 match_first = start;
723 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
724 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
726 /* Check incrementally whether of not the input string match. */
727 incr = (range < 0) ? -1 : 1;
728 left_lim = (range < 0) ? start + range : start;
729 right_lim = (range < 0) ? start : start + range;
730 sb = dfa->mb_cur_max == 1;
731 match_kind =
732 (fastmap
733 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
734 | (range >= 0 ? 2 : 0)
735 | (t != NULL ? 1 : 0))
736 : 8);
738 for (;; match_first += incr)
740 err = REG_NOMATCH;
741 if (match_first < left_lim || right_lim < match_first)
742 goto free_return;
744 /* Advance as rapidly as possible through the string, until we
745 find a plausible place to start matching. This may be done
746 with varying efficiency, so there are various possibilities:
747 only the most common of them are specialized, in order to
748 save on code size. We use a switch statement for speed. */
749 switch (match_kind)
751 case 8:
752 /* No fastmap. */
753 break;
755 case 7:
756 /* Fastmap with single-byte translation, match forward. */
757 while (BE (match_first < right_lim, 1)
758 && !fastmap[t[(unsigned char) string[match_first]]])
759 ++match_first;
760 goto forward_match_found_start_or_reached_end;
762 case 6:
763 /* Fastmap without translation, match forward. */
764 while (BE (match_first < right_lim, 1)
765 && !fastmap[(unsigned char) string[match_first]])
766 ++match_first;
768 forward_match_found_start_or_reached_end:
769 if (BE (match_first == right_lim, 0))
771 ch = match_first >= length
772 ? 0 : (unsigned char) string[match_first];
773 if (!fastmap[t ? t[ch] : ch])
774 goto free_return;
776 break;
778 case 4:
779 case 5:
780 /* Fastmap without multi-byte translation, match backwards. */
781 while (match_first >= left_lim)
783 ch = match_first >= length
784 ? 0 : (unsigned char) string[match_first];
785 if (fastmap[t ? t[ch] : ch])
786 break;
787 --match_first;
789 if (match_first < left_lim)
790 goto free_return;
791 break;
793 default:
794 /* In this case, we can't determine easily the current byte,
795 since it might be a component byte of a multibyte
796 character. Then we use the constructed buffer instead. */
797 for (;;)
799 /* If MATCH_FIRST is out of the valid range, reconstruct the
800 buffers. */
801 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
802 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
804 err = re_string_reconstruct (&mctx.input, match_first,
805 eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
809 offset = match_first - mctx.input.raw_mbs_idx;
811 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
812 Note that MATCH_FIRST must not be smaller than 0. */
813 ch = (match_first >= length
814 ? 0 : re_string_byte_at (&mctx.input, offset));
815 if (fastmap[ch])
816 break;
817 match_first += incr;
818 if (match_first < left_lim || match_first > right_lim)
820 err = REG_NOMATCH;
821 goto free_return;
824 break;
827 /* Reconstruct the buffers so that the matcher can assume that
828 the matching starts from the beginning of the buffer. */
829 err = re_string_reconstruct (&mctx.input, match_first, eflags);
830 if (BE (err != REG_NOERROR, 0))
831 goto free_return;
833 #ifdef RE_ENABLE_I18N
834 /* Don't consider this char as a possible match start if it part,
835 yet isn't the head, of a multibyte character. */
836 if (!sb && !re_string_first_byte (&mctx.input, 0))
837 continue;
838 #endif
840 /* It seems to be appropriate one, then use the matcher. */
841 /* We assume that the matching starts from 0. */
842 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
843 match_last = check_matching (&mctx, fl_longest_match,
844 range >= 0 ? &match_first : NULL);
845 if (match_last != -1)
847 if (BE (match_last == -2, 0))
849 err = REG_ESPACE;
850 goto free_return;
852 else
854 mctx.match_last = match_last;
855 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
857 re_dfastate_t *pstate = mctx.state_log[match_last];
858 mctx.last_node = check_halt_state_context (&mctx, pstate,
859 match_last);
861 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
862 || dfa->nbackref)
864 err = prune_impossible_nodes (&mctx);
865 if (err == REG_NOERROR)
866 break;
867 if (BE (err != REG_NOMATCH, 0))
868 goto free_return;
869 match_last = -1;
871 else
872 break; /* We found a match. */
876 match_ctx_clean (&mctx);
879 #ifdef DEBUG
880 assert (match_last != -1);
881 assert (err == REG_NOERROR);
882 #endif
884 /* Set pmatch[] if we need. */
885 if (nmatch > 0)
887 int reg_idx;
889 /* Initialize registers. */
890 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
891 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
893 /* Set the points where matching start/end. */
894 pmatch[0].rm_so = 0;
895 pmatch[0].rm_eo = mctx.match_last;
897 if (!preg->no_sub && nmatch > 1)
899 err = set_regs (preg, &mctx, nmatch, pmatch,
900 dfa->has_plural_match && dfa->nbackref > 0);
901 if (BE (err != REG_NOERROR, 0))
902 goto free_return;
905 /* At last, add the offset to the each registers, since we slided
906 the buffers so that we could assume that the matching starts
907 from 0. */
908 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
909 if (pmatch[reg_idx].rm_so != -1)
911 #ifdef RE_ENABLE_I18N
912 if (BE (mctx.input.offsets_needed != 0, 0))
914 pmatch[reg_idx].rm_so =
915 (pmatch[reg_idx].rm_so == mctx.input.valid_len
916 ? mctx.input.valid_raw_len
917 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
918 pmatch[reg_idx].rm_eo =
919 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
920 ? mctx.input.valid_raw_len
921 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
923 #else
924 assert (mctx.input.offsets_needed == 0);
925 #endif
926 pmatch[reg_idx].rm_so += match_first;
927 pmatch[reg_idx].rm_eo += match_first;
929 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
931 pmatch[nmatch + reg_idx].rm_so = -1;
932 pmatch[nmatch + reg_idx].rm_eo = -1;
935 if (dfa->subexp_map)
936 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
937 if (dfa->subexp_map[reg_idx] != reg_idx)
939 pmatch[reg_idx + 1].rm_so
940 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
941 pmatch[reg_idx + 1].rm_eo
942 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
946 free_return:
947 re_free (mctx.state_log);
948 if (dfa->nbackref)
949 match_ctx_free (&mctx);
950 re_string_destruct (&mctx.input);
951 return err;
954 static reg_errcode_t
955 __attribute_warn_unused_result__
956 prune_impossible_nodes (mctx)
957 re_match_context_t *mctx;
959 const re_dfa_t *const dfa = mctx->dfa;
960 int halt_node, match_last;
961 reg_errcode_t ret;
962 re_dfastate_t **sifted_states;
963 re_dfastate_t **lim_states = NULL;
964 re_sift_context_t sctx;
965 #ifdef DEBUG
966 assert (mctx->state_log != NULL);
967 #endif
968 match_last = mctx->match_last;
969 halt_node = mctx->last_node;
971 /* Avoid overflow. */
972 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
973 return REG_ESPACE;
975 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
976 if (BE (sifted_states == NULL, 0))
978 ret = REG_ESPACE;
979 goto free_return;
981 if (dfa->nbackref)
983 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
984 if (BE (lim_states == NULL, 0))
986 ret = REG_ESPACE;
987 goto free_return;
989 while (1)
991 memset (lim_states, '\0',
992 sizeof (re_dfastate_t *) * (match_last + 1));
993 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
994 match_last);
995 ret = sift_states_backward (mctx, &sctx);
996 re_node_set_free (&sctx.limits);
997 if (BE (ret != REG_NOERROR, 0))
998 goto free_return;
999 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1000 break;
1003 --match_last;
1004 if (match_last < 0)
1006 ret = REG_NOMATCH;
1007 goto free_return;
1009 } while (mctx->state_log[match_last] == NULL
1010 || !mctx->state_log[match_last]->halt);
1011 halt_node = check_halt_state_context (mctx,
1012 mctx->state_log[match_last],
1013 match_last);
1015 ret = merge_state_array (dfa, sifted_states, lim_states,
1016 match_last + 1);
1017 re_free (lim_states);
1018 lim_states = NULL;
1019 if (BE (ret != REG_NOERROR, 0))
1020 goto free_return;
1022 else
1024 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1025 ret = sift_states_backward (mctx, &sctx);
1026 re_node_set_free (&sctx.limits);
1027 if (BE (ret != REG_NOERROR, 0))
1028 goto free_return;
1029 if (sifted_states[0] == NULL)
1031 ret = REG_NOMATCH;
1032 goto free_return;
1035 re_free (mctx->state_log);
1036 mctx->state_log = sifted_states;
1037 sifted_states = NULL;
1038 mctx->last_node = halt_node;
1039 mctx->match_last = match_last;
1040 ret = REG_NOERROR;
1041 free_return:
1042 re_free (sifted_states);
1043 re_free (lim_states);
1044 return ret;
1047 /* Acquire an initial state and return it.
1048 We must select appropriate initial state depending on the context,
1049 since initial states may have constraints like "\<", "^", etc.. */
1051 static inline re_dfastate_t *
1052 __attribute ((always_inline)) internal_function
1053 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1054 int idx)
1056 const re_dfa_t *const dfa = mctx->dfa;
1057 if (dfa->init_state->has_constraint)
1059 unsigned int context;
1060 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1061 if (IS_WORD_CONTEXT (context))
1062 return dfa->init_state_word;
1063 else if (IS_ORDINARY_CONTEXT (context))
1064 return dfa->init_state;
1065 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1066 return dfa->init_state_begbuf;
1067 else if (IS_NEWLINE_CONTEXT (context))
1068 return dfa->init_state_nl;
1069 else if (IS_BEGBUF_CONTEXT (context))
1071 /* It is relatively rare case, then calculate on demand. */
1072 return re_acquire_state_context (err, dfa,
1073 dfa->init_state->entrance_nodes,
1074 context);
1076 else
1077 /* Must not happen? */
1078 return dfa->init_state;
1080 else
1081 return dfa->init_state;
1084 /* Check whether the regular expression match input string INPUT or not,
1085 and return the index where the matching end, return -1 if not match,
1086 or return -2 in case of an error.
1087 FL_LONGEST_MATCH means we want the POSIX longest matching.
1088 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089 next place where we may want to try matching.
1090 Note that the matcher assume that the maching starts from the current
1091 index of the buffer. */
1093 static int
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t *mctx, int fl_longest_match,
1096 int *p_match_first)
1098 const re_dfa_t *const dfa = mctx->dfa;
1099 reg_errcode_t err;
1100 int match = 0;
1101 int match_last = -1;
1102 int cur_str_idx = re_string_cur_idx (&mctx->input);
1103 re_dfastate_t *cur_state;
1104 int at_init_state = p_match_first != NULL;
1105 int next_start_idx = cur_str_idx;
1107 err = REG_NOERROR;
1108 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1109 /* An initial state must not be NULL (invalid). */
1110 if (BE (cur_state == NULL, 0))
1112 assert (err == REG_ESPACE);
1113 return -2;
1116 if (mctx->state_log != NULL)
1118 mctx->state_log[cur_str_idx] = cur_state;
1120 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121 later. E.g. Processing back references. */
1122 if (BE (dfa->nbackref, 0))
1124 at_init_state = 0;
1125 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1126 if (BE (err != REG_NOERROR, 0))
1127 return err;
1129 if (cur_state->has_backref)
1131 err = transit_state_bkref (mctx, &cur_state->nodes);
1132 if (BE (err != REG_NOERROR, 0))
1133 return err;
1138 /* If the RE accepts NULL string. */
1139 if (BE (cur_state->halt, 0))
1141 if (!cur_state->has_constraint
1142 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1144 if (!fl_longest_match)
1145 return cur_str_idx;
1146 else
1148 match_last = cur_str_idx;
1149 match = 1;
1154 while (!re_string_eoi (&mctx->input))
1156 re_dfastate_t *old_state = cur_state;
1157 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1159 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1160 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1161 && mctx->input.valid_len < mctx->input.len))
1163 err = extend_buffers (mctx);
1164 if (BE (err != REG_NOERROR, 0))
1166 assert (err == REG_ESPACE);
1167 return -2;
1171 cur_state = transit_state (&err, mctx, cur_state);
1172 if (mctx->state_log != NULL)
1173 cur_state = merge_state_with_log (&err, mctx, cur_state);
1175 if (cur_state == NULL)
1177 /* Reached the invalid state or an error. Try to recover a valid
1178 state using the state log, if available and if we have not
1179 already found a valid (even if not the longest) match. */
1180 if (BE (err != REG_NOERROR, 0))
1181 return -2;
1183 if (mctx->state_log == NULL
1184 || (match && !fl_longest_match)
1185 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1186 break;
1189 if (BE (at_init_state, 0))
1191 if (old_state == cur_state)
1192 next_start_idx = next_char_idx;
1193 else
1194 at_init_state = 0;
1197 if (cur_state->halt)
1199 /* Reached a halt state.
1200 Check the halt state can satisfy the current context. */
1201 if (!cur_state->has_constraint
1202 || check_halt_state_context (mctx, cur_state,
1203 re_string_cur_idx (&mctx->input)))
1205 /* We found an appropriate halt state. */
1206 match_last = re_string_cur_idx (&mctx->input);
1207 match = 1;
1209 /* We found a match, do not modify match_first below. */
1210 p_match_first = NULL;
1211 if (!fl_longest_match)
1212 break;
1217 if (p_match_first)
1218 *p_match_first += next_start_idx;
1220 return match_last;
1223 /* Check NODE match the current context. */
1225 static int
1226 internal_function
1227 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1229 re_token_type_t type = dfa->nodes[node].type;
1230 unsigned int constraint = dfa->nodes[node].constraint;
1231 if (type != END_OF_RE)
1232 return 0;
1233 if (!constraint)
1234 return 1;
1235 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1236 return 0;
1237 return 1;
1240 /* Check the halt state STATE match the current context.
1241 Return 0 if not match, if the node, STATE has, is a halt node and
1242 match the context, return the node. */
1244 static int
1245 internal_function
1246 check_halt_state_context (const re_match_context_t *mctx,
1247 const re_dfastate_t *state, int idx)
1249 int i;
1250 unsigned int context;
1251 #ifdef DEBUG
1252 assert (state->halt);
1253 #endif
1254 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1255 for (i = 0; i < state->nodes.nelem; ++i)
1256 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1257 return state->nodes.elems[i];
1258 return 0;
1261 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1262 corresponding to the DFA).
1263 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1264 of errors. */
1266 static int
1267 internal_function
1268 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1269 int *pidx, int node, re_node_set *eps_via_nodes,
1270 struct re_fail_stack_t *fs)
1272 const re_dfa_t *const dfa = mctx->dfa;
1273 int i, err;
1274 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1276 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1277 re_node_set *edests = &dfa->edests[node];
1278 int dest_node;
1279 err = re_node_set_insert (eps_via_nodes, node);
1280 if (BE (err < 0, 0))
1281 return -2;
1282 /* Pick up a valid destination, or return -1 if none is found. */
1283 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1285 int candidate = edests->elems[i];
1286 if (!re_node_set_contains (cur_nodes, candidate))
1287 continue;
1288 if (dest_node == -1)
1289 dest_node = candidate;
1291 else
1293 /* In order to avoid infinite loop like "(a*)*", return the second
1294 epsilon-transition if the first was already considered. */
1295 if (re_node_set_contains (eps_via_nodes, dest_node))
1296 return candidate;
1298 /* Otherwise, push the second epsilon-transition on the fail stack. */
1299 else if (fs != NULL
1300 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1301 eps_via_nodes))
1302 return -2;
1304 /* We know we are going to exit. */
1305 break;
1308 return dest_node;
1310 else
1312 int naccepted = 0;
1313 re_token_type_t type = dfa->nodes[node].type;
1315 #ifdef RE_ENABLE_I18N
1316 if (dfa->nodes[node].accept_mb)
1317 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1318 else
1319 #endif /* RE_ENABLE_I18N */
1320 if (type == OP_BACK_REF)
1322 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1323 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1324 if (fs != NULL)
1326 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1327 return -1;
1328 else if (naccepted)
1330 char *buf = (char *) re_string_get_buffer (&mctx->input);
1331 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1332 naccepted) != 0)
1333 return -1;
1337 if (naccepted == 0)
1339 int dest_node;
1340 err = re_node_set_insert (eps_via_nodes, node);
1341 if (BE (err < 0, 0))
1342 return -2;
1343 dest_node = dfa->edests[node].elems[0];
1344 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1345 dest_node))
1346 return dest_node;
1350 if (naccepted != 0
1351 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1353 int dest_node = dfa->nexts[node];
1354 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1355 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1356 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1357 dest_node)))
1358 return -1;
1359 re_node_set_empty (eps_via_nodes);
1360 return dest_node;
1363 return -1;
1366 static reg_errcode_t
1367 internal_function __attribute_warn_unused_result__
1368 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1369 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1371 reg_errcode_t err;
1372 int num = fs->num++;
1373 if (fs->num == fs->alloc)
1375 struct re_fail_stack_ent_t *new_array;
1376 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1377 * fs->alloc * 2));
1378 if (new_array == NULL)
1379 return REG_ESPACE;
1380 fs->alloc *= 2;
1381 fs->stack = new_array;
1383 fs->stack[num].idx = str_idx;
1384 fs->stack[num].node = dest_node;
1385 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1386 if (fs->stack[num].regs == NULL)
1387 return REG_ESPACE;
1388 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1389 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1390 return err;
1393 static int
1394 internal_function
1395 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1396 regmatch_t *regs, re_node_set *eps_via_nodes)
1398 int num = --fs->num;
1399 assert (num >= 0);
1400 *pidx = fs->stack[num].idx;
1401 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1402 re_node_set_free (eps_via_nodes);
1403 re_free (fs->stack[num].regs);
1404 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1405 return fs->stack[num].node;
1408 /* Set the positions where the subexpressions are starts/ends to registers
1409 PMATCH.
1410 Note: We assume that pmatch[0] is already set, and
1411 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1413 static reg_errcode_t
1414 internal_function __attribute_warn_unused_result__
1415 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1416 regmatch_t *pmatch, int fl_backtrack)
1418 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1419 int idx, cur_node;
1420 re_node_set eps_via_nodes;
1421 struct re_fail_stack_t *fs;
1422 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1423 regmatch_t *prev_idx_match;
1424 int prev_idx_match_malloced = 0;
1426 #ifdef DEBUG
1427 assert (nmatch > 1);
1428 assert (mctx->state_log != NULL);
1429 #endif
1430 if (fl_backtrack)
1432 fs = &fs_body;
1433 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1434 if (fs->stack == NULL)
1435 return REG_ESPACE;
1437 else
1438 fs = NULL;
1440 cur_node = dfa->init_node;
1441 re_node_set_init_empty (&eps_via_nodes);
1443 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1444 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1445 else
1447 prev_idx_match = re_malloc (regmatch_t, nmatch);
1448 if (prev_idx_match == NULL)
1450 free_fail_stack_return (fs);
1451 return REG_ESPACE;
1453 prev_idx_match_malloced = 1;
1455 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1457 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1459 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1461 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1463 int reg_idx;
1464 if (fs)
1466 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1467 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1468 break;
1469 if (reg_idx == nmatch)
1471 re_node_set_free (&eps_via_nodes);
1472 if (prev_idx_match_malloced)
1473 re_free (prev_idx_match);
1474 return free_fail_stack_return (fs);
1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477 &eps_via_nodes);
1479 else
1481 re_node_set_free (&eps_via_nodes);
1482 if (prev_idx_match_malloced)
1483 re_free (prev_idx_match);
1484 return REG_NOERROR;
1488 /* Proceed to next node. */
1489 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1490 &eps_via_nodes, fs);
1492 if (BE (cur_node < 0, 0))
1494 if (BE (cur_node == -2, 0))
1496 re_node_set_free (&eps_via_nodes);
1497 if (prev_idx_match_malloced)
1498 re_free (prev_idx_match);
1499 free_fail_stack_return (fs);
1500 return REG_ESPACE;
1502 if (fs)
1503 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1504 &eps_via_nodes);
1505 else
1507 re_node_set_free (&eps_via_nodes);
1508 if (prev_idx_match_malloced)
1509 re_free (prev_idx_match);
1510 return REG_NOMATCH;
1514 re_node_set_free (&eps_via_nodes);
1515 if (prev_idx_match_malloced)
1516 re_free (prev_idx_match);
1517 return free_fail_stack_return (fs);
1520 static reg_errcode_t
1521 internal_function
1522 free_fail_stack_return (struct re_fail_stack_t *fs)
1524 if (fs)
1526 int fs_idx;
1527 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1529 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1530 re_free (fs->stack[fs_idx].regs);
1532 re_free (fs->stack);
1534 return REG_NOERROR;
1537 static void
1538 internal_function
1539 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1540 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1542 int type = dfa->nodes[cur_node].type;
1543 if (type == OP_OPEN_SUBEXP)
1545 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1547 /* We are at the first node of this sub expression. */
1548 if (reg_num < nmatch)
1550 pmatch[reg_num].rm_so = cur_idx;
1551 pmatch[reg_num].rm_eo = -1;
1554 else if (type == OP_CLOSE_SUBEXP)
1556 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1557 if (reg_num < nmatch)
1559 /* We are at the last node of this sub expression. */
1560 if (pmatch[reg_num].rm_so < cur_idx)
1562 pmatch[reg_num].rm_eo = cur_idx;
1563 /* This is a non-empty match or we are not inside an optional
1564 subexpression. Accept this right away. */
1565 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1567 else
1569 if (dfa->nodes[cur_node].opt_subexp
1570 && prev_idx_match[reg_num].rm_so != -1)
1571 /* We transited through an empty match for an optional
1572 subexpression, like (a?)*, and this is not the subexp's
1573 first match. Copy back the old content of the registers
1574 so that matches of an inner subexpression are undone as
1575 well, like in ((a?))*. */
1576 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1577 else
1578 /* We completed a subexpression, but it may be part of
1579 an optional one, so do not update PREV_IDX_MATCH. */
1580 pmatch[reg_num].rm_eo = cur_idx;
1586 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1587 and sift the nodes in each states according to the following rules.
1588 Updated state_log will be wrote to STATE_LOG.
1590 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1591 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1592 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1593 the LAST_NODE, we throw away the node `a'.
1594 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1595 string `s' and transit to `b':
1596 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1597 away the node `a'.
1598 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1599 thrown away, we throw away the node `a'.
1600 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1601 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1602 node `a'.
1603 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1604 we throw away the node `a'. */
1606 #define STATE_NODE_CONTAINS(state,node) \
1607 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1609 static reg_errcode_t
1610 internal_function
1611 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1613 reg_errcode_t err;
1614 int null_cnt = 0;
1615 int str_idx = sctx->last_str_idx;
1616 re_node_set cur_dest;
1618 #ifdef DEBUG
1619 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1620 #endif
1622 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1623 transit to the last_node and the last_node itself. */
1624 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1625 if (BE (err != REG_NOERROR, 0))
1626 return err;
1627 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1628 if (BE (err != REG_NOERROR, 0))
1629 goto free_return;
1631 /* Then check each states in the state_log. */
1632 while (str_idx > 0)
1634 /* Update counters. */
1635 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1636 if (null_cnt > mctx->max_mb_elem_len)
1638 memset (sctx->sifted_states, '\0',
1639 sizeof (re_dfastate_t *) * str_idx);
1640 re_node_set_free (&cur_dest);
1641 return REG_NOERROR;
1643 re_node_set_empty (&cur_dest);
1644 --str_idx;
1646 if (mctx->state_log[str_idx])
1648 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1649 if (BE (err != REG_NOERROR, 0))
1650 goto free_return;
1653 /* Add all the nodes which satisfy the following conditions:
1654 - It can epsilon transit to a node in CUR_DEST.
1655 - It is in CUR_SRC.
1656 And update state_log. */
1657 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1658 if (BE (err != REG_NOERROR, 0))
1659 goto free_return;
1661 err = REG_NOERROR;
1662 free_return:
1663 re_node_set_free (&cur_dest);
1664 return err;
1667 static reg_errcode_t
1668 internal_function __attribute_warn_unused_result__
1669 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1670 int str_idx, re_node_set *cur_dest)
1672 const re_dfa_t *const dfa = mctx->dfa;
1673 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1674 int i;
1676 /* Then build the next sifted state.
1677 We build the next sifted state on `cur_dest', and update
1678 `sifted_states[str_idx]' with `cur_dest'.
1679 Note:
1680 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1681 `cur_src' points the node_set of the old `state_log[str_idx]'
1682 (with the epsilon nodes pre-filtered out). */
1683 for (i = 0; i < cur_src->nelem; i++)
1685 int prev_node = cur_src->elems[i];
1686 int naccepted = 0;
1687 int ret;
1689 #ifdef DEBUG
1690 re_token_type_t type = dfa->nodes[prev_node].type;
1691 assert (!IS_EPSILON_NODE (type));
1692 #endif
1693 #ifdef RE_ENABLE_I18N
1694 /* If the node may accept `multi byte'. */
1695 if (dfa->nodes[prev_node].accept_mb)
1696 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1697 str_idx, sctx->last_str_idx);
1698 #endif /* RE_ENABLE_I18N */
1700 /* We don't check backreferences here.
1701 See update_cur_sifted_state(). */
1702 if (!naccepted
1703 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1704 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1705 dfa->nexts[prev_node]))
1706 naccepted = 1;
1708 if (naccepted == 0)
1709 continue;
1711 if (sctx->limits.nelem)
1713 int to_idx = str_idx + naccepted;
1714 if (check_dst_limits (mctx, &sctx->limits,
1715 dfa->nexts[prev_node], to_idx,
1716 prev_node, str_idx))
1717 continue;
1719 ret = re_node_set_insert (cur_dest, prev_node);
1720 if (BE (ret == -1, 0))
1721 return REG_ESPACE;
1724 return REG_NOERROR;
1727 /* Helper functions. */
1729 static reg_errcode_t
1730 internal_function
1731 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1733 int top = mctx->state_log_top;
1735 if (next_state_log_idx >= mctx->input.bufs_len
1736 || (next_state_log_idx >= mctx->input.valid_len
1737 && mctx->input.valid_len < mctx->input.len))
1739 reg_errcode_t err;
1740 err = extend_buffers (mctx);
1741 if (BE (err != REG_NOERROR, 0))
1742 return err;
1745 if (top < next_state_log_idx)
1747 memset (mctx->state_log + top + 1, '\0',
1748 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1749 mctx->state_log_top = next_state_log_idx;
1751 return REG_NOERROR;
1754 static reg_errcode_t
1755 internal_function
1756 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1757 re_dfastate_t **src, int num)
1759 int st_idx;
1760 reg_errcode_t err;
1761 for (st_idx = 0; st_idx < num; ++st_idx)
1763 if (dst[st_idx] == NULL)
1764 dst[st_idx] = src[st_idx];
1765 else if (src[st_idx] != NULL)
1767 re_node_set merged_set;
1768 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1769 &src[st_idx]->nodes);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1772 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1773 re_node_set_free (&merged_set);
1774 if (BE (err != REG_NOERROR, 0))
1775 return err;
1778 return REG_NOERROR;
1781 static reg_errcode_t
1782 internal_function
1783 update_cur_sifted_state (const re_match_context_t *mctx,
1784 re_sift_context_t *sctx, int str_idx,
1785 re_node_set *dest_nodes)
1787 const re_dfa_t *const dfa = mctx->dfa;
1788 reg_errcode_t err = REG_NOERROR;
1789 const re_node_set *candidates;
1790 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1791 : &mctx->state_log[str_idx]->nodes);
1793 if (dest_nodes->nelem == 0)
1794 sctx->sifted_states[str_idx] = NULL;
1795 else
1797 if (candidates)
1799 /* At first, add the nodes which can epsilon transit to a node in
1800 DEST_NODE. */
1801 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1802 if (BE (err != REG_NOERROR, 0))
1803 return err;
1805 /* Then, check the limitations in the current sift_context. */
1806 if (sctx->limits.nelem)
1808 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1809 mctx->bkref_ents, str_idx);
1810 if (BE (err != REG_NOERROR, 0))
1811 return err;
1815 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1816 if (BE (err != REG_NOERROR, 0))
1817 return err;
1820 if (candidates && mctx->state_log[str_idx]->has_backref)
1822 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1823 if (BE (err != REG_NOERROR, 0))
1824 return err;
1826 return REG_NOERROR;
1829 static reg_errcode_t
1830 internal_function __attribute_warn_unused_result__
1831 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1832 const re_node_set *candidates)
1834 reg_errcode_t err = REG_NOERROR;
1835 int i;
1837 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1838 if (BE (err != REG_NOERROR, 0))
1839 return err;
1841 if (!state->inveclosure.alloc)
1843 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1844 if (BE (err != REG_NOERROR, 0))
1845 return REG_ESPACE;
1846 for (i = 0; i < dest_nodes->nelem; i++)
1848 err = re_node_set_merge (&state->inveclosure,
1849 dfa->inveclosures + dest_nodes->elems[i]);
1850 if (BE (err != REG_NOERROR, 0))
1851 return REG_ESPACE;
1854 return re_node_set_add_intersect (dest_nodes, candidates,
1855 &state->inveclosure);
1858 static reg_errcode_t
1859 internal_function
1860 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1861 const re_node_set *candidates)
1863 int ecl_idx;
1864 reg_errcode_t err;
1865 re_node_set *inv_eclosure = dfa->inveclosures + node;
1866 re_node_set except_nodes;
1867 re_node_set_init_empty (&except_nodes);
1868 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1870 int cur_node = inv_eclosure->elems[ecl_idx];
1871 if (cur_node == node)
1872 continue;
1873 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1875 int edst1 = dfa->edests[cur_node].elems[0];
1876 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1877 ? dfa->edests[cur_node].elems[1] : -1);
1878 if ((!re_node_set_contains (inv_eclosure, edst1)
1879 && re_node_set_contains (dest_nodes, edst1))
1880 || (edst2 > 0
1881 && !re_node_set_contains (inv_eclosure, edst2)
1882 && re_node_set_contains (dest_nodes, edst2)))
1884 err = re_node_set_add_intersect (&except_nodes, candidates,
1885 dfa->inveclosures + cur_node);
1886 if (BE (err != REG_NOERROR, 0))
1888 re_node_set_free (&except_nodes);
1889 return err;
1894 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1896 int cur_node = inv_eclosure->elems[ecl_idx];
1897 if (!re_node_set_contains (&except_nodes, cur_node))
1899 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1900 re_node_set_remove_at (dest_nodes, idx);
1903 re_node_set_free (&except_nodes);
1904 return REG_NOERROR;
1907 static int
1908 internal_function
1909 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1910 int dst_node, int dst_idx, int src_node, int src_idx)
1912 const re_dfa_t *const dfa = mctx->dfa;
1913 int lim_idx, src_pos, dst_pos;
1915 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1916 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1917 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1919 int subexp_idx;
1920 struct re_backref_cache_entry *ent;
1921 ent = mctx->bkref_ents + limits->elems[lim_idx];
1922 subexp_idx = dfa->nodes[ent->node].opr.idx;
1924 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1925 subexp_idx, dst_node, dst_idx,
1926 dst_bkref_idx);
1927 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1928 subexp_idx, src_node, src_idx,
1929 src_bkref_idx);
1931 /* In case of:
1932 <src> <dst> ( <subexp> )
1933 ( <subexp> ) <src> <dst>
1934 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1935 if (src_pos == dst_pos)
1936 continue; /* This is unrelated limitation. */
1937 else
1938 return 1;
1940 return 0;
1943 static int
1944 internal_function
1945 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1946 int subexp_idx, int from_node, int bkref_idx)
1948 const re_dfa_t *const dfa = mctx->dfa;
1949 const re_node_set *eclosures = dfa->eclosures + from_node;
1950 int node_idx;
1952 /* Else, we are on the boundary: examine the nodes on the epsilon
1953 closure. */
1954 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1956 int node = eclosures->elems[node_idx];
1957 switch (dfa->nodes[node].type)
1959 case OP_BACK_REF:
1960 if (bkref_idx != -1)
1962 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1965 int dst, cpos;
1967 if (ent->node != node)
1968 continue;
1970 if (subexp_idx < BITSET_WORD_BITS
1971 && !(ent->eps_reachable_subexps_map
1972 & ((bitset_word_t) 1 << subexp_idx)))
1973 continue;
1975 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1976 OP_CLOSE_SUBEXP cases below. But, if the
1977 destination node is the same node as the source
1978 node, don't recurse because it would cause an
1979 infinite loop: a regex that exhibits this behavior
1980 is ()\1*\1* */
1981 dst = dfa->edests[node].elems[0];
1982 if (dst == from_node)
1984 if (boundaries & 1)
1985 return -1;
1986 else /* if (boundaries & 2) */
1987 return 0;
1990 cpos =
1991 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1992 dst, bkref_idx);
1993 if (cpos == -1 /* && (boundaries & 1) */)
1994 return -1;
1995 if (cpos == 0 && (boundaries & 2))
1996 return 0;
1998 if (subexp_idx < BITSET_WORD_BITS)
1999 ent->eps_reachable_subexps_map
2000 &= ~((bitset_word_t) 1 << subexp_idx);
2002 while (ent++->more);
2004 break;
2006 case OP_OPEN_SUBEXP:
2007 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2008 return -1;
2009 break;
2011 case OP_CLOSE_SUBEXP:
2012 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2013 return 0;
2014 break;
2016 default:
2017 break;
2021 return (boundaries & 2) ? 1 : 0;
2024 static int
2025 internal_function
2026 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2027 int subexp_idx, int from_node, int str_idx,
2028 int bkref_idx)
2030 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2031 int boundaries;
2033 /* If we are outside the range of the subexpression, return -1 or 1. */
2034 if (str_idx < lim->subexp_from)
2035 return -1;
2037 if (lim->subexp_to < str_idx)
2038 return 1;
2040 /* If we are within the subexpression, return 0. */
2041 boundaries = (str_idx == lim->subexp_from);
2042 boundaries |= (str_idx == lim->subexp_to) << 1;
2043 if (boundaries == 0)
2044 return 0;
2046 /* Else, examine epsilon closure. */
2047 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2048 from_node, bkref_idx);
2051 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2052 which are against limitations from DEST_NODES. */
2054 static reg_errcode_t
2055 internal_function
2056 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2057 const re_node_set *candidates, re_node_set *limits,
2058 struct re_backref_cache_entry *bkref_ents, int str_idx)
2060 reg_errcode_t err;
2061 int node_idx, lim_idx;
2063 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2065 int subexp_idx;
2066 struct re_backref_cache_entry *ent;
2067 ent = bkref_ents + limits->elems[lim_idx];
2069 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2070 continue; /* This is unrelated limitation. */
2072 subexp_idx = dfa->nodes[ent->node].opr.idx;
2073 if (ent->subexp_to == str_idx)
2075 int ops_node = -1;
2076 int cls_node = -1;
2077 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2079 int node = dest_nodes->elems[node_idx];
2080 re_token_type_t type = dfa->nodes[node].type;
2081 if (type == OP_OPEN_SUBEXP
2082 && subexp_idx == dfa->nodes[node].opr.idx)
2083 ops_node = node;
2084 else if (type == OP_CLOSE_SUBEXP
2085 && subexp_idx == dfa->nodes[node].opr.idx)
2086 cls_node = node;
2089 /* Check the limitation of the open subexpression. */
2090 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2091 if (ops_node >= 0)
2093 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2094 candidates);
2095 if (BE (err != REG_NOERROR, 0))
2096 return err;
2099 /* Check the limitation of the close subexpression. */
2100 if (cls_node >= 0)
2101 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2103 int node = dest_nodes->elems[node_idx];
2104 if (!re_node_set_contains (dfa->inveclosures + node,
2105 cls_node)
2106 && !re_node_set_contains (dfa->eclosures + node,
2107 cls_node))
2109 /* It is against this limitation.
2110 Remove it form the current sifted state. */
2111 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2112 candidates);
2113 if (BE (err != REG_NOERROR, 0))
2114 return err;
2115 --node_idx;
2119 else /* (ent->subexp_to != str_idx) */
2121 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2123 int node = dest_nodes->elems[node_idx];
2124 re_token_type_t type = dfa->nodes[node].type;
2125 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2127 if (subexp_idx != dfa->nodes[node].opr.idx)
2128 continue;
2129 /* It is against this limitation.
2130 Remove it form the current sifted state. */
2131 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2132 candidates);
2133 if (BE (err != REG_NOERROR, 0))
2134 return err;
2139 return REG_NOERROR;
2142 static reg_errcode_t
2143 internal_function __attribute_warn_unused_result__
2144 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2145 int str_idx, const re_node_set *candidates)
2147 const re_dfa_t *const dfa = mctx->dfa;
2148 reg_errcode_t err;
2149 int node_idx, node;
2150 re_sift_context_t local_sctx;
2151 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2153 if (first_idx == -1)
2154 return REG_NOERROR;
2156 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2158 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2160 int enabled_idx;
2161 re_token_type_t type;
2162 struct re_backref_cache_entry *entry;
2163 node = candidates->elems[node_idx];
2164 type = dfa->nodes[node].type;
2165 /* Avoid infinite loop for the REs like "()\1+". */
2166 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2167 continue;
2168 if (type != OP_BACK_REF)
2169 continue;
2171 entry = mctx->bkref_ents + first_idx;
2172 enabled_idx = first_idx;
2175 int subexp_len;
2176 int to_idx;
2177 int dst_node;
2178 int ret;
2179 re_dfastate_t *cur_state;
2181 if (entry->node != node)
2182 continue;
2183 subexp_len = entry->subexp_to - entry->subexp_from;
2184 to_idx = str_idx + subexp_len;
2185 dst_node = (subexp_len ? dfa->nexts[node]
2186 : dfa->edests[node].elems[0]);
2188 if (to_idx > sctx->last_str_idx
2189 || sctx->sifted_states[to_idx] == NULL
2190 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2191 || check_dst_limits (mctx, &sctx->limits, node,
2192 str_idx, dst_node, to_idx))
2193 continue;
2195 if (local_sctx.sifted_states == NULL)
2197 local_sctx = *sctx;
2198 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2199 if (BE (err != REG_NOERROR, 0))
2200 goto free_return;
2202 local_sctx.last_node = node;
2203 local_sctx.last_str_idx = str_idx;
2204 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2205 if (BE (ret < 0, 0))
2207 err = REG_ESPACE;
2208 goto free_return;
2210 cur_state = local_sctx.sifted_states[str_idx];
2211 err = sift_states_backward (mctx, &local_sctx);
2212 if (BE (err != REG_NOERROR, 0))
2213 goto free_return;
2214 if (sctx->limited_states != NULL)
2216 err = merge_state_array (dfa, sctx->limited_states,
2217 local_sctx.sifted_states,
2218 str_idx + 1);
2219 if (BE (err != REG_NOERROR, 0))
2220 goto free_return;
2222 local_sctx.sifted_states[str_idx] = cur_state;
2223 re_node_set_remove (&local_sctx.limits, enabled_idx);
2225 /* mctx->bkref_ents may have changed, reload the pointer. */
2226 entry = mctx->bkref_ents + enabled_idx;
2228 while (enabled_idx++, entry++->more);
2230 err = REG_NOERROR;
2231 free_return:
2232 if (local_sctx.sifted_states != NULL)
2234 re_node_set_free (&local_sctx.limits);
2237 return err;
2241 #ifdef RE_ENABLE_I18N
2242 static int
2243 internal_function
2244 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2245 int node_idx, int str_idx, int max_str_idx)
2247 const re_dfa_t *const dfa = mctx->dfa;
2248 int naccepted;
2249 /* Check the node can accept `multi byte'. */
2250 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2251 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2252 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2253 dfa->nexts[node_idx]))
2254 /* The node can't accept the `multi byte', or the
2255 destination was already thrown away, then the node
2256 could't accept the current input `multi byte'. */
2257 naccepted = 0;
2258 /* Otherwise, it is sure that the node could accept
2259 `naccepted' bytes input. */
2260 return naccepted;
2262 #endif /* RE_ENABLE_I18N */
2265 /* Functions for state transition. */
2267 /* Return the next state to which the current state STATE will transit by
2268 accepting the current input byte, and update STATE_LOG if necessary.
2269 If STATE can accept a multibyte char/collating element/back reference
2270 update the destination of STATE_LOG. */
2272 static re_dfastate_t *
2273 internal_function __attribute_warn_unused_result__
2274 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2275 re_dfastate_t *state)
2277 re_dfastate_t **trtable;
2278 unsigned char ch;
2280 #ifdef RE_ENABLE_I18N
2281 /* If the current state can accept multibyte. */
2282 if (BE (state->accept_mb, 0))
2284 *err = transit_state_mb (mctx, state);
2285 if (BE (*err != REG_NOERROR, 0))
2286 return NULL;
2288 #endif /* RE_ENABLE_I18N */
2290 /* Then decide the next state with the single byte. */
2291 #if 0
2292 if (0)
2293 /* don't use transition table */
2294 return transit_state_sb (err, mctx, state);
2295 #endif
2297 /* Use transition table */
2298 ch = re_string_fetch_byte (&mctx->input);
2299 for (;;)
2301 trtable = state->trtable;
2302 if (BE (trtable != NULL, 1))
2303 return trtable[ch];
2305 trtable = state->word_trtable;
2306 if (BE (trtable != NULL, 1))
2308 unsigned int context;
2309 context
2310 = re_string_context_at (&mctx->input,
2311 re_string_cur_idx (&mctx->input) - 1,
2312 mctx->eflags);
2313 if (IS_WORD_CONTEXT (context))
2314 return trtable[ch + SBC_MAX];
2315 else
2316 return trtable[ch];
2319 if (!build_trtable (mctx->dfa, state))
2321 *err = REG_ESPACE;
2322 return NULL;
2325 /* Retry, we now have a transition table. */
2329 /* Update the state_log if we need */
2330 re_dfastate_t *
2331 internal_function
2332 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2333 re_dfastate_t *next_state)
2335 const re_dfa_t *const dfa = mctx->dfa;
2336 int cur_idx = re_string_cur_idx (&mctx->input);
2338 if (cur_idx > mctx->state_log_top)
2340 mctx->state_log[cur_idx] = next_state;
2341 mctx->state_log_top = cur_idx;
2343 else if (mctx->state_log[cur_idx] == 0)
2345 mctx->state_log[cur_idx] = next_state;
2347 else
2349 re_dfastate_t *pstate;
2350 unsigned int context;
2351 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2352 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2353 the destination of a multibyte char/collating element/
2354 back reference. Then the next state is the union set of
2355 these destinations and the results of the transition table. */
2356 pstate = mctx->state_log[cur_idx];
2357 log_nodes = pstate->entrance_nodes;
2358 if (next_state != NULL)
2360 table_nodes = next_state->entrance_nodes;
2361 *err = re_node_set_init_union (&next_nodes, table_nodes,
2362 log_nodes);
2363 if (BE (*err != REG_NOERROR, 0))
2364 return NULL;
2366 else
2367 next_nodes = *log_nodes;
2368 /* Note: We already add the nodes of the initial state,
2369 then we don't need to add them here. */
2371 context = re_string_context_at (&mctx->input,
2372 re_string_cur_idx (&mctx->input) - 1,
2373 mctx->eflags);
2374 next_state = mctx->state_log[cur_idx]
2375 = re_acquire_state_context (err, dfa, &next_nodes, context);
2376 /* We don't need to check errors here, since the return value of
2377 this function is next_state and ERR is already set. */
2379 if (table_nodes != NULL)
2380 re_node_set_free (&next_nodes);
2383 if (BE (dfa->nbackref, 0) && next_state != NULL)
2385 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2386 later. We must check them here, since the back references in the
2387 next state might use them. */
2388 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2389 cur_idx);
2390 if (BE (*err != REG_NOERROR, 0))
2391 return NULL;
2393 /* If the next state has back references. */
2394 if (next_state->has_backref)
2396 *err = transit_state_bkref (mctx, &next_state->nodes);
2397 if (BE (*err != REG_NOERROR, 0))
2398 return NULL;
2399 next_state = mctx->state_log[cur_idx];
2403 return next_state;
2406 /* Skip bytes in the input that correspond to part of a
2407 multi-byte match, then look in the log for a state
2408 from which to restart matching. */
2409 re_dfastate_t *
2410 internal_function
2411 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2413 re_dfastate_t *cur_state;
2416 int max = mctx->state_log_top;
2417 int cur_str_idx = re_string_cur_idx (&mctx->input);
2421 if (++cur_str_idx > max)
2422 return NULL;
2423 re_string_skip_bytes (&mctx->input, 1);
2425 while (mctx->state_log[cur_str_idx] == NULL);
2427 cur_state = merge_state_with_log (err, mctx, NULL);
2429 while (*err == REG_NOERROR && cur_state == NULL);
2430 return cur_state;
2433 /* Helper functions for transit_state. */
2435 /* From the node set CUR_NODES, pick up the nodes whose types are
2436 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2437 expression. And register them to use them later for evaluating the
2438 correspoding back references. */
2440 static reg_errcode_t
2441 internal_function
2442 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2443 int str_idx)
2445 const re_dfa_t *const dfa = mctx->dfa;
2446 int node_idx;
2447 reg_errcode_t err;
2449 /* TODO: This isn't efficient.
2450 Because there might be more than one nodes whose types are
2451 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2452 nodes.
2453 E.g. RE: (a){2} */
2454 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2456 int node = cur_nodes->elems[node_idx];
2457 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2458 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2459 && (dfa->used_bkref_map
2460 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2462 err = match_ctx_add_subtop (mctx, node, str_idx);
2463 if (BE (err != REG_NOERROR, 0))
2464 return err;
2467 return REG_NOERROR;
2470 #if 0
2471 /* Return the next state to which the current state STATE will transit by
2472 accepting the current input byte. */
2474 static re_dfastate_t *
2475 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2476 re_dfastate_t *state)
2478 const re_dfa_t *const dfa = mctx->dfa;
2479 re_node_set next_nodes;
2480 re_dfastate_t *next_state;
2481 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2482 unsigned int context;
2484 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2485 if (BE (*err != REG_NOERROR, 0))
2486 return NULL;
2487 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2489 int cur_node = state->nodes.elems[node_cnt];
2490 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2492 *err = re_node_set_merge (&next_nodes,
2493 dfa->eclosures + dfa->nexts[cur_node]);
2494 if (BE (*err != REG_NOERROR, 0))
2496 re_node_set_free (&next_nodes);
2497 return NULL;
2501 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2502 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2503 /* We don't need to check errors here, since the return value of
2504 this function is next_state and ERR is already set. */
2506 re_node_set_free (&next_nodes);
2507 re_string_skip_bytes (&mctx->input, 1);
2508 return next_state;
2510 #endif
2512 #ifdef RE_ENABLE_I18N
2513 static reg_errcode_t
2514 internal_function
2515 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2517 const re_dfa_t *const dfa = mctx->dfa;
2518 reg_errcode_t err;
2519 int i;
2521 for (i = 0; i < pstate->nodes.nelem; ++i)
2523 re_node_set dest_nodes, *new_nodes;
2524 int cur_node_idx = pstate->nodes.elems[i];
2525 int naccepted, dest_idx;
2526 unsigned int context;
2527 re_dfastate_t *dest_state;
2529 if (!dfa->nodes[cur_node_idx].accept_mb)
2530 continue;
2532 if (dfa->nodes[cur_node_idx].constraint)
2534 context = re_string_context_at (&mctx->input,
2535 re_string_cur_idx (&mctx->input),
2536 mctx->eflags);
2537 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2538 context))
2539 continue;
2542 /* How many bytes the node can accept? */
2543 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2544 re_string_cur_idx (&mctx->input));
2545 if (naccepted == 0)
2546 continue;
2548 /* The node can accepts `naccepted' bytes. */
2549 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2550 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2551 : mctx->max_mb_elem_len);
2552 err = clean_state_log_if_needed (mctx, dest_idx);
2553 if (BE (err != REG_NOERROR, 0))
2554 return err;
2555 #ifdef DEBUG
2556 assert (dfa->nexts[cur_node_idx] != -1);
2557 #endif
2558 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2560 dest_state = mctx->state_log[dest_idx];
2561 if (dest_state == NULL)
2562 dest_nodes = *new_nodes;
2563 else
2565 err = re_node_set_init_union (&dest_nodes,
2566 dest_state->entrance_nodes, new_nodes);
2567 if (BE (err != REG_NOERROR, 0))
2568 return err;
2570 context = re_string_context_at (&mctx->input, dest_idx - 1,
2571 mctx->eflags);
2572 mctx->state_log[dest_idx]
2573 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2574 if (dest_state != NULL)
2575 re_node_set_free (&dest_nodes);
2576 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2577 return err;
2579 return REG_NOERROR;
2581 #endif /* RE_ENABLE_I18N */
2583 static reg_errcode_t
2584 internal_function
2585 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2587 const re_dfa_t *const dfa = mctx->dfa;
2588 reg_errcode_t err;
2589 int i;
2590 int cur_str_idx = re_string_cur_idx (&mctx->input);
2592 for (i = 0; i < nodes->nelem; ++i)
2594 int dest_str_idx, prev_nelem, bkc_idx;
2595 int node_idx = nodes->elems[i];
2596 unsigned int context;
2597 const re_token_t *node = dfa->nodes + node_idx;
2598 re_node_set *new_dest_nodes;
2600 /* Check whether `node' is a backreference or not. */
2601 if (node->type != OP_BACK_REF)
2602 continue;
2604 if (node->constraint)
2606 context = re_string_context_at (&mctx->input, cur_str_idx,
2607 mctx->eflags);
2608 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2609 continue;
2612 /* `node' is a backreference.
2613 Check the substring which the substring matched. */
2614 bkc_idx = mctx->nbkref_ents;
2615 err = get_subexp (mctx, node_idx, cur_str_idx);
2616 if (BE (err != REG_NOERROR, 0))
2617 goto free_return;
2619 /* And add the epsilon closures (which is `new_dest_nodes') of
2620 the backreference to appropriate state_log. */
2621 #ifdef DEBUG
2622 assert (dfa->nexts[node_idx] != -1);
2623 #endif
2624 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2626 int subexp_len;
2627 re_dfastate_t *dest_state;
2628 struct re_backref_cache_entry *bkref_ent;
2629 bkref_ent = mctx->bkref_ents + bkc_idx;
2630 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2631 continue;
2632 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2633 new_dest_nodes = (subexp_len == 0
2634 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2635 : dfa->eclosures + dfa->nexts[node_idx]);
2636 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2637 - bkref_ent->subexp_from);
2638 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2639 mctx->eflags);
2640 dest_state = mctx->state_log[dest_str_idx];
2641 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2642 : mctx->state_log[cur_str_idx]->nodes.nelem);
2643 /* Add `new_dest_node' to state_log. */
2644 if (dest_state == NULL)
2646 mctx->state_log[dest_str_idx]
2647 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2648 context);
2649 if (BE (mctx->state_log[dest_str_idx] == NULL
2650 && err != REG_NOERROR, 0))
2651 goto free_return;
2653 else
2655 re_node_set dest_nodes;
2656 err = re_node_set_init_union (&dest_nodes,
2657 dest_state->entrance_nodes,
2658 new_dest_nodes);
2659 if (BE (err != REG_NOERROR, 0))
2661 re_node_set_free (&dest_nodes);
2662 goto free_return;
2664 mctx->state_log[dest_str_idx]
2665 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2666 re_node_set_free (&dest_nodes);
2667 if (BE (mctx->state_log[dest_str_idx] == NULL
2668 && err != REG_NOERROR, 0))
2669 goto free_return;
2671 /* We need to check recursively if the backreference can epsilon
2672 transit. */
2673 if (subexp_len == 0
2674 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2676 err = check_subexp_matching_top (mctx, new_dest_nodes,
2677 cur_str_idx);
2678 if (BE (err != REG_NOERROR, 0))
2679 goto free_return;
2680 err = transit_state_bkref (mctx, new_dest_nodes);
2681 if (BE (err != REG_NOERROR, 0))
2682 goto free_return;
2686 err = REG_NOERROR;
2687 free_return:
2688 return err;
2691 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2692 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2693 Note that we might collect inappropriate candidates here.
2694 However, the cost of checking them strictly here is too high, then we
2695 delay these checking for prune_impossible_nodes(). */
2697 static reg_errcode_t
2698 internal_function __attribute_warn_unused_result__
2699 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2701 const re_dfa_t *const dfa = mctx->dfa;
2702 int subexp_num, sub_top_idx;
2703 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2704 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2705 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2706 if (cache_idx != -1)
2708 const struct re_backref_cache_entry *entry
2709 = mctx->bkref_ents + cache_idx;
2711 if (entry->node == bkref_node)
2712 return REG_NOERROR; /* We already checked it. */
2713 while (entry++->more);
2716 subexp_num = dfa->nodes[bkref_node].opr.idx;
2718 /* For each sub expression */
2719 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2721 reg_errcode_t err;
2722 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2723 re_sub_match_last_t *sub_last;
2724 int sub_last_idx, sl_str, bkref_str_off;
2726 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2727 continue; /* It isn't related. */
2729 sl_str = sub_top->str_idx;
2730 bkref_str_off = bkref_str_idx;
2731 /* At first, check the last node of sub expressions we already
2732 evaluated. */
2733 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2735 int sl_str_diff;
2736 sub_last = sub_top->lasts[sub_last_idx];
2737 sl_str_diff = sub_last->str_idx - sl_str;
2738 /* The matched string by the sub expression match with the substring
2739 at the back reference? */
2740 if (sl_str_diff > 0)
2742 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2744 /* Not enough chars for a successful match. */
2745 if (bkref_str_off + sl_str_diff > mctx->input.len)
2746 break;
2748 err = clean_state_log_if_needed (mctx,
2749 bkref_str_off
2750 + sl_str_diff);
2751 if (BE (err != REG_NOERROR, 0))
2752 return err;
2753 buf = (const char *) re_string_get_buffer (&mctx->input);
2755 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2756 /* We don't need to search this sub expression any more. */
2757 break;
2759 bkref_str_off += sl_str_diff;
2760 sl_str += sl_str_diff;
2761 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762 bkref_str_idx);
2764 /* Reload buf, since the preceding call might have reallocated
2765 the buffer. */
2766 buf = (const char *) re_string_get_buffer (&mctx->input);
2768 if (err == REG_NOMATCH)
2769 continue;
2770 if (BE (err != REG_NOERROR, 0))
2771 return err;
2774 if (sub_last_idx < sub_top->nlasts)
2775 continue;
2776 if (sub_last_idx > 0)
2777 ++sl_str;
2778 /* Then, search for the other last nodes of the sub expression. */
2779 for (; sl_str <= bkref_str_idx; ++sl_str)
2781 int cls_node, sl_str_off;
2782 const re_node_set *nodes;
2783 sl_str_off = sl_str - sub_top->str_idx;
2784 /* The matched string by the sub expression match with the substring
2785 at the back reference? */
2786 if (sl_str_off > 0)
2788 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2790 /* If we are at the end of the input, we cannot match. */
2791 if (bkref_str_off >= mctx->input.len)
2792 break;
2794 err = extend_buffers (mctx);
2795 if (BE (err != REG_NOERROR, 0))
2796 return err;
2798 buf = (const char *) re_string_get_buffer (&mctx->input);
2800 if (buf [bkref_str_off++] != buf[sl_str - 1])
2801 break; /* We don't need to search this sub expression
2802 any more. */
2804 if (mctx->state_log[sl_str] == NULL)
2805 continue;
2806 /* Does this state have a ')' of the sub expression? */
2807 nodes = &mctx->state_log[sl_str]->nodes;
2808 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2809 OP_CLOSE_SUBEXP);
2810 if (cls_node == -1)
2811 continue; /* No. */
2812 if (sub_top->path == NULL)
2814 sub_top->path = calloc (sizeof (state_array_t),
2815 sl_str - sub_top->str_idx + 1);
2816 if (sub_top->path == NULL)
2817 return REG_ESPACE;
2819 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2820 in the current context? */
2821 err = check_arrival (mctx, sub_top->path, sub_top->node,
2822 sub_top->str_idx, cls_node, sl_str,
2823 OP_CLOSE_SUBEXP);
2824 if (err == REG_NOMATCH)
2825 continue;
2826 if (BE (err != REG_NOERROR, 0))
2827 return err;
2828 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2829 if (BE (sub_last == NULL, 0))
2830 return REG_ESPACE;
2831 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2832 bkref_str_idx);
2833 if (err == REG_NOMATCH)
2834 continue;
2837 return REG_NOERROR;
2840 /* Helper functions for get_subexp(). */
2842 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2843 If it can arrive, register the sub expression expressed with SUB_TOP
2844 and SUB_LAST. */
2846 static reg_errcode_t
2847 internal_function
2848 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2849 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2851 reg_errcode_t err;
2852 int to_idx;
2853 /* Can the subexpression arrive the back reference? */
2854 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2855 sub_last->str_idx, bkref_node, bkref_str,
2856 OP_OPEN_SUBEXP);
2857 if (err != REG_NOERROR)
2858 return err;
2859 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2860 sub_last->str_idx);
2861 if (BE (err != REG_NOERROR, 0))
2862 return err;
2863 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2864 return clean_state_log_if_needed (mctx, to_idx);
2867 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2868 Search '(' if FL_OPEN, or search ')' otherwise.
2869 TODO: This function isn't efficient...
2870 Because there might be more than one nodes whose types are
2871 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2872 nodes.
2873 E.g. RE: (a){2} */
2875 static int
2876 internal_function
2877 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2878 int subexp_idx, int type)
2880 int cls_idx;
2881 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2883 int cls_node = nodes->elems[cls_idx];
2884 const re_token_t *node = dfa->nodes + cls_node;
2885 if (node->type == type
2886 && node->opr.idx == subexp_idx)
2887 return cls_node;
2889 return -1;
2892 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2893 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2894 heavily reused.
2895 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2897 static reg_errcode_t
2898 internal_function __attribute_warn_unused_result__
2899 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2900 int top_str, int last_node, int last_str, int type)
2902 const re_dfa_t *const dfa = mctx->dfa;
2903 reg_errcode_t err = REG_NOERROR;
2904 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2905 re_dfastate_t *cur_state = NULL;
2906 re_node_set *cur_nodes, next_nodes;
2907 re_dfastate_t **backup_state_log;
2908 unsigned int context;
2910 subexp_num = dfa->nodes[top_node].opr.idx;
2911 /* Extend the buffer if we need. */
2912 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2914 re_dfastate_t **new_array;
2915 int old_alloc = path->alloc;
2916 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2917 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2918 if (BE (new_array == NULL, 0))
2920 path->alloc = old_alloc;
2921 return REG_ESPACE;
2923 path->array = new_array;
2924 memset (new_array + old_alloc, '\0',
2925 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2928 str_idx = path->next_idx ?: top_str;
2930 /* Temporary modify MCTX. */
2931 backup_state_log = mctx->state_log;
2932 backup_cur_idx = mctx->input.cur_idx;
2933 mctx->state_log = path->array;
2934 mctx->input.cur_idx = str_idx;
2936 /* Setup initial node set. */
2937 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2938 if (str_idx == top_str)
2940 err = re_node_set_init_1 (&next_nodes, top_node);
2941 if (BE (err != REG_NOERROR, 0))
2942 return err;
2943 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2944 if (BE (err != REG_NOERROR, 0))
2946 re_node_set_free (&next_nodes);
2947 return err;
2950 else
2952 cur_state = mctx->state_log[str_idx];
2953 if (cur_state && cur_state->has_backref)
2955 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2956 if (BE (err != REG_NOERROR, 0))
2957 return err;
2959 else
2960 re_node_set_init_empty (&next_nodes);
2962 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2964 if (next_nodes.nelem)
2966 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2967 subexp_num, type);
2968 if (BE (err != REG_NOERROR, 0))
2970 re_node_set_free (&next_nodes);
2971 return err;
2974 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2977 re_node_set_free (&next_nodes);
2978 return err;
2980 mctx->state_log[str_idx] = cur_state;
2983 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2985 re_node_set_empty (&next_nodes);
2986 if (mctx->state_log[str_idx + 1])
2988 err = re_node_set_merge (&next_nodes,
2989 &mctx->state_log[str_idx + 1]->nodes);
2990 if (BE (err != REG_NOERROR, 0))
2992 re_node_set_free (&next_nodes);
2993 return err;
2996 if (cur_state)
2998 err = check_arrival_add_next_nodes (mctx, str_idx,
2999 &cur_state->non_eps_nodes,
3000 &next_nodes);
3001 if (BE (err != REG_NOERROR, 0))
3003 re_node_set_free (&next_nodes);
3004 return err;
3007 ++str_idx;
3008 if (next_nodes.nelem)
3010 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3011 if (BE (err != REG_NOERROR, 0))
3013 re_node_set_free (&next_nodes);
3014 return err;
3016 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3017 subexp_num, type);
3018 if (BE (err != REG_NOERROR, 0))
3020 re_node_set_free (&next_nodes);
3021 return err;
3024 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3025 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3026 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3028 re_node_set_free (&next_nodes);
3029 return err;
3031 mctx->state_log[str_idx] = cur_state;
3032 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3034 re_node_set_free (&next_nodes);
3035 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3036 : &mctx->state_log[last_str]->nodes);
3037 path->next_idx = str_idx;
3039 /* Fix MCTX. */
3040 mctx->state_log = backup_state_log;
3041 mctx->input.cur_idx = backup_cur_idx;
3043 /* Then check the current node set has the node LAST_NODE. */
3044 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3045 return REG_NOERROR;
3047 return REG_NOMATCH;
3050 /* Helper functions for check_arrival. */
3052 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3053 to NEXT_NODES.
3054 TODO: This function is similar to the functions transit_state*(),
3055 however this function has many additional works.
3056 Can't we unify them? */
3058 static reg_errcode_t
3059 internal_function __attribute_warn_unused_result__
3060 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3061 re_node_set *cur_nodes, re_node_set *next_nodes)
3063 const re_dfa_t *const dfa = mctx->dfa;
3064 int result;
3065 int cur_idx;
3066 reg_errcode_t err = REG_NOERROR;
3067 re_node_set union_set;
3068 re_node_set_init_empty (&union_set);
3069 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3071 int naccepted = 0;
3072 int cur_node = cur_nodes->elems[cur_idx];
3073 #ifdef DEBUG
3074 re_token_type_t type = dfa->nodes[cur_node].type;
3075 assert (!IS_EPSILON_NODE (type));
3076 #endif
3077 #ifdef RE_ENABLE_I18N
3078 /* If the node may accept `multi byte'. */
3079 if (dfa->nodes[cur_node].accept_mb)
3081 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3082 str_idx);
3083 if (naccepted > 1)
3085 re_dfastate_t *dest_state;
3086 int next_node = dfa->nexts[cur_node];
3087 int next_idx = str_idx + naccepted;
3088 dest_state = mctx->state_log[next_idx];
3089 re_node_set_empty (&union_set);
3090 if (dest_state)
3092 err = re_node_set_merge (&union_set, &dest_state->nodes);
3093 if (BE (err != REG_NOERROR, 0))
3095 re_node_set_free (&union_set);
3096 return err;
3099 result = re_node_set_insert (&union_set, next_node);
3100 if (BE (result < 0, 0))
3102 re_node_set_free (&union_set);
3103 return REG_ESPACE;
3105 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3106 &union_set);
3107 if (BE (mctx->state_log[next_idx] == NULL
3108 && err != REG_NOERROR, 0))
3110 re_node_set_free (&union_set);
3111 return err;
3115 #endif /* RE_ENABLE_I18N */
3116 if (naccepted
3117 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3119 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3120 if (BE (result < 0, 0))
3122 re_node_set_free (&union_set);
3123 return REG_ESPACE;
3127 re_node_set_free (&union_set);
3128 return REG_NOERROR;
3131 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3132 CUR_NODES, however exclude the nodes which are:
3133 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3134 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3137 static reg_errcode_t
3138 internal_function
3139 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3140 int ex_subexp, int type)
3142 reg_errcode_t err;
3143 int idx, outside_node;
3144 re_node_set new_nodes;
3145 #ifdef DEBUG
3146 assert (cur_nodes->nelem);
3147 #endif
3148 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3149 if (BE (err != REG_NOERROR, 0))
3150 return err;
3151 /* Create a new node set NEW_NODES with the nodes which are epsilon
3152 closures of the node in CUR_NODES. */
3154 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3156 int cur_node = cur_nodes->elems[idx];
3157 const re_node_set *eclosure = dfa->eclosures + cur_node;
3158 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3159 if (outside_node == -1)
3161 /* There are no problematic nodes, just merge them. */
3162 err = re_node_set_merge (&new_nodes, eclosure);
3163 if (BE (err != REG_NOERROR, 0))
3165 re_node_set_free (&new_nodes);
3166 return err;
3169 else
3171 /* There are problematic nodes, re-calculate incrementally. */
3172 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3173 ex_subexp, type);
3174 if (BE (err != REG_NOERROR, 0))
3176 re_node_set_free (&new_nodes);
3177 return err;
3181 re_node_set_free (cur_nodes);
3182 *cur_nodes = new_nodes;
3183 return REG_NOERROR;
3186 /* Helper function for check_arrival_expand_ecl.
3187 Check incrementally the epsilon closure of TARGET, and if it isn't
3188 problematic append it to DST_NODES. */
3190 static reg_errcode_t
3191 internal_function __attribute_warn_unused_result__
3192 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3193 int target, int ex_subexp, int type)
3195 int cur_node;
3196 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3198 int err;
3200 if (dfa->nodes[cur_node].type == type
3201 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3203 if (type == OP_CLOSE_SUBEXP)
3205 err = re_node_set_insert (dst_nodes, cur_node);
3206 if (BE (err == -1, 0))
3207 return REG_ESPACE;
3209 break;
3211 err = re_node_set_insert (dst_nodes, cur_node);
3212 if (BE (err == -1, 0))
3213 return REG_ESPACE;
3214 if (dfa->edests[cur_node].nelem == 0)
3215 break;
3216 if (dfa->edests[cur_node].nelem == 2)
3218 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3219 dfa->edests[cur_node].elems[1],
3220 ex_subexp, type);
3221 if (BE (err != REG_NOERROR, 0))
3222 return err;
3224 cur_node = dfa->edests[cur_node].elems[0];
3226 return REG_NOERROR;
3230 /* For all the back references in the current state, calculate the
3231 destination of the back references by the appropriate entry
3232 in MCTX->BKREF_ENTS. */
3234 static reg_errcode_t
3235 internal_function __attribute_warn_unused_result__
3236 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3237 int cur_str, int subexp_num, int type)
3239 const re_dfa_t *const dfa = mctx->dfa;
3240 reg_errcode_t err;
3241 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3242 struct re_backref_cache_entry *ent;
3244 if (cache_idx_start == -1)
3245 return REG_NOERROR;
3247 restart:
3248 ent = mctx->bkref_ents + cache_idx_start;
3251 int to_idx, next_node;
3253 /* Is this entry ENT is appropriate? */
3254 if (!re_node_set_contains (cur_nodes, ent->node))
3255 continue; /* No. */
3257 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3258 /* Calculate the destination of the back reference, and append it
3259 to MCTX->STATE_LOG. */
3260 if (to_idx == cur_str)
3262 /* The backreference did epsilon transit, we must re-check all the
3263 node in the current state. */
3264 re_node_set new_dests;
3265 reg_errcode_t err2, err3;
3266 next_node = dfa->edests[ent->node].elems[0];
3267 if (re_node_set_contains (cur_nodes, next_node))
3268 continue;
3269 err = re_node_set_init_1 (&new_dests, next_node);
3270 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3271 err3 = re_node_set_merge (cur_nodes, &new_dests);
3272 re_node_set_free (&new_dests);
3273 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3274 || err3 != REG_NOERROR, 0))
3276 err = (err != REG_NOERROR ? err
3277 : (err2 != REG_NOERROR ? err2 : err3));
3278 return err;
3280 /* TODO: It is still inefficient... */
3281 goto restart;
3283 else
3285 re_node_set union_set;
3286 next_node = dfa->nexts[ent->node];
3287 if (mctx->state_log[to_idx])
3289 int ret;
3290 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3291 next_node))
3292 continue;
3293 err = re_node_set_init_copy (&union_set,
3294 &mctx->state_log[to_idx]->nodes);
3295 ret = re_node_set_insert (&union_set, next_node);
3296 if (BE (err != REG_NOERROR || ret < 0, 0))
3298 re_node_set_free (&union_set);
3299 err = err != REG_NOERROR ? err : REG_ESPACE;
3300 return err;
3303 else
3305 err = re_node_set_init_1 (&union_set, next_node);
3306 if (BE (err != REG_NOERROR, 0))
3307 return err;
3309 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3310 re_node_set_free (&union_set);
3311 if (BE (mctx->state_log[to_idx] == NULL
3312 && err != REG_NOERROR, 0))
3313 return err;
3316 while (ent++->more);
3317 return REG_NOERROR;
3320 /* Build transition table for the state.
3321 Return 1 if succeeded, otherwise return NULL. */
3323 static int
3324 internal_function
3325 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3327 reg_errcode_t err;
3328 int i, j, ch, need_word_trtable = 0;
3329 bitset_word_t elem, mask;
3330 bool dests_node_malloced = false;
3331 bool dest_states_malloced = false;
3332 int ndests; /* Number of the destination states from `state'. */
3333 re_dfastate_t **trtable;
3334 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3335 re_node_set follows, *dests_node;
3336 bitset_t *dests_ch;
3337 bitset_t acceptable;
3339 struct dests_alloc
3341 re_node_set dests_node[SBC_MAX];
3342 bitset_t dests_ch[SBC_MAX];
3343 } *dests_alloc;
3345 /* We build DFA states which corresponds to the destination nodes
3346 from `state'. `dests_node[i]' represents the nodes which i-th
3347 destination state contains, and `dests_ch[i]' represents the
3348 characters which i-th destination state accepts. */
3349 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3350 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3351 else
3353 dests_alloc = re_malloc (struct dests_alloc, 1);
3354 if (BE (dests_alloc == NULL, 0))
3355 return 0;
3356 dests_node_malloced = true;
3358 dests_node = dests_alloc->dests_node;
3359 dests_ch = dests_alloc->dests_ch;
3361 /* Initialize transiton table. */
3362 state->word_trtable = state->trtable = NULL;
3364 /* At first, group all nodes belonging to `state' into several
3365 destinations. */
3366 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3367 if (BE (ndests <= 0, 0))
3369 if (dests_node_malloced)
3370 free (dests_alloc);
3371 /* Return 0 in case of an error, 1 otherwise. */
3372 if (ndests == 0)
3374 state->trtable = (re_dfastate_t **)
3375 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3376 return 1;
3378 return 0;
3381 err = re_node_set_alloc (&follows, ndests + 1);
3382 if (BE (err != REG_NOERROR, 0))
3383 goto out_free;
3385 /* Avoid arithmetic overflow in size calculation. */
3386 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3387 / (3 * sizeof (re_dfastate_t *)))
3388 < ndests),
3390 goto out_free;
3392 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3393 + ndests * 3 * sizeof (re_dfastate_t *)))
3394 dest_states = (re_dfastate_t **)
3395 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3396 else
3398 dest_states = (re_dfastate_t **)
3399 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3400 if (BE (dest_states == NULL, 0))
3402 out_free:
3403 if (dest_states_malloced)
3404 free (dest_states);
3405 re_node_set_free (&follows);
3406 for (i = 0; i < ndests; ++i)
3407 re_node_set_free (dests_node + i);
3408 if (dests_node_malloced)
3409 free (dests_alloc);
3410 return 0;
3412 dest_states_malloced = true;
3414 dest_states_word = dest_states + ndests;
3415 dest_states_nl = dest_states_word + ndests;
3416 bitset_empty (acceptable);
3418 /* Then build the states for all destinations. */
3419 for (i = 0; i < ndests; ++i)
3421 int next_node;
3422 re_node_set_empty (&follows);
3423 /* Merge the follows of this destination states. */
3424 for (j = 0; j < dests_node[i].nelem; ++j)
3426 next_node = dfa->nexts[dests_node[i].elems[j]];
3427 if (next_node != -1)
3429 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3430 if (BE (err != REG_NOERROR, 0))
3431 goto out_free;
3434 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3435 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3436 goto out_free;
3437 /* If the new state has context constraint,
3438 build appropriate states for these contexts. */
3439 if (dest_states[i]->has_constraint)
3441 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3442 CONTEXT_WORD);
3443 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3444 goto out_free;
3446 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3447 need_word_trtable = 1;
3449 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3450 CONTEXT_NEWLINE);
3451 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3452 goto out_free;
3454 else
3456 dest_states_word[i] = dest_states[i];
3457 dest_states_nl[i] = dest_states[i];
3459 bitset_merge (acceptable, dests_ch[i]);
3462 if (!BE (need_word_trtable, 0))
3464 /* We don't care about whether the following character is a word
3465 character, or we are in a single-byte character set so we can
3466 discern by looking at the character code: allocate a
3467 256-entry transition table. */
3468 trtable = state->trtable =
3469 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3470 if (BE (trtable == NULL, 0))
3471 goto out_free;
3473 /* For all characters ch...: */
3474 for (i = 0; i < BITSET_WORDS; ++i)
3475 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3476 elem;
3477 mask <<= 1, elem >>= 1, ++ch)
3478 if (BE (elem & 1, 0))
3480 /* There must be exactly one destination which accepts
3481 character ch. See group_nodes_into_DFAstates. */
3482 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3485 /* j-th destination accepts the word character ch. */
3486 if (dfa->word_char[i] & mask)
3487 trtable[ch] = dest_states_word[j];
3488 else
3489 trtable[ch] = dest_states[j];
3492 else
3494 /* We care about whether the following character is a word
3495 character, and we are in a multi-byte character set: discern
3496 by looking at the character code: build two 256-entry
3497 transition tables, one starting at trtable[0] and one
3498 starting at trtable[SBC_MAX]. */
3499 trtable = state->word_trtable =
3500 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3501 if (BE (trtable == NULL, 0))
3502 goto out_free;
3504 /* For all characters ch...: */
3505 for (i = 0; i < BITSET_WORDS; ++i)
3506 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3507 elem;
3508 mask <<= 1, elem >>= 1, ++ch)
3509 if (BE (elem & 1, 0))
3511 /* There must be exactly one destination which accepts
3512 character ch. See group_nodes_into_DFAstates. */
3513 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3516 /* j-th destination accepts the word character ch. */
3517 trtable[ch] = dest_states[j];
3518 trtable[ch + SBC_MAX] = dest_states_word[j];
3522 /* new line */
3523 if (bitset_contain (acceptable, NEWLINE_CHAR))
3525 /* The current state accepts newline character. */
3526 for (j = 0; j < ndests; ++j)
3527 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3529 /* k-th destination accepts newline character. */
3530 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3531 if (need_word_trtable)
3532 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3533 /* There must be only one destination which accepts
3534 newline. See group_nodes_into_DFAstates. */
3535 break;
3539 if (dest_states_malloced)
3540 free (dest_states);
3542 re_node_set_free (&follows);
3543 for (i = 0; i < ndests; ++i)
3544 re_node_set_free (dests_node + i);
3546 if (dests_node_malloced)
3547 free (dests_alloc);
3549 return 1;
3552 /* Group all nodes belonging to STATE into several destinations.
3553 Then for all destinations, set the nodes belonging to the destination
3554 to DESTS_NODE[i] and set the characters accepted by the destination
3555 to DEST_CH[i]. This function return the number of destinations. */
3557 static int
3558 internal_function
3559 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3560 re_node_set *dests_node, bitset_t *dests_ch)
3562 reg_errcode_t err;
3563 int result;
3564 int i, j, k;
3565 int ndests; /* Number of the destinations from `state'. */
3566 bitset_t accepts; /* Characters a node can accept. */
3567 const re_node_set *cur_nodes = &state->nodes;
3568 bitset_empty (accepts);
3569 ndests = 0;
3571 /* For all the nodes belonging to `state', */
3572 for (i = 0; i < cur_nodes->nelem; ++i)
3574 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3575 re_token_type_t type = node->type;
3576 unsigned int constraint = node->constraint;
3578 /* Enumerate all single byte character this node can accept. */
3579 if (type == CHARACTER)
3580 bitset_set (accepts, node->opr.c);
3581 else if (type == SIMPLE_BRACKET)
3583 bitset_merge (accepts, node->opr.sbcset);
3585 else if (type == OP_PERIOD)
3587 #ifdef RE_ENABLE_I18N
3588 if (dfa->mb_cur_max > 1)
3589 bitset_merge (accepts, dfa->sb_char);
3590 else
3591 #endif
3592 bitset_set_all (accepts);
3593 if (!(dfa->syntax & RE_DOT_NEWLINE))
3594 bitset_clear (accepts, '\n');
3595 if (dfa->syntax & RE_DOT_NOT_NULL)
3596 bitset_clear (accepts, '\0');
3598 #ifdef RE_ENABLE_I18N
3599 else if (type == OP_UTF8_PERIOD)
3601 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3602 if (!(dfa->syntax & RE_DOT_NEWLINE))
3603 bitset_clear (accepts, '\n');
3604 if (dfa->syntax & RE_DOT_NOT_NULL)
3605 bitset_clear (accepts, '\0');
3607 #endif
3608 else
3609 continue;
3611 /* Check the `accepts' and sift the characters which are not
3612 match it the context. */
3613 if (constraint)
3615 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3617 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3618 bitset_empty (accepts);
3619 if (accepts_newline)
3620 bitset_set (accepts, NEWLINE_CHAR);
3621 else
3622 continue;
3624 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3626 bitset_empty (accepts);
3627 continue;
3630 if (constraint & NEXT_WORD_CONSTRAINT)
3632 bitset_word_t any_set = 0;
3633 if (type == CHARACTER && !node->word_char)
3635 bitset_empty (accepts);
3636 continue;
3638 #ifdef RE_ENABLE_I18N
3639 if (dfa->mb_cur_max > 1)
3640 for (j = 0; j < BITSET_WORDS; ++j)
3641 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3642 else
3643 #endif
3644 for (j = 0; j < BITSET_WORDS; ++j)
3645 any_set |= (accepts[j] &= dfa->word_char[j]);
3646 if (!any_set)
3647 continue;
3649 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3651 bitset_word_t any_set = 0;
3652 if (type == CHARACTER && node->word_char)
3654 bitset_empty (accepts);
3655 continue;
3657 #ifdef RE_ENABLE_I18N
3658 if (dfa->mb_cur_max > 1)
3659 for (j = 0; j < BITSET_WORDS; ++j)
3660 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3661 else
3662 #endif
3663 for (j = 0; j < BITSET_WORDS; ++j)
3664 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3665 if (!any_set)
3666 continue;
3670 /* Then divide `accepts' into DFA states, or create a new
3671 state. Above, we make sure that accepts is not empty. */
3672 for (j = 0; j < ndests; ++j)
3674 bitset_t intersec; /* Intersection sets, see below. */
3675 bitset_t remains;
3676 /* Flags, see below. */
3677 bitset_word_t has_intersec, not_subset, not_consumed;
3679 /* Optimization, skip if this state doesn't accept the character. */
3680 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3681 continue;
3683 /* Enumerate the intersection set of this state and `accepts'. */
3684 has_intersec = 0;
3685 for (k = 0; k < BITSET_WORDS; ++k)
3686 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3687 /* And skip if the intersection set is empty. */
3688 if (!has_intersec)
3689 continue;
3691 /* Then check if this state is a subset of `accepts'. */
3692 not_subset = not_consumed = 0;
3693 for (k = 0; k < BITSET_WORDS; ++k)
3695 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3696 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3699 /* If this state isn't a subset of `accepts', create a
3700 new group state, which has the `remains'. */
3701 if (not_subset)
3703 bitset_copy (dests_ch[ndests], remains);
3704 bitset_copy (dests_ch[j], intersec);
3705 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3706 if (BE (err != REG_NOERROR, 0))
3707 goto error_return;
3708 ++ndests;
3711 /* Put the position in the current group. */
3712 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3713 if (BE (result < 0, 0))
3714 goto error_return;
3716 /* If all characters are consumed, go to next node. */
3717 if (!not_consumed)
3718 break;
3720 /* Some characters remain, create a new group. */
3721 if (j == ndests)
3723 bitset_copy (dests_ch[ndests], accepts);
3724 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3725 if (BE (err != REG_NOERROR, 0))
3726 goto error_return;
3727 ++ndests;
3728 bitset_empty (accepts);
3731 return ndests;
3732 error_return:
3733 for (j = 0; j < ndests; ++j)
3734 re_node_set_free (dests_node + j);
3735 return -1;
3738 #ifdef RE_ENABLE_I18N
3739 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740 Return the number of the bytes the node accepts.
3741 STR_IDX is the current index of the input string.
3743 This function handles the nodes which can accept one character, or
3744 one collating element like '.', '[a-z]', opposite to the other nodes
3745 can only accept one byte. */
3747 static int
3748 internal_function
3749 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3750 const re_string_t *input, int str_idx)
3752 const re_token_t *node = dfa->nodes + node_idx;
3753 int char_len, elem_len;
3754 int i;
3756 if (BE (node->type == OP_UTF8_PERIOD, 0))
3758 unsigned char c = re_string_byte_at (input, str_idx), d;
3759 if (BE (c < 0xc2, 1))
3760 return 0;
3762 if (str_idx + 2 > input->len)
3763 return 0;
3765 d = re_string_byte_at (input, str_idx + 1);
3766 if (c < 0xe0)
3767 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3768 else if (c < 0xf0)
3770 char_len = 3;
3771 if (c == 0xe0 && d < 0xa0)
3772 return 0;
3774 else if (c < 0xf8)
3776 char_len = 4;
3777 if (c == 0xf0 && d < 0x90)
3778 return 0;
3780 else if (c < 0xfc)
3782 char_len = 5;
3783 if (c == 0xf8 && d < 0x88)
3784 return 0;
3786 else if (c < 0xfe)
3788 char_len = 6;
3789 if (c == 0xfc && d < 0x84)
3790 return 0;
3792 else
3793 return 0;
3795 if (str_idx + char_len > input->len)
3796 return 0;
3798 for (i = 1; i < char_len; ++i)
3800 d = re_string_byte_at (input, str_idx + i);
3801 if (d < 0x80 || d > 0xbf)
3802 return 0;
3804 return char_len;
3807 char_len = re_string_char_size_at (input, str_idx);
3808 if (node->type == OP_PERIOD)
3810 if (char_len <= 1)
3811 return 0;
3812 /* FIXME: I don't think this if is needed, as both '\n'
3813 and '\0' are char_len == 1. */
3814 /* '.' accepts any one character except the following two cases. */
3815 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3816 re_string_byte_at (input, str_idx) == '\n') ||
3817 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3818 re_string_byte_at (input, str_idx) == '\0'))
3819 return 0;
3820 return char_len;
3823 elem_len = re_string_elem_size_at (input, str_idx);
3824 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3825 return 0;
3827 if (node->type == COMPLEX_BRACKET)
3829 const re_charset_t *cset = node->opr.mbcset;
3830 # ifdef _LIBC
3831 const unsigned char *pin
3832 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3833 int j;
3834 uint32_t nrules;
3835 # endif /* _LIBC */
3836 int match_len = 0;
3837 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3838 ? re_string_wchar_at (input, str_idx) : 0);
3840 /* match with multibyte character? */
3841 for (i = 0; i < cset->nmbchars; ++i)
3842 if (wc == cset->mbchars[i])
3844 match_len = char_len;
3845 goto check_node_accept_bytes_match;
3847 /* match with character_class? */
3848 for (i = 0; i < cset->nchar_classes; ++i)
3850 wctype_t wt = cset->char_classes[i];
3851 if (__iswctype (wc, wt))
3853 match_len = char_len;
3854 goto check_node_accept_bytes_match;
3858 # ifdef _LIBC
3859 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3860 if (nrules != 0)
3862 unsigned int in_collseq = 0;
3863 const int32_t *table, *indirect;
3864 const unsigned char *weights, *extra;
3865 const char *collseqwc;
3866 /* This #include defines a local function! */
3867 # include <locale/weight.h>
3869 /* match with collating_symbol? */
3870 if (cset->ncoll_syms)
3871 extra = (const unsigned char *)
3872 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3873 for (i = 0; i < cset->ncoll_syms; ++i)
3875 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3876 /* Compare the length of input collating element and
3877 the length of current collating element. */
3878 if (*coll_sym != elem_len)
3879 continue;
3880 /* Compare each bytes. */
3881 for (j = 0; j < *coll_sym; j++)
3882 if (pin[j] != coll_sym[1 + j])
3883 break;
3884 if (j == *coll_sym)
3886 /* Match if every bytes is equal. */
3887 match_len = j;
3888 goto check_node_accept_bytes_match;
3892 if (cset->nranges)
3894 if (elem_len <= char_len)
3896 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3897 in_collseq = __collseq_table_lookup (collseqwc, wc);
3899 else
3900 in_collseq = find_collation_sequence_value (pin, elem_len);
3902 /* match with range expression? */
3903 for (i = 0; i < cset->nranges; ++i)
3904 if (cset->range_starts[i] <= in_collseq
3905 && in_collseq <= cset->range_ends[i])
3907 match_len = elem_len;
3908 goto check_node_accept_bytes_match;
3911 /* match with equivalence_class? */
3912 if (cset->nequiv_classes)
3914 const unsigned char *cp = pin;
3915 table = (const int32_t *)
3916 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3917 weights = (const unsigned char *)
3918 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3919 extra = (const unsigned char *)
3920 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3921 indirect = (const int32_t *)
3922 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3923 int32_t idx = findidx (&cp);
3924 if (idx > 0)
3925 for (i = 0; i < cset->nequiv_classes; ++i)
3927 int32_t equiv_class_idx = cset->equiv_classes[i];
3928 size_t weight_len = weights[idx & 0xffffff];
3929 if (weight_len == weights[equiv_class_idx & 0xffffff]
3930 && (idx >> 24) == (equiv_class_idx >> 24))
3932 int cnt = 0;
3934 idx &= 0xffffff;
3935 equiv_class_idx &= 0xffffff;
3937 while (cnt <= weight_len
3938 && (weights[equiv_class_idx + 1 + cnt]
3939 == weights[idx + 1 + cnt]))
3940 ++cnt;
3941 if (cnt > weight_len)
3943 match_len = elem_len;
3944 goto check_node_accept_bytes_match;
3950 else
3951 # endif /* _LIBC */
3953 /* match with range expression? */
3954 #if __GNUC__ >= 2
3955 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3956 #else
3957 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3958 cmp_buf[2] = wc;
3959 #endif
3960 for (i = 0; i < cset->nranges; ++i)
3962 cmp_buf[0] = cset->range_starts[i];
3963 cmp_buf[4] = cset->range_ends[i];
3964 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3965 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3967 match_len = char_len;
3968 goto check_node_accept_bytes_match;
3972 check_node_accept_bytes_match:
3973 if (!cset->non_match)
3974 return match_len;
3975 else
3977 if (match_len > 0)
3978 return 0;
3979 else
3980 return (elem_len > char_len) ? elem_len : char_len;
3983 return 0;
3986 # ifdef _LIBC
3987 static unsigned int
3988 internal_function
3989 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3991 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3992 if (nrules == 0)
3994 if (mbs_len == 1)
3996 /* No valid character. Match it as a single byte character. */
3997 const unsigned char *collseq = (const unsigned char *)
3998 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3999 return collseq[mbs[0]];
4001 return UINT_MAX;
4003 else
4005 int32_t idx;
4006 const unsigned char *extra = (const unsigned char *)
4007 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4008 int32_t extrasize = (const unsigned char *)
4009 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4011 for (idx = 0; idx < extrasize;)
4013 int mbs_cnt, found = 0;
4014 int32_t elem_mbs_len;
4015 /* Skip the name of collating element name. */
4016 idx = idx + extra[idx] + 1;
4017 elem_mbs_len = extra[idx++];
4018 if (mbs_len == elem_mbs_len)
4020 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4021 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4022 break;
4023 if (mbs_cnt == elem_mbs_len)
4024 /* Found the entry. */
4025 found = 1;
4027 /* Skip the byte sequence of the collating element. */
4028 idx += elem_mbs_len;
4029 /* Adjust for the alignment. */
4030 idx = (idx + 3) & ~3;
4031 /* Skip the collation sequence value. */
4032 idx += sizeof (uint32_t);
4033 /* Skip the wide char sequence of the collating element. */
4034 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4035 /* If we found the entry, return the sequence value. */
4036 if (found)
4037 return *(uint32_t *) (extra + idx);
4038 /* Skip the collation sequence value. */
4039 idx += sizeof (uint32_t);
4041 return UINT_MAX;
4044 # endif /* _LIBC */
4045 #endif /* RE_ENABLE_I18N */
4047 /* Check whether the node accepts the byte which is IDX-th
4048 byte of the INPUT. */
4050 static int
4051 internal_function
4052 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4053 int idx)
4055 unsigned char ch;
4056 ch = re_string_byte_at (&mctx->input, idx);
4057 switch (node->type)
4059 case CHARACTER:
4060 if (node->opr.c != ch)
4061 return 0;
4062 break;
4064 case SIMPLE_BRACKET:
4065 if (!bitset_contain (node->opr.sbcset, ch))
4066 return 0;
4067 break;
4069 #ifdef RE_ENABLE_I18N
4070 case OP_UTF8_PERIOD:
4071 if (ch >= 0x80)
4072 return 0;
4073 /* FALLTHROUGH */
4074 #endif
4075 case OP_PERIOD:
4076 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4077 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4078 return 0;
4079 break;
4081 default:
4082 return 0;
4085 if (node->constraint)
4087 /* The node has constraints. Check whether the current context
4088 satisfies the constraints. */
4089 unsigned int context = re_string_context_at (&mctx->input, idx,
4090 mctx->eflags);
4091 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4092 return 0;
4095 return 1;
4098 /* Extend the buffers, if the buffers have run out. */
4100 static reg_errcode_t
4101 internal_function __attribute_warn_unused_result__
4102 extend_buffers (re_match_context_t *mctx)
4104 reg_errcode_t ret;
4105 re_string_t *pstr = &mctx->input;
4107 /* Avoid overflow. */
4108 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4109 return REG_ESPACE;
4111 /* Double the lengthes of the buffers. */
4112 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4113 if (BE (ret != REG_NOERROR, 0))
4114 return ret;
4116 if (mctx->state_log != NULL)
4118 /* And double the length of state_log. */
4119 /* XXX We have no indication of the size of this buffer. If this
4120 allocation fail we have no indication that the state_log array
4121 does not have the right size. */
4122 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4123 pstr->bufs_len + 1);
4124 if (BE (new_array == NULL, 0))
4125 return REG_ESPACE;
4126 mctx->state_log = new_array;
4129 /* Then reconstruct the buffers. */
4130 if (pstr->icase)
4132 #ifdef RE_ENABLE_I18N
4133 if (pstr->mb_cur_max > 1)
4135 ret = build_wcs_upper_buffer (pstr);
4136 if (BE (ret != REG_NOERROR, 0))
4137 return ret;
4139 else
4140 #endif /* RE_ENABLE_I18N */
4141 build_upper_buffer (pstr);
4143 else
4145 #ifdef RE_ENABLE_I18N
4146 if (pstr->mb_cur_max > 1)
4147 build_wcs_buffer (pstr);
4148 else
4149 #endif /* RE_ENABLE_I18N */
4151 if (pstr->trans != NULL)
4152 re_string_translate_buffer (pstr);
4155 return REG_NOERROR;
4159 /* Functions for matching context. */
4161 /* Initialize MCTX. */
4163 static reg_errcode_t
4164 internal_function __attribute_warn_unused_result__
4165 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4167 mctx->eflags = eflags;
4168 mctx->match_last = -1;
4169 if (n > 0)
4171 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4172 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4173 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4174 return REG_ESPACE;
4176 /* Already zero-ed by the caller.
4177 else
4178 mctx->bkref_ents = NULL;
4179 mctx->nbkref_ents = 0;
4180 mctx->nsub_tops = 0; */
4181 mctx->abkref_ents = n;
4182 mctx->max_mb_elem_len = 1;
4183 mctx->asub_tops = n;
4184 return REG_NOERROR;
4187 /* Clean the entries which depend on the current input in MCTX.
4188 This function must be invoked when the matcher changes the start index
4189 of the input, or changes the input string. */
4191 static void
4192 internal_function
4193 match_ctx_clean (re_match_context_t *mctx)
4195 int st_idx;
4196 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4198 int sl_idx;
4199 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4200 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4202 re_sub_match_last_t *last = top->lasts[sl_idx];
4203 re_free (last->path.array);
4204 re_free (last);
4206 re_free (top->lasts);
4207 if (top->path)
4209 re_free (top->path->array);
4210 re_free (top->path);
4212 free (top);
4215 mctx->nsub_tops = 0;
4216 mctx->nbkref_ents = 0;
4219 /* Free all the memory associated with MCTX. */
4221 static void
4222 internal_function
4223 match_ctx_free (re_match_context_t *mctx)
4225 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4226 match_ctx_clean (mctx);
4227 re_free (mctx->sub_tops);
4228 re_free (mctx->bkref_ents);
4231 /* Add a new backreference entry to MCTX.
4232 Note that we assume that caller never call this function with duplicate
4233 entry, and call with STR_IDX which isn't smaller than any existing entry.
4236 static reg_errcode_t
4237 internal_function __attribute_warn_unused_result__
4238 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4239 int to)
4241 if (mctx->nbkref_ents >= mctx->abkref_ents)
4243 struct re_backref_cache_entry* new_entry;
4244 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4245 mctx->abkref_ents * 2);
4246 if (BE (new_entry == NULL, 0))
4248 re_free (mctx->bkref_ents);
4249 return REG_ESPACE;
4251 mctx->bkref_ents = new_entry;
4252 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4253 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4254 mctx->abkref_ents *= 2;
4256 if (mctx->nbkref_ents > 0
4257 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4258 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4260 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4261 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4262 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4263 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4265 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4266 If bit N is clear, means that this entry won't epsilon-transition to
4267 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4268 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4269 such node.
4271 A backreference does not epsilon-transition unless it is empty, so set
4272 to all zeros if FROM != TO. */
4273 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4274 = (from == to ? ~0 : 0);
4276 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4277 if (mctx->max_mb_elem_len < to - from)
4278 mctx->max_mb_elem_len = to - from;
4279 return REG_NOERROR;
4282 /* Search for the first entry which has the same str_idx, or -1 if none is
4283 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4285 static int
4286 internal_function
4287 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4289 int left, right, mid, last;
4290 last = right = mctx->nbkref_ents;
4291 for (left = 0; left < right;)
4293 mid = (left + right) / 2;
4294 if (mctx->bkref_ents[mid].str_idx < str_idx)
4295 left = mid + 1;
4296 else
4297 right = mid;
4299 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4300 return left;
4301 else
4302 return -1;
4305 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4306 at STR_IDX. */
4308 static reg_errcode_t
4309 internal_function __attribute_warn_unused_result__
4310 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4312 #ifdef DEBUG
4313 assert (mctx->sub_tops != NULL);
4314 assert (mctx->asub_tops > 0);
4315 #endif
4316 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4318 int new_asub_tops = mctx->asub_tops * 2;
4319 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4320 re_sub_match_top_t *,
4321 new_asub_tops);
4322 if (BE (new_array == NULL, 0))
4323 return REG_ESPACE;
4324 mctx->sub_tops = new_array;
4325 mctx->asub_tops = new_asub_tops;
4327 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4328 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4329 return REG_ESPACE;
4330 mctx->sub_tops[mctx->nsub_tops]->node = node;
4331 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4332 return REG_NOERROR;
4335 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4336 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4338 static re_sub_match_last_t *
4339 internal_function
4340 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4342 re_sub_match_last_t *new_entry;
4343 if (BE (subtop->nlasts == subtop->alasts, 0))
4345 int new_alasts = 2 * subtop->alasts + 1;
4346 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4347 re_sub_match_last_t *,
4348 new_alasts);
4349 if (BE (new_array == NULL, 0))
4350 return NULL;
4351 subtop->lasts = new_array;
4352 subtop->alasts = new_alasts;
4354 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4355 if (BE (new_entry != NULL, 1))
4357 subtop->lasts[subtop->nlasts] = new_entry;
4358 new_entry->node = node;
4359 new_entry->str_idx = str_idx;
4360 ++subtop->nlasts;
4362 return new_entry;
4365 static void
4366 internal_function
4367 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4368 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4370 sctx->sifted_states = sifted_sts;
4371 sctx->limited_states = limited_sts;
4372 sctx->last_node = last_node;
4373 sctx->last_str_idx = last_str_idx;
4374 re_node_set_init_empty (&sctx->limits);