* sysdeps/x86_64/fpu/libm-test-ulps: Adjust expected atan2f results.
[glibc.git] / posix / regexec.c
blobe635261d05d62005f76b26601a5841aa1fa81f70
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65 static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71 static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81 static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 #ifdef RE_ENABLE_I18N
88 static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178 static int build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183 # ifdef _LIBC
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186 # endif /* _LIBC */
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192 static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
199 string STRING.
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg, string, nmatch, pmatch, eflags)
214 const regex_t *__restrict preg;
215 const char *__restrict string;
216 size_t nmatch;
217 regmatch_t pmatch[];
218 int eflags;
220 reg_errcode_t err;
221 int start, length;
222 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
224 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
225 return REG_BADPAT;
227 if (eflags & REG_STARTEND)
229 start = pmatch[0].rm_so;
230 length = pmatch[0].rm_eo;
232 else
234 start = 0;
235 length = strlen (string);
238 __libc_lock_lock (dfa->lock);
239 if (preg->no_sub)
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, 0, NULL, eflags);
242 else
243 err = re_search_internal (preg, string, length, start, length - start,
244 length, nmatch, pmatch, eflags);
245 __libc_lock_unlock (dfa->lock);
246 return err != REG_NOERROR;
249 #ifdef _LIBC
250 # include <shlib-compat.h>
251 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
253 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
254 __typeof__ (__regexec) __compat_regexec;
257 attribute_compat_text_section
258 __compat_regexec (const regex_t *__restrict preg,
259 const char *__restrict string, size_t nmatch,
260 regmatch_t pmatch[], int eflags)
262 return regexec (preg, string, nmatch, pmatch,
263 eflags & (REG_NOTBOL | REG_NOTEOL));
265 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
266 # endif
267 #endif
269 /* Entry points for GNU code. */
271 /* re_match, re_search, re_match_2, re_search_2
273 The former two functions operate on STRING with length LENGTH,
274 while the later two operate on concatenation of STRING1 and STRING2
275 with lengths LENGTH1 and LENGTH2, respectively.
277 re_match() matches the compiled pattern in BUFP against the string,
278 starting at index START.
280 re_search() first tries matching at index START, then it tries to match
281 starting from index START + 1, and so on. The last start position tried
282 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
283 way as re_match().)
285 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
286 the first STOP characters of the concatenation of the strings should be
287 concerned.
289 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
290 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
291 computed relative to the concatenation, not relative to the individual
292 strings.)
294 On success, re_match* functions return the length of the match, re_search*
295 return the position of the start of the match. Return value -1 means no
296 match was found and -2 indicates an internal error. */
299 re_match (bufp, string, length, start, regs)
300 struct re_pattern_buffer *bufp;
301 const char *string;
302 int length, start;
303 struct re_registers *regs;
305 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
307 #ifdef _LIBC
308 weak_alias (__re_match, re_match)
309 #endif
312 re_search (bufp, string, length, start, range, regs)
313 struct re_pattern_buffer *bufp;
314 const char *string;
315 int length, start, range;
316 struct re_registers *regs;
318 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
320 #ifdef _LIBC
321 weak_alias (__re_search, re_search)
322 #endif
325 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
326 struct re_pattern_buffer *bufp;
327 const char *string1, *string2;
328 int length1, length2, start, stop;
329 struct re_registers *regs;
331 return re_search_2_stub (bufp, string1, length1, string2, length2,
332 start, 0, regs, stop, 1);
334 #ifdef _LIBC
335 weak_alias (__re_match_2, re_match_2)
336 #endif
339 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
340 struct re_pattern_buffer *bufp;
341 const char *string1, *string2;
342 int length1, length2, start, range, stop;
343 struct re_registers *regs;
345 return re_search_2_stub (bufp, string1, length1, string2, length2,
346 start, range, regs, stop, 0);
348 #ifdef _LIBC
349 weak_alias (__re_search_2, re_search_2)
350 #endif
352 static int
353 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
354 stop, ret_len)
355 struct re_pattern_buffer *bufp;
356 const char *string1, *string2;
357 int length1, length2, start, range, stop, ret_len;
358 struct re_registers *regs;
360 const char *str;
361 int rval;
362 int len = length1 + length2;
363 int free_str = 0;
365 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
366 return -2;
368 /* Concatenate the strings. */
369 if (length2 > 0)
370 if (length1 > 0)
372 char *s = re_malloc (char, len);
374 if (BE (s == NULL, 0))
375 return -2;
376 memcpy (s, string1, length1);
377 memcpy (s + length1, string2, length2);
378 str = s;
379 free_str = 1;
381 else
382 str = string2;
383 else
384 str = string1;
386 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
387 ret_len);
388 if (free_str)
389 re_free ((char *) str);
390 return rval;
393 /* The parameters have the same meaning as those of re_search.
394 Additional parameters:
395 If RET_LEN is nonzero the length of the match is returned (re_match style);
396 otherwise the position of the match is returned. */
398 static int
399 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
400 struct re_pattern_buffer *bufp;
401 const char *string;
402 int length, start, range, stop, ret_len;
403 struct re_registers *regs;
405 reg_errcode_t result;
406 regmatch_t *pmatch;
407 int nregs, rval;
408 int eflags = 0;
409 re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
411 /* Check for out-of-range. */
412 if (BE (start < 0 || start > length, 0))
413 return -1;
414 if (BE (start + range > length, 0))
415 range = length - start;
416 else if (BE (start + range < 0, 0))
417 range = -start;
419 __libc_lock_lock (dfa->lock);
421 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
422 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
424 /* Compile fastmap if we haven't yet. */
425 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
426 re_compile_fastmap (bufp);
428 if (BE (bufp->no_sub, 0))
429 regs = NULL;
431 /* We need at least 1 register. */
432 if (regs == NULL)
433 nregs = 1;
434 else if (BE (bufp->regs_allocated == REGS_FIXED &&
435 regs->num_regs < bufp->re_nsub + 1, 0))
437 nregs = regs->num_regs;
438 if (BE (nregs < 1, 0))
440 /* Nothing can be copied to regs. */
441 regs = NULL;
442 nregs = 1;
445 else
446 nregs = bufp->re_nsub + 1;
447 pmatch = re_malloc (regmatch_t, nregs);
448 if (BE (pmatch == NULL, 0))
450 rval = -2;
451 goto out;
454 result = re_search_internal (bufp, string, length, start, range, stop,
455 nregs, pmatch, eflags);
457 rval = 0;
459 /* I hope we needn't fill ther regs with -1's when no match was found. */
460 if (result != REG_NOERROR)
461 rval = -1;
462 else if (regs != NULL)
464 /* If caller wants register contents data back, copy them. */
465 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
466 bufp->regs_allocated);
467 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
468 rval = -2;
471 if (BE (rval == 0, 1))
473 if (ret_len)
475 assert (pmatch[0].rm_so == start);
476 rval = pmatch[0].rm_eo - start;
478 else
479 rval = pmatch[0].rm_so;
481 re_free (pmatch);
482 out:
483 __libc_lock_unlock (dfa->lock);
484 return rval;
487 static unsigned
488 re_copy_regs (regs, pmatch, nregs, regs_allocated)
489 struct re_registers *regs;
490 regmatch_t *pmatch;
491 int nregs, regs_allocated;
493 int rval = REGS_REALLOCATE;
494 int i;
495 int need_regs = nregs + 1;
496 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
497 uses. */
499 /* Have the register data arrays been allocated? */
500 if (regs_allocated == REGS_UNALLOCATED)
501 { /* No. So allocate them with malloc. */
502 regs->start = re_malloc (regoff_t, need_regs);
503 regs->end = re_malloc (regoff_t, need_regs);
504 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
505 return REGS_UNALLOCATED;
506 regs->num_regs = need_regs;
508 else if (regs_allocated == REGS_REALLOCATE)
509 { /* Yes. If we need more elements than were already
510 allocated, reallocate them. If we need fewer, just
511 leave it alone. */
512 if (BE (need_regs > regs->num_regs, 0))
514 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
515 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
516 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
517 return REGS_UNALLOCATED;
518 regs->start = new_start;
519 regs->end = new_end;
520 regs->num_regs = need_regs;
523 else
525 assert (regs_allocated == REGS_FIXED);
526 /* This function may not be called with REGS_FIXED and nregs too big. */
527 assert (regs->num_regs >= nregs);
528 rval = REGS_FIXED;
531 /* Copy the regs. */
532 for (i = 0; i < nregs; ++i)
534 regs->start[i] = pmatch[i].rm_so;
535 regs->end[i] = pmatch[i].rm_eo;
537 for ( ; i < regs->num_regs; ++i)
538 regs->start[i] = regs->end[i] = -1;
540 return rval;
543 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
544 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
545 this memory for recording register information. STARTS and ENDS
546 must be allocated using the malloc library routine, and must each
547 be at least NUM_REGS * sizeof (regoff_t) bytes long.
549 If NUM_REGS == 0, then subsequent matches should allocate their own
550 register data.
552 Unless this function is called, the first search or match using
553 PATTERN_BUFFER will allocate its own register data, without
554 freeing the old data. */
556 void
557 re_set_registers (bufp, regs, num_regs, starts, ends)
558 struct re_pattern_buffer *bufp;
559 struct re_registers *regs;
560 unsigned num_regs;
561 regoff_t *starts, *ends;
563 if (num_regs)
565 bufp->regs_allocated = REGS_REALLOCATE;
566 regs->num_regs = num_regs;
567 regs->start = starts;
568 regs->end = ends;
570 else
572 bufp->regs_allocated = REGS_UNALLOCATED;
573 regs->num_regs = 0;
574 regs->start = regs->end = (regoff_t *) 0;
577 #ifdef _LIBC
578 weak_alias (__re_set_registers, re_set_registers)
579 #endif
581 /* Entry points compatible with 4.2 BSD regex library. We don't define
582 them unless specifically requested. */
584 #if defined _REGEX_RE_COMP || defined _LIBC
586 # ifdef _LIBC
587 weak_function
588 # endif
589 re_exec (s)
590 const char *s;
592 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
594 #endif /* _REGEX_RE_COMP */
596 /* Internal entry point. */
598 /* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
601 with re_search.
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
607 static reg_errcode_t
608 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
609 eflags)
610 const regex_t *preg;
611 const char *string;
612 int length, start, range, stop, eflags;
613 size_t nmatch;
614 regmatch_t pmatch[];
616 reg_errcode_t err;
617 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
618 int left_lim, right_lim, incr;
619 int fl_longest_match, match_first, match_kind, match_last = -1;
620 int extra_nmatch;
621 int sb, ch;
622 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
623 re_match_context_t mctx = { .dfa = dfa };
624 #else
625 re_match_context_t mctx;
626 #endif
627 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
628 && range && !preg->can_be_null) ? preg->fastmap : NULL;
629 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
631 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
632 memset (&mctx, '\0', sizeof (re_match_context_t));
633 mctx.dfa = dfa;
634 #endif
636 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
637 nmatch -= extra_nmatch;
639 /* Check if the DFA haven't been compiled. */
640 if (BE (preg->used == 0 || dfa->init_state == NULL
641 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
642 || dfa->init_state_begbuf == NULL, 0))
643 return REG_NOMATCH;
645 #ifdef DEBUG
646 /* We assume front-end functions already check them. */
647 assert (start + range >= 0 && start + range <= length);
648 #endif
650 /* If initial states with non-begbuf contexts have no elements,
651 the regex must be anchored. If preg->newline_anchor is set,
652 we'll never use init_state_nl, so do not check it. */
653 if (dfa->init_state->nodes.nelem == 0
654 && dfa->init_state_word->nodes.nelem == 0
655 && (dfa->init_state_nl->nodes.nelem == 0
656 || !preg->newline_anchor))
658 if (start != 0 && start + range != 0)
659 return REG_NOMATCH;
660 start = range = 0;
663 /* We must check the longest matching, if nmatch > 0. */
664 fl_longest_match = (nmatch != 0 || dfa->nbackref);
666 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
667 preg->translate, preg->syntax & RE_ICASE, dfa);
668 if (BE (err != REG_NOERROR, 0))
669 goto free_return;
670 mctx.input.stop = stop;
671 mctx.input.raw_stop = stop;
672 mctx.input.newline_anchor = preg->newline_anchor;
674 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
675 if (BE (err != REG_NOERROR, 0))
676 goto free_return;
678 /* We will log all the DFA states through which the dfa pass,
679 if nmatch > 1, or this dfa has "multibyte node", which is a
680 back-reference or a node which can accept multibyte character or
681 multi character collating element. */
682 if (nmatch > 1 || dfa->has_mb_node)
684 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
685 if (BE (mctx.state_log == NULL, 0))
687 err = REG_ESPACE;
688 goto free_return;
691 else
692 mctx.state_log = NULL;
694 match_first = start;
695 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
696 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
698 /* Check incrementally whether of not the input string match. */
699 incr = (range < 0) ? -1 : 1;
700 left_lim = (range < 0) ? start + range : start;
701 right_lim = (range < 0) ? start : start + range;
702 sb = dfa->mb_cur_max == 1;
703 match_kind =
704 (fastmap
705 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
706 | (range >= 0 ? 2 : 0)
707 | (t != NULL ? 1 : 0))
708 : 8);
710 for (;; match_first += incr)
712 err = REG_NOMATCH;
713 if (match_first < left_lim || right_lim < match_first)
714 goto free_return;
716 /* Advance as rapidly as possible through the string, until we
717 find a plausible place to start matching. This may be done
718 with varying efficiency, so there are various possibilities:
719 only the most common of them are specialized, in order to
720 save on code size. We use a switch statement for speed. */
721 switch (match_kind)
723 case 8:
724 /* No fastmap. */
725 break;
727 case 7:
728 /* Fastmap with single-byte translation, match forward. */
729 while (BE (match_first < right_lim, 1)
730 && !fastmap[t[(unsigned char) string[match_first]]])
731 ++match_first;
732 goto forward_match_found_start_or_reached_end;
734 case 6:
735 /* Fastmap without translation, match forward. */
736 while (BE (match_first < right_lim, 1)
737 && !fastmap[(unsigned char) string[match_first]])
738 ++match_first;
740 forward_match_found_start_or_reached_end:
741 if (BE (match_first == right_lim, 0))
743 ch = match_first >= length
744 ? 0 : (unsigned char) string[match_first];
745 if (!fastmap[t ? t[ch] : ch])
746 goto free_return;
748 break;
750 case 4:
751 case 5:
752 /* Fastmap without multi-byte translation, match backwards. */
753 while (match_first >= left_lim)
755 ch = match_first >= length
756 ? 0 : (unsigned char) string[match_first];
757 if (fastmap[t ? t[ch] : ch])
758 break;
759 --match_first;
761 if (match_first < left_lim)
762 goto free_return;
763 break;
765 default:
766 /* In this case, we can't determine easily the current byte,
767 since it might be a component byte of a multibyte
768 character. Then we use the constructed buffer instead. */
769 for (;;)
771 /* If MATCH_FIRST is out of the valid range, reconstruct the
772 buffers. */
773 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
774 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
776 err = re_string_reconstruct (&mctx.input, match_first,
777 eflags);
778 if (BE (err != REG_NOERROR, 0))
779 goto free_return;
781 offset = match_first - mctx.input.raw_mbs_idx;
783 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
784 Note that MATCH_FIRST must not be smaller than 0. */
785 ch = (match_first >= length
786 ? 0 : re_string_byte_at (&mctx.input, offset));
787 if (fastmap[ch])
788 break;
789 match_first += incr;
790 if (match_first < left_lim || match_first > right_lim)
792 err = REG_NOMATCH;
793 goto free_return;
796 break;
799 /* Reconstruct the buffers so that the matcher can assume that
800 the matching starts from the beginning of the buffer. */
801 err = re_string_reconstruct (&mctx.input, match_first, eflags);
802 if (BE (err != REG_NOERROR, 0))
803 goto free_return;
805 #ifdef RE_ENABLE_I18N
806 /* Don't consider this char as a possible match start if it part,
807 yet isn't the head, of a multibyte character. */
808 if (!sb && !re_string_first_byte (&mctx.input, 0))
809 continue;
810 #endif
812 /* It seems to be appropriate one, then use the matcher. */
813 /* We assume that the matching starts from 0. */
814 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
815 match_last = check_matching (&mctx, fl_longest_match,
816 range >= 0 ? &match_first : NULL);
817 if (match_last != -1)
819 if (BE (match_last == -2, 0))
821 err = REG_ESPACE;
822 goto free_return;
824 else
826 mctx.match_last = match_last;
827 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
829 re_dfastate_t *pstate = mctx.state_log[match_last];
830 mctx.last_node = check_halt_state_context (&mctx, pstate,
831 match_last);
833 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
834 || dfa->nbackref)
836 err = prune_impossible_nodes (&mctx);
837 if (err == REG_NOERROR)
838 break;
839 if (BE (err != REG_NOMATCH, 0))
840 goto free_return;
841 match_last = -1;
843 else
844 break; /* We found a match. */
848 match_ctx_clean (&mctx);
851 #ifdef DEBUG
852 assert (match_last != -1);
853 assert (err == REG_NOERROR);
854 #endif
856 /* Set pmatch[] if we need. */
857 if (nmatch > 0)
859 int reg_idx;
861 /* Initialize registers. */
862 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
863 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
865 /* Set the points where matching start/end. */
866 pmatch[0].rm_so = 0;
867 pmatch[0].rm_eo = mctx.match_last;
869 if (!preg->no_sub && nmatch > 1)
871 err = set_regs (preg, &mctx, nmatch, pmatch,
872 dfa->has_plural_match && dfa->nbackref > 0);
873 if (BE (err != REG_NOERROR, 0))
874 goto free_return;
877 /* At last, add the offset to the each registers, since we slided
878 the buffers so that we could assume that the matching starts
879 from 0. */
880 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
881 if (pmatch[reg_idx].rm_so != -1)
883 #ifdef RE_ENABLE_I18N
884 if (BE (mctx.input.offsets_needed != 0, 0))
886 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
887 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
888 else
889 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
890 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
891 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
892 else
893 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
895 #else
896 assert (mctx.input.offsets_needed == 0);
897 #endif
898 pmatch[reg_idx].rm_so += match_first;
899 pmatch[reg_idx].rm_eo += match_first;
901 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
903 pmatch[nmatch + reg_idx].rm_so = -1;
904 pmatch[nmatch + reg_idx].rm_eo = -1;
907 if (dfa->subexp_map)
908 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
909 if (dfa->subexp_map[reg_idx] != reg_idx)
911 pmatch[reg_idx + 1].rm_so
912 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
913 pmatch[reg_idx + 1].rm_eo
914 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
918 free_return:
919 re_free (mctx.state_log);
920 if (dfa->nbackref)
921 match_ctx_free (&mctx);
922 re_string_destruct (&mctx.input);
923 return err;
926 static reg_errcode_t
927 prune_impossible_nodes (mctx)
928 re_match_context_t *mctx;
930 re_dfa_t *const dfa = mctx->dfa;
931 int halt_node, match_last;
932 reg_errcode_t ret;
933 re_dfastate_t **sifted_states;
934 re_dfastate_t **lim_states = NULL;
935 re_sift_context_t sctx;
936 #ifdef DEBUG
937 assert (mctx->state_log != NULL);
938 #endif
939 match_last = mctx->match_last;
940 halt_node = mctx->last_node;
941 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
942 if (BE (sifted_states == NULL, 0))
944 ret = REG_ESPACE;
945 goto free_return;
947 if (dfa->nbackref)
949 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
950 if (BE (lim_states == NULL, 0))
952 ret = REG_ESPACE;
953 goto free_return;
955 while (1)
957 memset (lim_states, '\0',
958 sizeof (re_dfastate_t *) * (match_last + 1));
959 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
960 match_last);
961 ret = sift_states_backward (mctx, &sctx);
962 re_node_set_free (&sctx.limits);
963 if (BE (ret != REG_NOERROR, 0))
964 goto free_return;
965 if (sifted_states[0] != NULL || lim_states[0] != NULL)
966 break;
969 --match_last;
970 if (match_last < 0)
972 ret = REG_NOMATCH;
973 goto free_return;
975 } while (mctx->state_log[match_last] == NULL
976 || !mctx->state_log[match_last]->halt);
977 halt_node = check_halt_state_context (mctx,
978 mctx->state_log[match_last],
979 match_last);
981 ret = merge_state_array (dfa, sifted_states, lim_states,
982 match_last + 1);
983 re_free (lim_states);
984 lim_states = NULL;
985 if (BE (ret != REG_NOERROR, 0))
986 goto free_return;
988 else
990 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
991 ret = sift_states_backward (mctx, &sctx);
992 re_node_set_free (&sctx.limits);
993 if (BE (ret != REG_NOERROR, 0))
994 goto free_return;
996 re_free (mctx->state_log);
997 mctx->state_log = sifted_states;
998 sifted_states = NULL;
999 mctx->last_node = halt_node;
1000 mctx->match_last = match_last;
1001 ret = REG_NOERROR;
1002 free_return:
1003 re_free (sifted_states);
1004 re_free (lim_states);
1005 return ret;
1008 /* Acquire an initial state and return it.
1009 We must select appropriate initial state depending on the context,
1010 since initial states may have constraints like "\<", "^", etc.. */
1012 static inline re_dfastate_t *
1013 acquire_init_state_context (err, mctx, idx)
1014 reg_errcode_t *err;
1015 const re_match_context_t *mctx;
1016 int idx;
1018 re_dfa_t *const dfa = mctx->dfa;
1019 if (dfa->init_state->has_constraint)
1021 unsigned int context;
1022 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1023 if (IS_WORD_CONTEXT (context))
1024 return dfa->init_state_word;
1025 else if (IS_ORDINARY_CONTEXT (context))
1026 return dfa->init_state;
1027 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1028 return dfa->init_state_begbuf;
1029 else if (IS_NEWLINE_CONTEXT (context))
1030 return dfa->init_state_nl;
1031 else if (IS_BEGBUF_CONTEXT (context))
1033 /* It is relatively rare case, then calculate on demand. */
1034 return re_acquire_state_context (err, dfa,
1035 dfa->init_state->entrance_nodes,
1036 context);
1038 else
1039 /* Must not happen? */
1040 return dfa->init_state;
1042 else
1043 return dfa->init_state;
1046 /* Check whether the regular expression match input string INPUT or not,
1047 and return the index where the matching end, return -1 if not match,
1048 or return -2 in case of an error.
1049 FL_LONGEST_MATCH means we want the POSIX longest matching.
1050 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1051 next place where we may want to try matching.
1052 Note that the matcher assume that the maching starts from the current
1053 index of the buffer. */
1055 static int
1056 check_matching (mctx, fl_longest_match, p_match_first)
1057 re_match_context_t *mctx;
1058 int fl_longest_match;
1059 int *p_match_first;
1061 re_dfa_t *const dfa = mctx->dfa;
1062 reg_errcode_t err;
1063 int match = 0;
1064 int match_last = -1;
1065 int cur_str_idx = re_string_cur_idx (&mctx->input);
1066 re_dfastate_t *cur_state;
1067 int at_init_state = p_match_first != NULL;
1068 int next_start_idx = cur_str_idx;
1070 err = REG_NOERROR;
1071 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1072 /* An initial state must not be NULL (invalid). */
1073 if (BE (cur_state == NULL, 0))
1075 assert (err == REG_ESPACE);
1076 return -2;
1079 if (mctx->state_log != NULL)
1081 mctx->state_log[cur_str_idx] = cur_state;
1083 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1084 later. E.g. Processing back references. */
1085 if (BE (dfa->nbackref, 0))
1087 at_init_state = 0;
1088 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1089 if (BE (err != REG_NOERROR, 0))
1090 return err;
1092 if (cur_state->has_backref)
1094 err = transit_state_bkref (mctx, &cur_state->nodes);
1095 if (BE (err != REG_NOERROR, 0))
1096 return err;
1101 /* If the RE accepts NULL string. */
1102 if (BE (cur_state->halt, 0))
1104 if (!cur_state->has_constraint
1105 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1107 if (!fl_longest_match)
1108 return cur_str_idx;
1109 else
1111 match_last = cur_str_idx;
1112 match = 1;
1117 while (!re_string_eoi (&mctx->input))
1119 re_dfastate_t *old_state = cur_state;
1120 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1122 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1123 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1124 && mctx->input.valid_len < mctx->input.len))
1126 err = extend_buffers (mctx);
1127 if (BE (err != REG_NOERROR, 0))
1129 assert (err == REG_ESPACE);
1130 return -2;
1134 cur_state = transit_state (&err, mctx, cur_state);
1135 if (mctx->state_log != NULL)
1136 cur_state = merge_state_with_log (&err, mctx, cur_state);
1138 if (cur_state == NULL)
1140 /* Reached the invalid state or an error. Try to recover a valid
1141 state using the state log, if available and if we have not
1142 already found a valid (even if not the longest) match. */
1143 if (BE (err != REG_NOERROR, 0))
1144 return -2;
1146 if (mctx->state_log == NULL
1147 || (match && !fl_longest_match)
1148 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1149 break;
1152 if (BE (at_init_state, 0))
1154 if (old_state == cur_state)
1155 next_start_idx = next_char_idx;
1156 else
1157 at_init_state = 0;
1160 if (cur_state->halt)
1162 /* Reached a halt state.
1163 Check the halt state can satisfy the current context. */
1164 if (!cur_state->has_constraint
1165 || check_halt_state_context (mctx, cur_state,
1166 re_string_cur_idx (&mctx->input)))
1168 /* We found an appropriate halt state. */
1169 match_last = re_string_cur_idx (&mctx->input);
1170 match = 1;
1172 /* We found a match, do not modify match_first below. */
1173 p_match_first = NULL;
1174 if (!fl_longest_match)
1175 break;
1180 if (p_match_first)
1181 *p_match_first += next_start_idx;
1183 return match_last;
1186 /* Check NODE match the current context. */
1188 static int check_halt_node_context (dfa, node, context)
1189 const re_dfa_t *dfa;
1190 int node;
1191 unsigned int context;
1193 re_token_type_t type = dfa->nodes[node].type;
1194 unsigned int constraint = dfa->nodes[node].constraint;
1195 if (type != END_OF_RE)
1196 return 0;
1197 if (!constraint)
1198 return 1;
1199 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1200 return 0;
1201 return 1;
1204 /* Check the halt state STATE match the current context.
1205 Return 0 if not match, if the node, STATE has, is a halt node and
1206 match the context, return the node. */
1208 static int
1209 check_halt_state_context (mctx, state, idx)
1210 const re_match_context_t *mctx;
1211 const re_dfastate_t *state;
1212 int idx;
1214 int i;
1215 unsigned int context;
1216 #ifdef DEBUG
1217 assert (state->halt);
1218 #endif
1219 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1220 for (i = 0; i < state->nodes.nelem; ++i)
1221 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1222 return state->nodes.elems[i];
1223 return 0;
1226 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1227 corresponding to the DFA).
1228 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1229 of errors. */
1231 static int
1232 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1233 const re_match_context_t *mctx;
1234 regmatch_t *regs;
1235 int nregs, *pidx, node;
1236 re_node_set *eps_via_nodes;
1237 struct re_fail_stack_t *fs;
1239 re_dfa_t *const dfa = mctx->dfa;
1240 int i, err, dest_node;
1241 dest_node = -1;
1242 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1244 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1245 re_node_set *edests = &dfa->edests[node];
1246 int dest_node;
1247 err = re_node_set_insert (eps_via_nodes, node);
1248 if (BE (err < 0, 0))
1249 return -2;
1250 /* Pick up a valid destination, or return -1 if none is found. */
1251 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1253 int candidate = edests->elems[i];
1254 if (!re_node_set_contains (cur_nodes, candidate))
1255 continue;
1256 if (dest_node == -1)
1257 dest_node = candidate;
1259 else
1261 /* In order to avoid infinite loop like "(a*)*", return the second
1262 epsilon-transition if the first was already considered. */
1263 if (re_node_set_contains (eps_via_nodes, dest_node))
1264 return candidate;
1266 /* Otherwise, push the second epsilon-transition on the fail stack. */
1267 else if (fs != NULL
1268 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1269 eps_via_nodes))
1270 return -2;
1272 /* We know we are going to exit. */
1273 break;
1276 return dest_node;
1278 else
1280 int naccepted = 0;
1281 re_token_type_t type = dfa->nodes[node].type;
1283 #ifdef RE_ENABLE_I18N
1284 if (dfa->nodes[node].accept_mb)
1285 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1286 else
1287 #endif /* RE_ENABLE_I18N */
1288 if (type == OP_BACK_REF)
1290 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1291 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1292 if (fs != NULL)
1294 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1295 return -1;
1296 else if (naccepted)
1298 char *buf = (char *) re_string_get_buffer (&mctx->input);
1299 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1300 naccepted) != 0)
1301 return -1;
1305 if (naccepted == 0)
1307 err = re_node_set_insert (eps_via_nodes, node);
1308 if (BE (err < 0, 0))
1309 return -2;
1310 dest_node = dfa->edests[node].elems[0];
1311 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312 dest_node))
1313 return dest_node;
1317 if (naccepted != 0
1318 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1320 dest_node = dfa->nexts[node];
1321 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1322 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1323 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1324 dest_node)))
1325 return -1;
1326 re_node_set_empty (eps_via_nodes);
1327 return dest_node;
1330 return -1;
1333 static reg_errcode_t
1334 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1335 struct re_fail_stack_t *fs;
1336 int str_idx, dest_node, nregs;
1337 regmatch_t *regs;
1338 re_node_set *eps_via_nodes;
1340 reg_errcode_t err;
1341 int num = fs->num++;
1342 if (fs->num == fs->alloc)
1344 struct re_fail_stack_ent_t *new_array;
1345 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1346 * fs->alloc * 2));
1347 if (new_array == NULL)
1348 return REG_ESPACE;
1349 fs->alloc *= 2;
1350 fs->stack = new_array;
1352 fs->stack[num].idx = str_idx;
1353 fs->stack[num].node = dest_node;
1354 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1355 if (fs->stack[num].regs == NULL)
1356 return REG_ESPACE;
1357 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1358 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1359 return err;
1362 static int
1363 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1364 struct re_fail_stack_t *fs;
1365 int *pidx, nregs;
1366 regmatch_t *regs;
1367 re_node_set *eps_via_nodes;
1369 int num = --fs->num;
1370 assert (num >= 0);
1371 *pidx = fs->stack[num].idx;
1372 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1373 re_node_set_free (eps_via_nodes);
1374 re_free (fs->stack[num].regs);
1375 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1376 return fs->stack[num].node;
1379 /* Set the positions where the subexpressions are starts/ends to registers
1380 PMATCH.
1381 Note: We assume that pmatch[0] is already set, and
1382 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1384 static reg_errcode_t
1385 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1386 const regex_t *preg;
1387 const re_match_context_t *mctx;
1388 size_t nmatch;
1389 regmatch_t *pmatch;
1390 int fl_backtrack;
1392 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1393 int idx, cur_node;
1394 re_node_set eps_via_nodes;
1395 struct re_fail_stack_t *fs;
1396 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1397 regmatch_t *prev_idx_match;
1399 #ifdef DEBUG
1400 assert (nmatch > 1);
1401 assert (mctx->state_log != NULL);
1402 #endif
1403 if (fl_backtrack)
1405 fs = &fs_body;
1406 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1407 if (fs->stack == NULL)
1408 return REG_ESPACE;
1410 else
1411 fs = NULL;
1413 cur_node = dfa->init_node;
1414 re_node_set_init_empty (&eps_via_nodes);
1416 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1417 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1419 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1421 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1423 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1425 int reg_idx;
1426 if (fs)
1428 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1429 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1430 break;
1431 if (reg_idx == nmatch)
1433 re_node_set_free (&eps_via_nodes);
1434 return free_fail_stack_return (fs);
1436 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437 &eps_via_nodes);
1439 else
1441 re_node_set_free (&eps_via_nodes);
1442 return REG_NOERROR;
1446 /* Proceed to next node. */
1447 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1448 &eps_via_nodes, fs);
1450 if (BE (cur_node < 0, 0))
1452 if (BE (cur_node == -2, 0))
1454 re_node_set_free (&eps_via_nodes);
1455 free_fail_stack_return (fs);
1456 return REG_ESPACE;
1458 if (fs)
1459 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1460 &eps_via_nodes);
1461 else
1463 re_node_set_free (&eps_via_nodes);
1464 return REG_NOMATCH;
1468 re_node_set_free (&eps_via_nodes);
1469 return free_fail_stack_return (fs);
1472 static reg_errcode_t
1473 free_fail_stack_return (fs)
1474 struct re_fail_stack_t *fs;
1476 if (fs)
1478 int fs_idx;
1479 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1481 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1482 re_free (fs->stack[fs_idx].regs);
1484 re_free (fs->stack);
1486 return REG_NOERROR;
1489 static void
1490 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1491 re_dfa_t *dfa;
1492 regmatch_t *pmatch, *prev_idx_match;
1493 int cur_node, cur_idx, nmatch;
1495 int type = dfa->nodes[cur_node].type;
1496 if (type == OP_OPEN_SUBEXP)
1498 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1500 /* We are at the first node of this sub expression. */
1501 if (reg_num < nmatch)
1503 pmatch[reg_num].rm_so = cur_idx;
1504 pmatch[reg_num].rm_eo = -1;
1507 else if (type == OP_CLOSE_SUBEXP)
1509 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1510 if (reg_num < nmatch)
1512 /* We are at the last node of this sub expression. */
1513 if (pmatch[reg_num].rm_so < cur_idx)
1515 pmatch[reg_num].rm_eo = cur_idx;
1516 /* This is a non-empty match or we are not inside an optional
1517 subexpression. Accept this right away. */
1518 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1520 else
1522 if (dfa->nodes[cur_node].opt_subexp
1523 && prev_idx_match[reg_num].rm_so != -1)
1524 /* We transited through an empty match for an optional
1525 subexpression, like (a?)*, and this is not the subexp's
1526 first match. Copy back the old content of the registers
1527 so that matches of an inner subexpression are undone as
1528 well, like in ((a?))*. */
1529 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1530 else
1531 /* We completed a subexpression, but it may be part of
1532 an optional one, so do not update PREV_IDX_MATCH. */
1533 pmatch[reg_num].rm_eo = cur_idx;
1539 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1540 and sift the nodes in each states according to the following rules.
1541 Updated state_log will be wrote to STATE_LOG.
1543 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1544 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1545 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1546 the LAST_NODE, we throw away the node `a'.
1547 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1548 string `s' and transit to `b':
1549 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1550 away the node `a'.
1551 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1552 thrown away, we throw away the node `a'.
1553 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1554 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1555 node `a'.
1556 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1557 we throw away the node `a'. */
1559 #define STATE_NODE_CONTAINS(state,node) \
1560 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1562 static reg_errcode_t
1563 sift_states_backward (mctx, sctx)
1564 re_match_context_t *mctx;
1565 re_sift_context_t *sctx;
1567 reg_errcode_t err;
1568 int null_cnt = 0;
1569 int str_idx = sctx->last_str_idx;
1570 re_node_set cur_dest;
1572 #ifdef DEBUG
1573 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1574 #endif
1576 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1577 transit to the last_node and the last_node itself. */
1578 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1579 if (BE (err != REG_NOERROR, 0))
1580 return err;
1581 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1582 if (BE (err != REG_NOERROR, 0))
1583 goto free_return;
1585 /* Then check each states in the state_log. */
1586 while (str_idx > 0)
1588 /* Update counters. */
1589 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1590 if (null_cnt > mctx->max_mb_elem_len)
1592 memset (sctx->sifted_states, '\0',
1593 sizeof (re_dfastate_t *) * str_idx);
1594 re_node_set_free (&cur_dest);
1595 return REG_NOERROR;
1597 re_node_set_empty (&cur_dest);
1598 --str_idx;
1600 if (mctx->state_log[str_idx])
1602 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1603 if (BE (err != REG_NOERROR, 0))
1604 goto free_return;
1607 /* Add all the nodes which satisfy the following conditions:
1608 - It can epsilon transit to a node in CUR_DEST.
1609 - It is in CUR_SRC.
1610 And update state_log. */
1611 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1612 if (BE (err != REG_NOERROR, 0))
1613 goto free_return;
1615 err = REG_NOERROR;
1616 free_return:
1617 re_node_set_free (&cur_dest);
1618 return err;
1621 static reg_errcode_t
1622 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1623 re_match_context_t *mctx;
1624 re_sift_context_t *sctx;
1625 int str_idx;
1626 re_node_set *cur_dest;
1628 re_dfa_t *const dfa = mctx->dfa;
1629 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1630 int i;
1632 /* Then build the next sifted state.
1633 We build the next sifted state on `cur_dest', and update
1634 `sifted_states[str_idx]' with `cur_dest'.
1635 Note:
1636 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1637 `cur_src' points the node_set of the old `state_log[str_idx]'
1638 (with the epsilon nodes pre-filtered out). */
1639 for (i = 0; i < cur_src->nelem; i++)
1641 int prev_node = cur_src->elems[i];
1642 int naccepted = 0;
1643 int ret;
1645 #ifdef DEBUG
1646 re_token_type_t type = dfa->nodes[prev_node].type;
1647 assert (!IS_EPSILON_NODE (type));
1648 #endif
1649 #ifdef RE_ENABLE_I18N
1650 /* If the node may accept `multi byte'. */
1651 if (dfa->nodes[prev_node].accept_mb)
1652 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1653 str_idx, sctx->last_str_idx);
1654 #endif /* RE_ENABLE_I18N */
1656 /* We don't check backreferences here.
1657 See update_cur_sifted_state(). */
1658 if (!naccepted
1659 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1660 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1661 dfa->nexts[prev_node]))
1662 naccepted = 1;
1664 if (naccepted == 0)
1665 continue;
1667 if (sctx->limits.nelem)
1669 int to_idx = str_idx + naccepted;
1670 if (check_dst_limits (mctx, &sctx->limits,
1671 dfa->nexts[prev_node], to_idx,
1672 prev_node, str_idx))
1673 continue;
1675 ret = re_node_set_insert (cur_dest, prev_node);
1676 if (BE (ret == -1, 0))
1677 return REG_ESPACE;
1680 return REG_NOERROR;
1683 /* Helper functions. */
1685 static reg_errcode_t
1686 clean_state_log_if_needed (mctx, next_state_log_idx)
1687 re_match_context_t *mctx;
1688 int next_state_log_idx;
1690 int top = mctx->state_log_top;
1692 if (next_state_log_idx >= mctx->input.bufs_len
1693 || (next_state_log_idx >= mctx->input.valid_len
1694 && mctx->input.valid_len < mctx->input.len))
1696 reg_errcode_t err;
1697 err = extend_buffers (mctx);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1702 if (top < next_state_log_idx)
1704 memset (mctx->state_log + top + 1, '\0',
1705 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706 mctx->state_log_top = next_state_log_idx;
1708 return REG_NOERROR;
1711 static reg_errcode_t
1712 merge_state_array (dfa, dst, src, num)
1713 re_dfa_t *dfa;
1714 re_dfastate_t **dst;
1715 re_dfastate_t **src;
1716 int num;
1718 int st_idx;
1719 reg_errcode_t err;
1720 for (st_idx = 0; st_idx < num; ++st_idx)
1722 if (dst[st_idx] == NULL)
1723 dst[st_idx] = src[st_idx];
1724 else if (src[st_idx] != NULL)
1726 re_node_set merged_set;
1727 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1728 &src[st_idx]->nodes);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1732 re_node_set_free (&merged_set);
1733 if (BE (err != REG_NOERROR, 0))
1734 return err;
1737 return REG_NOERROR;
1740 static reg_errcode_t
1741 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1742 re_match_context_t *mctx;
1743 re_sift_context_t *sctx;
1744 int str_idx;
1745 re_node_set *dest_nodes;
1747 re_dfa_t *const dfa = mctx->dfa;
1748 reg_errcode_t err;
1749 const re_node_set *candidates;
1750 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1751 : &mctx->state_log[str_idx]->nodes);
1753 if (dest_nodes->nelem == 0)
1754 sctx->sifted_states[str_idx] = NULL;
1755 else
1757 if (candidates)
1759 /* At first, add the nodes which can epsilon transit to a node in
1760 DEST_NODE. */
1761 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1762 if (BE (err != REG_NOERROR, 0))
1763 return err;
1765 /* Then, check the limitations in the current sift_context. */
1766 if (sctx->limits.nelem)
1768 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1769 mctx->bkref_ents, str_idx);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1775 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1776 if (BE (err != REG_NOERROR, 0))
1777 return err;
1780 if (candidates && mctx->state_log[str_idx]->has_backref)
1782 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1783 if (BE (err != REG_NOERROR, 0))
1784 return err;
1786 return REG_NOERROR;
1789 static reg_errcode_t
1790 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1791 re_dfa_t *dfa;
1792 re_node_set *dest_nodes;
1793 const re_node_set *candidates;
1795 reg_errcode_t err = REG_NOERROR;
1796 int i;
1798 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1799 if (BE (err != REG_NOERROR, 0))
1800 return err;
1802 if (!state->inveclosure.alloc)
1804 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1805 if (BE (err != REG_NOERROR, 0))
1806 return REG_ESPACE;
1807 for (i = 0; i < dest_nodes->nelem; i++)
1808 re_node_set_merge (&state->inveclosure,
1809 dfa->inveclosures + dest_nodes->elems[i]);
1811 return re_node_set_add_intersect (dest_nodes, candidates,
1812 &state->inveclosure);
1815 static reg_errcode_t
1816 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1817 re_dfa_t *dfa;
1818 int node;
1819 re_node_set *dest_nodes;
1820 const re_node_set *candidates;
1822 int ecl_idx;
1823 reg_errcode_t err;
1824 re_node_set *inv_eclosure = dfa->inveclosures + node;
1825 re_node_set except_nodes;
1826 re_node_set_init_empty (&except_nodes);
1827 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1829 int cur_node = inv_eclosure->elems[ecl_idx];
1830 if (cur_node == node)
1831 continue;
1832 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1834 int edst1 = dfa->edests[cur_node].elems[0];
1835 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1836 ? dfa->edests[cur_node].elems[1] : -1);
1837 if ((!re_node_set_contains (inv_eclosure, edst1)
1838 && re_node_set_contains (dest_nodes, edst1))
1839 || (edst2 > 0
1840 && !re_node_set_contains (inv_eclosure, edst2)
1841 && re_node_set_contains (dest_nodes, edst2)))
1843 err = re_node_set_add_intersect (&except_nodes, candidates,
1844 dfa->inveclosures + cur_node);
1845 if (BE (err != REG_NOERROR, 0))
1847 re_node_set_free (&except_nodes);
1848 return err;
1853 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1855 int cur_node = inv_eclosure->elems[ecl_idx];
1856 if (!re_node_set_contains (&except_nodes, cur_node))
1858 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1859 re_node_set_remove_at (dest_nodes, idx);
1862 re_node_set_free (&except_nodes);
1863 return REG_NOERROR;
1866 static int
1867 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1868 re_match_context_t *mctx;
1869 re_node_set *limits;
1870 int dst_node, dst_idx, src_node, src_idx;
1872 re_dfa_t *const dfa = mctx->dfa;
1873 int lim_idx, src_pos, dst_pos;
1875 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1876 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1877 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1879 int subexp_idx;
1880 struct re_backref_cache_entry *ent;
1881 ent = mctx->bkref_ents + limits->elems[lim_idx];
1882 subexp_idx = dfa->nodes[ent->node].opr.idx;
1884 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1885 subexp_idx, dst_node, dst_idx,
1886 dst_bkref_idx);
1887 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1888 subexp_idx, src_node, src_idx,
1889 src_bkref_idx);
1891 /* In case of:
1892 <src> <dst> ( <subexp> )
1893 ( <subexp> ) <src> <dst>
1894 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1895 if (src_pos == dst_pos)
1896 continue; /* This is unrelated limitation. */
1897 else
1898 return 1;
1900 return 0;
1903 static int
1904 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1905 re_match_context_t *mctx;
1906 int boundaries, subexp_idx, from_node, bkref_idx;
1908 re_dfa_t *const dfa = mctx->dfa;
1909 re_node_set *eclosures = dfa->eclosures + from_node;
1910 int node_idx;
1912 /* Else, we are on the boundary: examine the nodes on the epsilon
1913 closure. */
1914 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1916 int node = eclosures->elems[node_idx];
1917 switch (dfa->nodes[node].type)
1919 case OP_BACK_REF:
1920 if (bkref_idx != -1)
1922 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1925 int dst, cpos;
1927 if (ent->node != node)
1928 continue;
1930 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1931 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1932 continue;
1934 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1935 OP_CLOSE_SUBEXP cases below. But, if the
1936 destination node is the same node as the source
1937 node, don't recurse because it would cause an
1938 infinite loop: a regex that exhibits this behavior
1939 is ()\1*\1* */
1940 dst = dfa->edests[node].elems[0];
1941 if (dst == from_node)
1943 if (boundaries & 1)
1944 return -1;
1945 else /* if (boundaries & 2) */
1946 return 0;
1949 cpos =
1950 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1951 dst, bkref_idx);
1952 if (cpos == -1 /* && (boundaries & 1) */)
1953 return -1;
1954 if (cpos == 0 && (boundaries & 2))
1955 return 0;
1957 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1959 while (ent++->more);
1961 break;
1963 case OP_OPEN_SUBEXP:
1964 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1965 return -1;
1966 break;
1968 case OP_CLOSE_SUBEXP:
1969 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1970 return 0;
1971 break;
1973 default:
1974 break;
1978 return (boundaries & 2) ? 1 : 0;
1981 static int
1982 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1983 re_match_context_t *mctx;
1984 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1986 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1987 int boundaries;
1989 /* If we are outside the range of the subexpression, return -1 or 1. */
1990 if (str_idx < lim->subexp_from)
1991 return -1;
1993 if (lim->subexp_to < str_idx)
1994 return 1;
1996 /* If we are within the subexpression, return 0. */
1997 boundaries = (str_idx == lim->subexp_from);
1998 boundaries |= (str_idx == lim->subexp_to) << 1;
1999 if (boundaries == 0)
2000 return 0;
2002 /* Else, examine epsilon closure. */
2003 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2004 from_node, bkref_idx);
2007 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2008 which are against limitations from DEST_NODES. */
2010 static reg_errcode_t
2011 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2012 re_dfa_t *dfa;
2013 re_node_set *dest_nodes;
2014 const re_node_set *candidates;
2015 re_node_set *limits;
2016 struct re_backref_cache_entry *bkref_ents;
2017 int str_idx;
2019 reg_errcode_t err;
2020 int node_idx, lim_idx;
2022 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2024 int subexp_idx;
2025 struct re_backref_cache_entry *ent;
2026 ent = bkref_ents + limits->elems[lim_idx];
2028 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2029 continue; /* This is unrelated limitation. */
2031 subexp_idx = dfa->nodes[ent->node].opr.idx;
2032 if (ent->subexp_to == str_idx)
2034 int ops_node = -1;
2035 int cls_node = -1;
2036 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2038 int node = dest_nodes->elems[node_idx];
2039 re_token_type_t type = dfa->nodes[node].type;
2040 if (type == OP_OPEN_SUBEXP
2041 && subexp_idx == dfa->nodes[node].opr.idx)
2042 ops_node = node;
2043 else if (type == OP_CLOSE_SUBEXP
2044 && subexp_idx == dfa->nodes[node].opr.idx)
2045 cls_node = node;
2048 /* Check the limitation of the open subexpression. */
2049 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2050 if (ops_node >= 0)
2052 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2053 candidates);
2054 if (BE (err != REG_NOERROR, 0))
2055 return err;
2058 /* Check the limitation of the close subexpression. */
2059 if (cls_node >= 0)
2060 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2062 int node = dest_nodes->elems[node_idx];
2063 if (!re_node_set_contains (dfa->inveclosures + node,
2064 cls_node)
2065 && !re_node_set_contains (dfa->eclosures + node,
2066 cls_node))
2068 /* It is against this limitation.
2069 Remove it form the current sifted state. */
2070 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2071 candidates);
2072 if (BE (err != REG_NOERROR, 0))
2073 return err;
2074 --node_idx;
2078 else /* (ent->subexp_to != str_idx) */
2080 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2082 int node = dest_nodes->elems[node_idx];
2083 re_token_type_t type = dfa->nodes[node].type;
2084 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2086 if (subexp_idx != dfa->nodes[node].opr.idx)
2087 continue;
2088 /* It is against this limitation.
2089 Remove it form the current sifted state. */
2090 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2091 candidates);
2092 if (BE (err != REG_NOERROR, 0))
2093 return err;
2098 return REG_NOERROR;
2101 static reg_errcode_t
2102 sift_states_bkref (mctx, sctx, str_idx, candidates)
2103 re_match_context_t *mctx;
2104 re_sift_context_t *sctx;
2105 int str_idx;
2106 const re_node_set *candidates;
2108 re_dfa_t *const dfa = mctx->dfa;
2109 reg_errcode_t err;
2110 int node_idx, node;
2111 re_sift_context_t local_sctx;
2112 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2114 if (first_idx == -1)
2115 return REG_NOERROR;
2117 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2119 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2121 int enabled_idx;
2122 re_token_type_t type;
2123 struct re_backref_cache_entry *entry;
2124 node = candidates->elems[node_idx];
2125 type = dfa->nodes[node].type;
2126 /* Avoid infinite loop for the REs like "()\1+". */
2127 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2128 continue;
2129 if (type != OP_BACK_REF)
2130 continue;
2132 entry = mctx->bkref_ents + first_idx;
2133 enabled_idx = first_idx;
2136 int subexp_len, to_idx, dst_node;
2137 re_dfastate_t *cur_state;
2139 if (entry->node != node)
2140 continue;
2141 subexp_len = entry->subexp_to - entry->subexp_from;
2142 to_idx = str_idx + subexp_len;
2143 dst_node = (subexp_len ? dfa->nexts[node]
2144 : dfa->edests[node].elems[0]);
2146 if (to_idx > sctx->last_str_idx
2147 || sctx->sifted_states[to_idx] == NULL
2148 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2149 || check_dst_limits (mctx, &sctx->limits, node,
2150 str_idx, dst_node, to_idx))
2151 continue;
2153 if (local_sctx.sifted_states == NULL)
2155 local_sctx = *sctx;
2156 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2157 if (BE (err != REG_NOERROR, 0))
2158 goto free_return;
2160 local_sctx.last_node = node;
2161 local_sctx.last_str_idx = str_idx;
2162 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2163 if (BE (err < 0, 0))
2165 err = REG_ESPACE;
2166 goto free_return;
2168 cur_state = local_sctx.sifted_states[str_idx];
2169 err = sift_states_backward (mctx, &local_sctx);
2170 if (BE (err != REG_NOERROR, 0))
2171 goto free_return;
2172 if (sctx->limited_states != NULL)
2174 err = merge_state_array (dfa, sctx->limited_states,
2175 local_sctx.sifted_states,
2176 str_idx + 1);
2177 if (BE (err != REG_NOERROR, 0))
2178 goto free_return;
2180 local_sctx.sifted_states[str_idx] = cur_state;
2181 re_node_set_remove (&local_sctx.limits, enabled_idx);
2183 /* mctx->bkref_ents may have changed, reload the pointer. */
2184 entry = mctx->bkref_ents + enabled_idx;
2186 while (enabled_idx++, entry++->more);
2188 err = REG_NOERROR;
2189 free_return:
2190 if (local_sctx.sifted_states != NULL)
2192 re_node_set_free (&local_sctx.limits);
2195 return err;
2199 #ifdef RE_ENABLE_I18N
2200 static int
2201 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2202 const re_match_context_t *mctx;
2203 re_sift_context_t *sctx;
2204 int node_idx, str_idx, max_str_idx;
2206 re_dfa_t *const dfa = mctx->dfa;
2207 int naccepted;
2208 /* Check the node can accept `multi byte'. */
2209 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2210 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2211 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2212 dfa->nexts[node_idx]))
2213 /* The node can't accept the `multi byte', or the
2214 destination was already thrown away, then the node
2215 could't accept the current input `multi byte'. */
2216 naccepted = 0;
2217 /* Otherwise, it is sure that the node could accept
2218 `naccepted' bytes input. */
2219 return naccepted;
2221 #endif /* RE_ENABLE_I18N */
2224 /* Functions for state transition. */
2226 /* Return the next state to which the current state STATE will transit by
2227 accepting the current input byte, and update STATE_LOG if necessary.
2228 If STATE can accept a multibyte char/collating element/back reference
2229 update the destination of STATE_LOG. */
2231 static re_dfastate_t *
2232 transit_state (err, mctx, state)
2233 reg_errcode_t *err;
2234 re_match_context_t *mctx;
2235 re_dfastate_t *state;
2237 re_dfastate_t **trtable;
2238 unsigned char ch;
2240 #ifdef RE_ENABLE_I18N
2241 /* If the current state can accept multibyte. */
2242 if (BE (state->accept_mb, 0))
2244 *err = transit_state_mb (mctx, state);
2245 if (BE (*err != REG_NOERROR, 0))
2246 return NULL;
2248 #endif /* RE_ENABLE_I18N */
2250 /* Then decide the next state with the single byte. */
2251 #if 0
2252 if (0)
2253 /* don't use transition table */
2254 return transit_state_sb (err, mctx, state);
2255 #endif
2257 /* Use transition table */
2258 ch = re_string_fetch_byte (&mctx->input);
2259 for (;;)
2261 trtable = state->trtable;
2262 if (BE (trtable != NULL, 1))
2263 return trtable[ch];
2265 trtable = state->word_trtable;
2266 if (BE (trtable != NULL, 1))
2268 unsigned int context;
2269 context
2270 = re_string_context_at (&mctx->input,
2271 re_string_cur_idx (&mctx->input) - 1,
2272 mctx->eflags);
2273 if (IS_WORD_CONTEXT (context))
2274 return trtable[ch + SBC_MAX];
2275 else
2276 return trtable[ch];
2279 if (!build_trtable (mctx->dfa, state))
2281 *err = REG_ESPACE;
2282 return NULL;
2285 /* Retry, we now have a transition table. */
2289 /* Update the state_log if we need */
2290 re_dfastate_t *
2291 merge_state_with_log (err, mctx, next_state)
2292 reg_errcode_t *err;
2293 re_match_context_t *mctx;
2294 re_dfastate_t *next_state;
2296 re_dfa_t *const dfa = mctx->dfa;
2297 int cur_idx = re_string_cur_idx (&mctx->input);
2299 if (cur_idx > mctx->state_log_top)
2301 mctx->state_log[cur_idx] = next_state;
2302 mctx->state_log_top = cur_idx;
2304 else if (mctx->state_log[cur_idx] == 0)
2306 mctx->state_log[cur_idx] = next_state;
2308 else
2310 re_dfastate_t *pstate;
2311 unsigned int context;
2312 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2313 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2314 the destination of a multibyte char/collating element/
2315 back reference. Then the next state is the union set of
2316 these destinations and the results of the transition table. */
2317 pstate = mctx->state_log[cur_idx];
2318 log_nodes = pstate->entrance_nodes;
2319 if (next_state != NULL)
2321 table_nodes = next_state->entrance_nodes;
2322 *err = re_node_set_init_union (&next_nodes, table_nodes,
2323 log_nodes);
2324 if (BE (*err != REG_NOERROR, 0))
2325 return NULL;
2327 else
2328 next_nodes = *log_nodes;
2329 /* Note: We already add the nodes of the initial state,
2330 then we don't need to add them here. */
2332 context = re_string_context_at (&mctx->input,
2333 re_string_cur_idx (&mctx->input) - 1,
2334 mctx->eflags);
2335 next_state = mctx->state_log[cur_idx]
2336 = re_acquire_state_context (err, dfa, &next_nodes, context);
2337 /* We don't need to check errors here, since the return value of
2338 this function is next_state and ERR is already set. */
2340 if (table_nodes != NULL)
2341 re_node_set_free (&next_nodes);
2344 if (BE (dfa->nbackref, 0) && next_state != NULL)
2346 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2347 later. We must check them here, since the back references in the
2348 next state might use them. */
2349 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2350 cur_idx);
2351 if (BE (*err != REG_NOERROR, 0))
2352 return NULL;
2354 /* If the next state has back references. */
2355 if (next_state->has_backref)
2357 *err = transit_state_bkref (mctx, &next_state->nodes);
2358 if (BE (*err != REG_NOERROR, 0))
2359 return NULL;
2360 next_state = mctx->state_log[cur_idx];
2364 return next_state;
2367 /* Skip bytes in the input that correspond to part of a
2368 multi-byte match, then look in the log for a state
2369 from which to restart matching. */
2370 re_dfastate_t *
2371 find_recover_state (err, mctx)
2372 reg_errcode_t *err;
2373 re_match_context_t *mctx;
2375 re_dfastate_t *cur_state = NULL;
2378 int max = mctx->state_log_top;
2379 int cur_str_idx = re_string_cur_idx (&mctx->input);
2383 if (++cur_str_idx > max)
2384 return NULL;
2385 re_string_skip_bytes (&mctx->input, 1);
2387 while (mctx->state_log[cur_str_idx] == NULL);
2389 cur_state = merge_state_with_log (err, mctx, NULL);
2391 while (err == REG_NOERROR && cur_state == NULL);
2392 return cur_state;
2395 /* Helper functions for transit_state. */
2397 /* From the node set CUR_NODES, pick up the nodes whose types are
2398 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2399 expression. And register them to use them later for evaluating the
2400 correspoding back references. */
2402 static reg_errcode_t
2403 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2404 re_match_context_t *mctx;
2405 re_node_set *cur_nodes;
2406 int str_idx;
2408 re_dfa_t *const dfa = mctx->dfa;
2409 int node_idx;
2410 reg_errcode_t err;
2412 /* TODO: This isn't efficient.
2413 Because there might be more than one nodes whose types are
2414 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2415 nodes.
2416 E.g. RE: (a){2} */
2417 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2419 int node = cur_nodes->elems[node_idx];
2420 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2421 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2422 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2424 err = match_ctx_add_subtop (mctx, node, str_idx);
2425 if (BE (err != REG_NOERROR, 0))
2426 return err;
2429 return REG_NOERROR;
2432 #if 0
2433 /* Return the next state to which the current state STATE will transit by
2434 accepting the current input byte. */
2436 static re_dfastate_t *
2437 transit_state_sb (err, mctx, state)
2438 reg_errcode_t *err;
2439 re_match_context_t *mctx;
2440 re_dfastate_t *state;
2442 re_dfa_t *const dfa = mctx->dfa;
2443 re_node_set next_nodes;
2444 re_dfastate_t *next_state;
2445 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2446 unsigned int context;
2448 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2449 if (BE (*err != REG_NOERROR, 0))
2450 return NULL;
2451 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2453 int cur_node = state->nodes.elems[node_cnt];
2454 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2456 *err = re_node_set_merge (&next_nodes,
2457 dfa->eclosures + dfa->nexts[cur_node]);
2458 if (BE (*err != REG_NOERROR, 0))
2460 re_node_set_free (&next_nodes);
2461 return NULL;
2465 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2466 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2467 /* We don't need to check errors here, since the return value of
2468 this function is next_state and ERR is already set. */
2470 re_node_set_free (&next_nodes);
2471 re_string_skip_bytes (&mctx->input, 1);
2472 return next_state;
2474 #endif
2476 #ifdef RE_ENABLE_I18N
2477 static reg_errcode_t
2478 transit_state_mb (mctx, pstate)
2479 re_match_context_t *mctx;
2480 re_dfastate_t *pstate;
2482 re_dfa_t *const dfa = mctx->dfa;
2483 reg_errcode_t err;
2484 int i;
2486 for (i = 0; i < pstate->nodes.nelem; ++i)
2488 re_node_set dest_nodes, *new_nodes;
2489 int cur_node_idx = pstate->nodes.elems[i];
2490 int naccepted, dest_idx;
2491 unsigned int context;
2492 re_dfastate_t *dest_state;
2494 if (!dfa->nodes[cur_node_idx].accept_mb)
2495 continue;
2497 if (dfa->nodes[cur_node_idx].constraint)
2499 context = re_string_context_at (&mctx->input,
2500 re_string_cur_idx (&mctx->input),
2501 mctx->eflags);
2502 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2503 context))
2504 continue;
2507 /* How many bytes the node can accept? */
2508 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2509 re_string_cur_idx (&mctx->input));
2510 if (naccepted == 0)
2511 continue;
2513 /* The node can accepts `naccepted' bytes. */
2514 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2515 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2516 : mctx->max_mb_elem_len);
2517 err = clean_state_log_if_needed (mctx, dest_idx);
2518 if (BE (err != REG_NOERROR, 0))
2519 return err;
2520 #ifdef DEBUG
2521 assert (dfa->nexts[cur_node_idx] != -1);
2522 #endif
2523 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2525 dest_state = mctx->state_log[dest_idx];
2526 if (dest_state == NULL)
2527 dest_nodes = *new_nodes;
2528 else
2530 err = re_node_set_init_union (&dest_nodes,
2531 dest_state->entrance_nodes, new_nodes);
2532 if (BE (err != REG_NOERROR, 0))
2533 return err;
2535 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2536 mctx->state_log[dest_idx]
2537 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2538 if (dest_state != NULL)
2539 re_node_set_free (&dest_nodes);
2540 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2541 return err;
2543 return REG_NOERROR;
2545 #endif /* RE_ENABLE_I18N */
2547 static reg_errcode_t
2548 transit_state_bkref (mctx, nodes)
2549 re_match_context_t *mctx;
2550 const re_node_set *nodes;
2552 re_dfa_t *const dfa = mctx->dfa;
2553 reg_errcode_t err;
2554 int i;
2555 int cur_str_idx = re_string_cur_idx (&mctx->input);
2557 for (i = 0; i < nodes->nelem; ++i)
2559 int dest_str_idx, prev_nelem, bkc_idx;
2560 int node_idx = nodes->elems[i];
2561 unsigned int context;
2562 const re_token_t *node = dfa->nodes + node_idx;
2563 re_node_set *new_dest_nodes;
2565 /* Check whether `node' is a backreference or not. */
2566 if (node->type != OP_BACK_REF)
2567 continue;
2569 if (node->constraint)
2571 context = re_string_context_at (&mctx->input, cur_str_idx,
2572 mctx->eflags);
2573 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2574 continue;
2577 /* `node' is a backreference.
2578 Check the substring which the substring matched. */
2579 bkc_idx = mctx->nbkref_ents;
2580 err = get_subexp (mctx, node_idx, cur_str_idx);
2581 if (BE (err != REG_NOERROR, 0))
2582 goto free_return;
2584 /* And add the epsilon closures (which is `new_dest_nodes') of
2585 the backreference to appropriate state_log. */
2586 #ifdef DEBUG
2587 assert (dfa->nexts[node_idx] != -1);
2588 #endif
2589 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2591 int subexp_len;
2592 re_dfastate_t *dest_state;
2593 struct re_backref_cache_entry *bkref_ent;
2594 bkref_ent = mctx->bkref_ents + bkc_idx;
2595 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2596 continue;
2597 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2598 new_dest_nodes = (subexp_len == 0
2599 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2600 : dfa->eclosures + dfa->nexts[node_idx]);
2601 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2602 - bkref_ent->subexp_from);
2603 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2604 mctx->eflags);
2605 dest_state = mctx->state_log[dest_str_idx];
2606 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2607 : mctx->state_log[cur_str_idx]->nodes.nelem);
2608 /* Add `new_dest_node' to state_log. */
2609 if (dest_state == NULL)
2611 mctx->state_log[dest_str_idx]
2612 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2613 context);
2614 if (BE (mctx->state_log[dest_str_idx] == NULL
2615 && err != REG_NOERROR, 0))
2616 goto free_return;
2618 else
2620 re_node_set dest_nodes;
2621 err = re_node_set_init_union (&dest_nodes,
2622 dest_state->entrance_nodes,
2623 new_dest_nodes);
2624 if (BE (err != REG_NOERROR, 0))
2626 re_node_set_free (&dest_nodes);
2627 goto free_return;
2629 mctx->state_log[dest_str_idx]
2630 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2631 re_node_set_free (&dest_nodes);
2632 if (BE (mctx->state_log[dest_str_idx] == NULL
2633 && err != REG_NOERROR, 0))
2634 goto free_return;
2636 /* We need to check recursively if the backreference can epsilon
2637 transit. */
2638 if (subexp_len == 0
2639 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2641 err = check_subexp_matching_top (mctx, new_dest_nodes,
2642 cur_str_idx);
2643 if (BE (err != REG_NOERROR, 0))
2644 goto free_return;
2645 err = transit_state_bkref (mctx, new_dest_nodes);
2646 if (BE (err != REG_NOERROR, 0))
2647 goto free_return;
2651 err = REG_NOERROR;
2652 free_return:
2653 return err;
2656 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2657 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2658 Note that we might collect inappropriate candidates here.
2659 However, the cost of checking them strictly here is too high, then we
2660 delay these checking for prune_impossible_nodes(). */
2662 static reg_errcode_t
2663 get_subexp (mctx, bkref_node, bkref_str_idx)
2664 re_match_context_t *mctx;
2665 int bkref_node, bkref_str_idx;
2667 re_dfa_t *const dfa = mctx->dfa;
2668 int subexp_num, sub_top_idx;
2669 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2670 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2671 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2672 if (cache_idx != -1)
2674 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2676 if (entry->node == bkref_node)
2677 return REG_NOERROR; /* We already checked it. */
2678 while (entry++->more);
2681 subexp_num = dfa->nodes[bkref_node].opr.idx;
2683 /* For each sub expression */
2684 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2686 reg_errcode_t err;
2687 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2688 re_sub_match_last_t *sub_last;
2689 int sub_last_idx, sl_str, bkref_str_off;
2691 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2692 continue; /* It isn't related. */
2694 sl_str = sub_top->str_idx;
2695 bkref_str_off = bkref_str_idx;
2696 /* At first, check the last node of sub expressions we already
2697 evaluated. */
2698 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2700 int sl_str_diff;
2701 sub_last = sub_top->lasts[sub_last_idx];
2702 sl_str_diff = sub_last->str_idx - sl_str;
2703 /* The matched string by the sub expression match with the substring
2704 at the back reference? */
2705 if (sl_str_diff > 0)
2707 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2709 /* Not enough chars for a successful match. */
2710 if (bkref_str_off + sl_str_diff > mctx->input.len)
2711 break;
2713 err = clean_state_log_if_needed (mctx,
2714 bkref_str_off
2715 + sl_str_diff);
2716 if (BE (err != REG_NOERROR, 0))
2717 return err;
2718 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2721 break; /* We don't need to search this sub expression any more. */
2723 bkref_str_off += sl_str_diff;
2724 sl_str += sl_str_diff;
2725 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2726 bkref_str_idx);
2728 /* Reload buf, since the preceding call might have reallocated
2729 the buffer. */
2730 buf = (const char *) re_string_get_buffer (&mctx->input);
2732 if (err == REG_NOMATCH)
2733 continue;
2734 if (BE (err != REG_NOERROR, 0))
2735 return err;
2738 if (sub_last_idx < sub_top->nlasts)
2739 continue;
2740 if (sub_last_idx > 0)
2741 ++sl_str;
2742 /* Then, search for the other last nodes of the sub expression. */
2743 for (; sl_str <= bkref_str_idx; ++sl_str)
2745 int cls_node, sl_str_off;
2746 const re_node_set *nodes;
2747 sl_str_off = sl_str - sub_top->str_idx;
2748 /* The matched string by the sub expression match with the substring
2749 at the back reference? */
2750 if (sl_str_off > 0)
2752 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2754 /* If we are at the end of the input, we cannot match. */
2755 if (bkref_str_off >= mctx->input.len)
2756 break;
2758 err = extend_buffers (mctx);
2759 if (BE (err != REG_NOERROR, 0))
2760 return err;
2762 buf = (const char *) re_string_get_buffer (&mctx->input);
2764 if (buf [bkref_str_off++] != buf[sl_str - 1])
2765 break; /* We don't need to search this sub expression
2766 any more. */
2768 if (mctx->state_log[sl_str] == NULL)
2769 continue;
2770 /* Does this state have a ')' of the sub expression? */
2771 nodes = &mctx->state_log[sl_str]->nodes;
2772 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2773 if (cls_node == -1)
2774 continue; /* No. */
2775 if (sub_top->path == NULL)
2777 sub_top->path = calloc (sizeof (state_array_t),
2778 sl_str - sub_top->str_idx + 1);
2779 if (sub_top->path == NULL)
2780 return REG_ESPACE;
2782 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2783 in the current context? */
2784 err = check_arrival (mctx, sub_top->path, sub_top->node,
2785 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2786 if (err == REG_NOMATCH)
2787 continue;
2788 if (BE (err != REG_NOERROR, 0))
2789 return err;
2790 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2791 if (BE (sub_last == NULL, 0))
2792 return REG_ESPACE;
2793 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2794 bkref_str_idx);
2795 if (err == REG_NOMATCH)
2796 continue;
2799 return REG_NOERROR;
2802 /* Helper functions for get_subexp(). */
2804 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2805 If it can arrive, register the sub expression expressed with SUB_TOP
2806 and SUB_LAST. */
2808 static reg_errcode_t
2809 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2810 re_match_context_t *mctx;
2811 const re_sub_match_top_t *sub_top;
2812 re_sub_match_last_t *sub_last;
2813 int bkref_node, bkref_str;
2815 reg_errcode_t err;
2816 int to_idx;
2817 /* Can the subexpression arrive the back reference? */
2818 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2819 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2820 if (err != REG_NOERROR)
2821 return err;
2822 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2823 sub_last->str_idx);
2824 if (BE (err != REG_NOERROR, 0))
2825 return err;
2826 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2827 return clean_state_log_if_needed (mctx, to_idx);
2830 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2831 Search '(' if FL_OPEN, or search ')' otherwise.
2832 TODO: This function isn't efficient...
2833 Because there might be more than one nodes whose types are
2834 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2835 nodes.
2836 E.g. RE: (a){2} */
2838 static int
2839 find_subexp_node (dfa, nodes, subexp_idx, type)
2840 const re_dfa_t *dfa;
2841 const re_node_set *nodes;
2842 int subexp_idx, type;
2844 int cls_idx;
2845 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2847 int cls_node = nodes->elems[cls_idx];
2848 const re_token_t *node = dfa->nodes + cls_node;
2849 if (node->type == type
2850 && node->opr.idx == subexp_idx)
2851 return cls_node;
2853 return -1;
2856 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2857 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2858 heavily reused.
2859 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2861 static reg_errcode_t
2862 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2863 type)
2864 re_match_context_t *mctx;
2865 state_array_t *path;
2866 int top_node, top_str, last_node, last_str, type;
2868 re_dfa_t *const dfa = mctx->dfa;
2869 reg_errcode_t err;
2870 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2871 re_dfastate_t *cur_state = NULL;
2872 re_node_set *cur_nodes, next_nodes;
2873 re_dfastate_t **backup_state_log;
2874 unsigned int context;
2876 subexp_num = dfa->nodes[top_node].opr.idx;
2877 /* Extend the buffer if we need. */
2878 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2880 re_dfastate_t **new_array;
2881 int old_alloc = path->alloc;
2882 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2883 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2884 if (new_array == NULL)
2886 path->alloc = old_alloc;
2887 return REG_ESPACE;
2889 path->array = new_array;
2890 memset (new_array + old_alloc, '\0',
2891 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2894 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2896 /* Temporary modify MCTX. */
2897 backup_state_log = mctx->state_log;
2898 backup_cur_idx = mctx->input.cur_idx;
2899 mctx->state_log = path->array;
2900 mctx->input.cur_idx = str_idx;
2902 /* Setup initial node set. */
2903 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2904 if (str_idx == top_str)
2906 err = re_node_set_init_1 (&next_nodes, top_node);
2907 if (BE (err != REG_NOERROR, 0))
2908 return err;
2909 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2910 if (BE (err != REG_NOERROR, 0))
2912 re_node_set_free (&next_nodes);
2913 return err;
2916 else
2918 cur_state = mctx->state_log[str_idx];
2919 if (cur_state && cur_state->has_backref)
2921 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2922 if (BE ( err != REG_NOERROR, 0))
2923 return err;
2925 else
2926 re_node_set_init_empty (&next_nodes);
2928 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2930 if (next_nodes.nelem)
2932 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2933 subexp_num, type);
2934 if (BE ( err != REG_NOERROR, 0))
2936 re_node_set_free (&next_nodes);
2937 return err;
2940 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2941 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2943 re_node_set_free (&next_nodes);
2944 return err;
2946 mctx->state_log[str_idx] = cur_state;
2949 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2951 re_node_set_empty (&next_nodes);
2952 if (mctx->state_log[str_idx + 1])
2954 err = re_node_set_merge (&next_nodes,
2955 &mctx->state_log[str_idx + 1]->nodes);
2956 if (BE (err != REG_NOERROR, 0))
2958 re_node_set_free (&next_nodes);
2959 return err;
2962 if (cur_state)
2964 err = check_arrival_add_next_nodes (mctx, str_idx,
2965 &cur_state->non_eps_nodes, &next_nodes);
2966 if (BE (err != REG_NOERROR, 0))
2968 re_node_set_free (&next_nodes);
2969 return err;
2972 ++str_idx;
2973 if (next_nodes.nelem)
2975 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2976 if (BE (err != REG_NOERROR, 0))
2978 re_node_set_free (&next_nodes);
2979 return err;
2981 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2982 subexp_num, type);
2983 if (BE ( err != REG_NOERROR, 0))
2985 re_node_set_free (&next_nodes);
2986 return err;
2989 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2990 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2991 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2993 re_node_set_free (&next_nodes);
2994 return err;
2996 mctx->state_log[str_idx] = cur_state;
2997 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2999 re_node_set_free (&next_nodes);
3000 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3001 : &mctx->state_log[last_str]->nodes);
3002 path->next_idx = str_idx;
3004 /* Fix MCTX. */
3005 mctx->state_log = backup_state_log;
3006 mctx->input.cur_idx = backup_cur_idx;
3008 /* Then check the current node set has the node LAST_NODE. */
3009 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3010 return REG_NOERROR;
3012 return REG_NOMATCH;
3015 /* Helper functions for check_arrival. */
3017 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3018 to NEXT_NODES.
3019 TODO: This function is similar to the functions transit_state*(),
3020 however this function has many additional works.
3021 Can't we unify them? */
3023 static reg_errcode_t
3024 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3025 re_match_context_t *mctx;
3026 int str_idx;
3027 re_node_set *cur_nodes, *next_nodes;
3029 re_dfa_t *const dfa = mctx->dfa;
3030 int result;
3031 int cur_idx;
3032 reg_errcode_t err;
3033 re_node_set union_set;
3034 re_node_set_init_empty (&union_set);
3035 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3037 int naccepted = 0;
3038 int cur_node = cur_nodes->elems[cur_idx];
3039 #ifdef DEBUG
3040 re_token_type_t type = dfa->nodes[cur_node].type;
3041 assert (!IS_EPSILON_NODE (type));
3042 #endif
3043 #ifdef RE_ENABLE_I18N
3044 /* If the node may accept `multi byte'. */
3045 if (dfa->nodes[cur_node].accept_mb)
3047 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3048 str_idx);
3049 if (naccepted > 1)
3051 re_dfastate_t *dest_state;
3052 int next_node = dfa->nexts[cur_node];
3053 int next_idx = str_idx + naccepted;
3054 dest_state = mctx->state_log[next_idx];
3055 re_node_set_empty (&union_set);
3056 if (dest_state)
3058 err = re_node_set_merge (&union_set, &dest_state->nodes);
3059 if (BE (err != REG_NOERROR, 0))
3061 re_node_set_free (&union_set);
3062 return err;
3065 result = re_node_set_insert (&union_set, next_node);
3066 if (BE (result < 0, 0))
3068 re_node_set_free (&union_set);
3069 return REG_ESPACE;
3071 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3072 &union_set);
3073 if (BE (mctx->state_log[next_idx] == NULL
3074 && err != REG_NOERROR, 0))
3076 re_node_set_free (&union_set);
3077 return err;
3081 #endif /* RE_ENABLE_I18N */
3082 if (naccepted
3083 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3085 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3086 if (BE (result < 0, 0))
3088 re_node_set_free (&union_set);
3089 return REG_ESPACE;
3093 re_node_set_free (&union_set);
3094 return REG_NOERROR;
3097 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3098 CUR_NODES, however exclude the nodes which are:
3099 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3100 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3103 static reg_errcode_t
3104 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3105 re_dfa_t *dfa;
3106 re_node_set *cur_nodes;
3107 int ex_subexp, type;
3109 reg_errcode_t err;
3110 int idx, outside_node;
3111 re_node_set new_nodes;
3112 #ifdef DEBUG
3113 assert (cur_nodes->nelem);
3114 #endif
3115 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3116 if (BE (err != REG_NOERROR, 0))
3117 return err;
3118 /* Create a new node set NEW_NODES with the nodes which are epsilon
3119 closures of the node in CUR_NODES. */
3121 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3123 int cur_node = cur_nodes->elems[idx];
3124 re_node_set *eclosure = dfa->eclosures + cur_node;
3125 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3126 if (outside_node == -1)
3128 /* There are no problematic nodes, just merge them. */
3129 err = re_node_set_merge (&new_nodes, eclosure);
3130 if (BE (err != REG_NOERROR, 0))
3132 re_node_set_free (&new_nodes);
3133 return err;
3136 else
3138 /* There are problematic nodes, re-calculate incrementally. */
3139 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3140 ex_subexp, type);
3141 if (BE (err != REG_NOERROR, 0))
3143 re_node_set_free (&new_nodes);
3144 return err;
3148 re_node_set_free (cur_nodes);
3149 *cur_nodes = new_nodes;
3150 return REG_NOERROR;
3153 /* Helper function for check_arrival_expand_ecl.
3154 Check incrementally the epsilon closure of TARGET, and if it isn't
3155 problematic append it to DST_NODES. */
3157 static reg_errcode_t
3158 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3159 re_dfa_t *dfa;
3160 int target, ex_subexp, type;
3161 re_node_set *dst_nodes;
3163 int cur_node;
3164 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3166 int err;
3168 if (dfa->nodes[cur_node].type == type
3169 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3171 if (type == OP_CLOSE_SUBEXP)
3173 err = re_node_set_insert (dst_nodes, cur_node);
3174 if (BE (err == -1, 0))
3175 return REG_ESPACE;
3177 break;
3179 err = re_node_set_insert (dst_nodes, cur_node);
3180 if (BE (err == -1, 0))
3181 return REG_ESPACE;
3182 if (dfa->edests[cur_node].nelem == 0)
3183 break;
3184 if (dfa->edests[cur_node].nelem == 2)
3186 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3187 dfa->edests[cur_node].elems[1],
3188 ex_subexp, type);
3189 if (BE (err != REG_NOERROR, 0))
3190 return err;
3192 cur_node = dfa->edests[cur_node].elems[0];
3194 return REG_NOERROR;
3198 /* For all the back references in the current state, calculate the
3199 destination of the back references by the appropriate entry
3200 in MCTX->BKREF_ENTS. */
3202 static reg_errcode_t
3203 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3204 type)
3205 re_match_context_t *mctx;
3206 int cur_str, subexp_num, type;
3207 re_node_set *cur_nodes;
3209 re_dfa_t *const dfa = mctx->dfa;
3210 reg_errcode_t err;
3211 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3212 struct re_backref_cache_entry *ent;
3214 if (cache_idx_start == -1)
3215 return REG_NOERROR;
3217 restart:
3218 ent = mctx->bkref_ents + cache_idx_start;
3221 int to_idx, next_node;
3223 /* Is this entry ENT is appropriate? */
3224 if (!re_node_set_contains (cur_nodes, ent->node))
3225 continue; /* No. */
3227 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3228 /* Calculate the destination of the back reference, and append it
3229 to MCTX->STATE_LOG. */
3230 if (to_idx == cur_str)
3232 /* The backreference did epsilon transit, we must re-check all the
3233 node in the current state. */
3234 re_node_set new_dests;
3235 reg_errcode_t err2, err3;
3236 next_node = dfa->edests[ent->node].elems[0];
3237 if (re_node_set_contains (cur_nodes, next_node))
3238 continue;
3239 err = re_node_set_init_1 (&new_dests, next_node);
3240 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3241 err3 = re_node_set_merge (cur_nodes, &new_dests);
3242 re_node_set_free (&new_dests);
3243 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3244 || err3 != REG_NOERROR, 0))
3246 err = (err != REG_NOERROR ? err
3247 : (err2 != REG_NOERROR ? err2 : err3));
3248 return err;
3250 /* TODO: It is still inefficient... */
3251 goto restart;
3253 else
3255 re_node_set union_set;
3256 next_node = dfa->nexts[ent->node];
3257 if (mctx->state_log[to_idx])
3259 int ret;
3260 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3261 next_node))
3262 continue;
3263 err = re_node_set_init_copy (&union_set,
3264 &mctx->state_log[to_idx]->nodes);
3265 ret = re_node_set_insert (&union_set, next_node);
3266 if (BE (err != REG_NOERROR || ret < 0, 0))
3268 re_node_set_free (&union_set);
3269 err = err != REG_NOERROR ? err : REG_ESPACE;
3270 return err;
3273 else
3275 err = re_node_set_init_1 (&union_set, next_node);
3276 if (BE (err != REG_NOERROR, 0))
3277 return err;
3279 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3280 re_node_set_free (&union_set);
3281 if (BE (mctx->state_log[to_idx] == NULL
3282 && err != REG_NOERROR, 0))
3283 return err;
3286 while (ent++->more);
3287 return REG_NOERROR;
3290 /* Build transition table for the state.
3291 Return 1 if succeeded, otherwise return NULL. */
3293 static int
3294 build_trtable (dfa, state)
3295 re_dfa_t *dfa;
3296 re_dfastate_t *state;
3298 reg_errcode_t err;
3299 int i, j, ch, need_word_trtable = 0;
3300 unsigned int elem, mask;
3301 int dests_node_malloced = 0, dest_states_malloced = 0;
3302 int ndests; /* Number of the destination states from `state'. */
3303 re_dfastate_t **trtable;
3304 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3305 re_node_set follows, *dests_node;
3306 bitset *dests_ch;
3307 bitset acceptable;
3309 /* We build DFA states which corresponds to the destination nodes
3310 from `state'. `dests_node[i]' represents the nodes which i-th
3311 destination state contains, and `dests_ch[i]' represents the
3312 characters which i-th destination state accepts. */
3313 #ifdef _LIBC
3314 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3315 dests_node = (re_node_set *)
3316 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3317 else
3318 #endif
3320 dests_node = (re_node_set *)
3321 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3322 if (BE (dests_node == NULL, 0))
3323 return 0;
3324 dests_node_malloced = 1;
3326 dests_ch = (bitset *) (dests_node + SBC_MAX);
3328 /* Initialize transiton table. */
3329 state->word_trtable = state->trtable = NULL;
3331 /* At first, group all nodes belonging to `state' into several
3332 destinations. */
3333 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3334 if (BE (ndests <= 0, 0))
3336 if (dests_node_malloced)
3337 free (dests_node);
3338 /* Return 0 in case of an error, 1 otherwise. */
3339 if (ndests == 0)
3341 state->trtable = (re_dfastate_t **)
3342 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3343 return 1;
3345 return 0;
3348 err = re_node_set_alloc (&follows, ndests + 1);
3349 if (BE (err != REG_NOERROR, 0))
3350 goto out_free;
3352 #ifdef _LIBC
3353 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3354 + ndests * 3 * sizeof (re_dfastate_t *)))
3355 dest_states = (re_dfastate_t **)
3356 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3357 else
3358 #endif
3360 dest_states = (re_dfastate_t **)
3361 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3362 if (BE (dest_states == NULL, 0))
3364 out_free:
3365 if (dest_states_malloced)
3366 free (dest_states);
3367 re_node_set_free (&follows);
3368 for (i = 0; i < ndests; ++i)
3369 re_node_set_free (dests_node + i);
3370 if (dests_node_malloced)
3371 free (dests_node);
3372 return 0;
3374 dest_states_malloced = 1;
3376 dest_states_word = dest_states + ndests;
3377 dest_states_nl = dest_states_word + ndests;
3378 bitset_empty (acceptable);
3380 /* Then build the states for all destinations. */
3381 for (i = 0; i < ndests; ++i)
3383 int next_node;
3384 re_node_set_empty (&follows);
3385 /* Merge the follows of this destination states. */
3386 for (j = 0; j < dests_node[i].nelem; ++j)
3388 next_node = dfa->nexts[dests_node[i].elems[j]];
3389 if (next_node != -1)
3391 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392 if (BE (err != REG_NOERROR, 0))
3393 goto out_free;
3396 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3398 goto out_free;
3399 /* If the new state has context constraint,
3400 build appropriate states for these contexts. */
3401 if (dest_states[i]->has_constraint)
3403 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3404 CONTEXT_WORD);
3405 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3406 goto out_free;
3408 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3409 need_word_trtable = 1;
3411 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3412 CONTEXT_NEWLINE);
3413 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3414 goto out_free;
3416 else
3418 dest_states_word[i] = dest_states[i];
3419 dest_states_nl[i] = dest_states[i];
3421 bitset_merge (acceptable, dests_ch[i]);
3424 if (!BE (need_word_trtable, 0))
3426 /* We don't care about whether the following character is a word
3427 character, or we are in a single-byte character set so we can
3428 discern by looking at the character code: allocate a
3429 256-entry transition table. */
3430 trtable = state->trtable =
3431 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3432 if (BE (trtable == NULL, 0))
3433 goto out_free;
3435 /* For all characters ch...: */
3436 for (i = 0; i < BITSET_UINTS; ++i)
3437 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3438 elem;
3439 mask <<= 1, elem >>= 1, ++ch)
3440 if (BE (elem & 1, 0))
3442 /* There must be exactly one destination which accepts
3443 character ch. See group_nodes_into_DFAstates. */
3444 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3447 /* j-th destination accepts the word character ch. */
3448 if (dfa->word_char[i] & mask)
3449 trtable[ch] = dest_states_word[j];
3450 else
3451 trtable[ch] = dest_states[j];
3454 else
3456 /* We care about whether the following character is a word
3457 character, and we are in a multi-byte character set: discern
3458 by looking at the character code: build two 256-entry
3459 transition tables, one starting at trtable[0] and one
3460 starting at trtable[SBC_MAX]. */
3461 trtable = state->word_trtable =
3462 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3463 if (BE (trtable == NULL, 0))
3464 goto out_free;
3466 /* For all characters ch...: */
3467 for (i = 0; i < BITSET_UINTS; ++i)
3468 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3469 elem;
3470 mask <<= 1, elem >>= 1, ++ch)
3471 if (BE (elem & 1, 0))
3473 /* There must be exactly one destination which accepts
3474 character ch. See group_nodes_into_DFAstates. */
3475 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3478 /* j-th destination accepts the word character ch. */
3479 trtable[ch] = dest_states[j];
3480 trtable[ch + SBC_MAX] = dest_states_word[j];
3484 /* new line */
3485 if (bitset_contain (acceptable, NEWLINE_CHAR))
3487 /* The current state accepts newline character. */
3488 for (j = 0; j < ndests; ++j)
3489 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3491 /* k-th destination accepts newline character. */
3492 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3493 if (need_word_trtable)
3494 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3495 /* There must be only one destination which accepts
3496 newline. See group_nodes_into_DFAstates. */
3497 break;
3501 if (dest_states_malloced)
3502 free (dest_states);
3504 re_node_set_free (&follows);
3505 for (i = 0; i < ndests; ++i)
3506 re_node_set_free (dests_node + i);
3508 if (dests_node_malloced)
3509 free (dests_node);
3511 return 1;
3514 /* Group all nodes belonging to STATE into several destinations.
3515 Then for all destinations, set the nodes belonging to the destination
3516 to DESTS_NODE[i] and set the characters accepted by the destination
3517 to DEST_CH[i]. This function return the number of destinations. */
3519 static int
3520 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3521 re_dfa_t *dfa;
3522 const re_dfastate_t *state;
3523 re_node_set *dests_node;
3524 bitset *dests_ch;
3526 reg_errcode_t err;
3527 int result;
3528 int i, j, k;
3529 int ndests; /* Number of the destinations from `state'. */
3530 bitset accepts; /* Characters a node can accept. */
3531 const re_node_set *cur_nodes = &state->nodes;
3532 bitset_empty (accepts);
3533 ndests = 0;
3535 /* For all the nodes belonging to `state', */
3536 for (i = 0; i < cur_nodes->nelem; ++i)
3538 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3539 re_token_type_t type = node->type;
3540 unsigned int constraint = node->constraint;
3542 /* Enumerate all single byte character this node can accept. */
3543 if (type == CHARACTER)
3544 bitset_set (accepts, node->opr.c);
3545 else if (type == SIMPLE_BRACKET)
3547 bitset_merge (accepts, node->opr.sbcset);
3549 else if (type == OP_PERIOD)
3551 #ifdef RE_ENABLE_I18N
3552 if (dfa->mb_cur_max > 1)
3553 bitset_merge (accepts, dfa->sb_char);
3554 else
3555 #endif
3556 bitset_set_all (accepts);
3557 if (!(dfa->syntax & RE_DOT_NEWLINE))
3558 bitset_clear (accepts, '\n');
3559 if (dfa->syntax & RE_DOT_NOT_NULL)
3560 bitset_clear (accepts, '\0');
3562 #ifdef RE_ENABLE_I18N
3563 else if (type == OP_UTF8_PERIOD)
3565 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3566 if (!(dfa->syntax & RE_DOT_NEWLINE))
3567 bitset_clear (accepts, '\n');
3568 if (dfa->syntax & RE_DOT_NOT_NULL)
3569 bitset_clear (accepts, '\0');
3571 #endif
3572 else
3573 continue;
3575 /* Check the `accepts' and sift the characters which are not
3576 match it the context. */
3577 if (constraint)
3579 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3581 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3582 bitset_empty (accepts);
3583 if (accepts_newline)
3584 bitset_set (accepts, NEWLINE_CHAR);
3585 else
3586 continue;
3588 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3590 bitset_empty (accepts);
3591 continue;
3594 if (constraint & NEXT_WORD_CONSTRAINT)
3596 unsigned int any_set = 0;
3597 if (type == CHARACTER && !node->word_char)
3599 bitset_empty (accepts);
3600 continue;
3602 #ifdef RE_ENABLE_I18N
3603 if (dfa->mb_cur_max > 1)
3604 for (j = 0; j < BITSET_UINTS; ++j)
3605 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3606 else
3607 #endif
3608 for (j = 0; j < BITSET_UINTS; ++j)
3609 any_set |= (accepts[j] &= dfa->word_char[j]);
3610 if (!any_set)
3611 continue;
3613 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3615 unsigned int any_set = 0;
3616 if (type == CHARACTER && node->word_char)
3618 bitset_empty (accepts);
3619 continue;
3621 #ifdef RE_ENABLE_I18N
3622 if (dfa->mb_cur_max > 1)
3623 for (j = 0; j < BITSET_UINTS; ++j)
3624 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3625 else
3626 #endif
3627 for (j = 0; j < BITSET_UINTS; ++j)
3628 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3629 if (!any_set)
3630 continue;
3634 /* Then divide `accepts' into DFA states, or create a new
3635 state. Above, we make sure that accepts is not empty. */
3636 for (j = 0; j < ndests; ++j)
3638 bitset intersec; /* Intersection sets, see below. */
3639 bitset remains;
3640 /* Flags, see below. */
3641 int has_intersec, not_subset, not_consumed;
3643 /* Optimization, skip if this state doesn't accept the character. */
3644 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3645 continue;
3647 /* Enumerate the intersection set of this state and `accepts'. */
3648 has_intersec = 0;
3649 for (k = 0; k < BITSET_UINTS; ++k)
3650 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3651 /* And skip if the intersection set is empty. */
3652 if (!has_intersec)
3653 continue;
3655 /* Then check if this state is a subset of `accepts'. */
3656 not_subset = not_consumed = 0;
3657 for (k = 0; k < BITSET_UINTS; ++k)
3659 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3660 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3663 /* If this state isn't a subset of `accepts', create a
3664 new group state, which has the `remains'. */
3665 if (not_subset)
3667 bitset_copy (dests_ch[ndests], remains);
3668 bitset_copy (dests_ch[j], intersec);
3669 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3670 if (BE (err != REG_NOERROR, 0))
3671 goto error_return;
3672 ++ndests;
3675 /* Put the position in the current group. */
3676 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3677 if (BE (result < 0, 0))
3678 goto error_return;
3680 /* If all characters are consumed, go to next node. */
3681 if (!not_consumed)
3682 break;
3684 /* Some characters remain, create a new group. */
3685 if (j == ndests)
3687 bitset_copy (dests_ch[ndests], accepts);
3688 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3689 if (BE (err != REG_NOERROR, 0))
3690 goto error_return;
3691 ++ndests;
3692 bitset_empty (accepts);
3695 return ndests;
3696 error_return:
3697 for (j = 0; j < ndests; ++j)
3698 re_node_set_free (dests_node + j);
3699 return -1;
3702 #ifdef RE_ENABLE_I18N
3703 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3704 Return the number of the bytes the node accepts.
3705 STR_IDX is the current index of the input string.
3707 This function handles the nodes which can accept one character, or
3708 one collating element like '.', '[a-z]', opposite to the other nodes
3709 can only accept one byte. */
3711 static int
3712 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3713 re_dfa_t *dfa;
3714 int node_idx, str_idx;
3715 const re_string_t *input;
3717 const re_token_t *node = dfa->nodes + node_idx;
3718 int char_len, elem_len;
3719 int i;
3721 if (BE (node->type == OP_UTF8_PERIOD, 0))
3723 unsigned char c = re_string_byte_at (input, str_idx), d;
3724 if (BE (c < 0xc2, 1))
3725 return 0;
3727 if (str_idx + 2 > input->len)
3728 return 0;
3730 d = re_string_byte_at (input, str_idx + 1);
3731 if (c < 0xe0)
3732 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3733 else if (c < 0xf0)
3735 char_len = 3;
3736 if (c == 0xe0 && d < 0xa0)
3737 return 0;
3739 else if (c < 0xf8)
3741 char_len = 4;
3742 if (c == 0xf0 && d < 0x90)
3743 return 0;
3745 else if (c < 0xfc)
3747 char_len = 5;
3748 if (c == 0xf8 && d < 0x88)
3749 return 0;
3751 else if (c < 0xfe)
3753 char_len = 6;
3754 if (c == 0xfc && d < 0x84)
3755 return 0;
3757 else
3758 return 0;
3760 if (str_idx + char_len > input->len)
3761 return 0;
3763 for (i = 1; i < char_len; ++i)
3765 d = re_string_byte_at (input, str_idx + i);
3766 if (d < 0x80 || d > 0xbf)
3767 return 0;
3769 return char_len;
3772 char_len = re_string_char_size_at (input, str_idx);
3773 if (node->type == OP_PERIOD)
3775 if (char_len <= 1)
3776 return 0;
3777 /* FIXME: I don't think this if is needed, as both '\n'
3778 and '\0' are char_len == 1. */
3779 /* '.' accepts any one character except the following two cases. */
3780 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3781 re_string_byte_at (input, str_idx) == '\n') ||
3782 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3783 re_string_byte_at (input, str_idx) == '\0'))
3784 return 0;
3785 return char_len;
3788 elem_len = re_string_elem_size_at (input, str_idx);
3789 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3790 return 0;
3792 if (node->type == COMPLEX_BRACKET)
3794 const re_charset_t *cset = node->opr.mbcset;
3795 # ifdef _LIBC
3796 const unsigned char *pin
3797 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3798 int j;
3799 uint32_t nrules;
3800 # endif /* _LIBC */
3801 int match_len = 0;
3802 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3803 ? re_string_wchar_at (input, str_idx) : 0);
3805 /* match with multibyte character? */
3806 for (i = 0; i < cset->nmbchars; ++i)
3807 if (wc == cset->mbchars[i])
3809 match_len = char_len;
3810 goto check_node_accept_bytes_match;
3812 /* match with character_class? */
3813 for (i = 0; i < cset->nchar_classes; ++i)
3815 wctype_t wt = cset->char_classes[i];
3816 if (__iswctype (wc, wt))
3818 match_len = char_len;
3819 goto check_node_accept_bytes_match;
3823 # ifdef _LIBC
3824 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3825 if (nrules != 0)
3827 unsigned int in_collseq = 0;
3828 const int32_t *table, *indirect;
3829 const unsigned char *weights, *extra;
3830 const char *collseqwc;
3831 int32_t idx;
3832 /* This #include defines a local function! */
3833 # include <locale/weight.h>
3835 /* match with collating_symbol? */
3836 if (cset->ncoll_syms)
3837 extra = (const unsigned char *)
3838 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3839 for (i = 0; i < cset->ncoll_syms; ++i)
3841 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3842 /* Compare the length of input collating element and
3843 the length of current collating element. */
3844 if (*coll_sym != elem_len)
3845 continue;
3846 /* Compare each bytes. */
3847 for (j = 0; j < *coll_sym; j++)
3848 if (pin[j] != coll_sym[1 + j])
3849 break;
3850 if (j == *coll_sym)
3852 /* Match if every bytes is equal. */
3853 match_len = j;
3854 goto check_node_accept_bytes_match;
3858 if (cset->nranges)
3860 if (elem_len <= char_len)
3862 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3863 in_collseq = __collseq_table_lookup (collseqwc, wc);
3865 else
3866 in_collseq = find_collation_sequence_value (pin, elem_len);
3868 /* match with range expression? */
3869 for (i = 0; i < cset->nranges; ++i)
3870 if (cset->range_starts[i] <= in_collseq
3871 && in_collseq <= cset->range_ends[i])
3873 match_len = elem_len;
3874 goto check_node_accept_bytes_match;
3877 /* match with equivalence_class? */
3878 if (cset->nequiv_classes)
3880 const unsigned char *cp = pin;
3881 table = (const int32_t *)
3882 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3883 weights = (const unsigned char *)
3884 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3885 extra = (const unsigned char *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3887 indirect = (const int32_t *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3889 idx = findidx (&cp);
3890 if (idx > 0)
3891 for (i = 0; i < cset->nequiv_classes; ++i)
3893 int32_t equiv_class_idx = cset->equiv_classes[i];
3894 size_t weight_len = weights[idx];
3895 if (weight_len == weights[equiv_class_idx])
3897 int cnt = 0;
3898 while (cnt <= weight_len
3899 && (weights[equiv_class_idx + 1 + cnt]
3900 == weights[idx + 1 + cnt]))
3901 ++cnt;
3902 if (cnt > weight_len)
3904 match_len = elem_len;
3905 goto check_node_accept_bytes_match;
3911 else
3912 # endif /* _LIBC */
3914 /* match with range expression? */
3915 #if __GNUC__ >= 2
3916 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3917 #else
3918 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3919 cmp_buf[2] = wc;
3920 #endif
3921 for (i = 0; i < cset->nranges; ++i)
3923 cmp_buf[0] = cset->range_starts[i];
3924 cmp_buf[4] = cset->range_ends[i];
3925 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3926 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3928 match_len = char_len;
3929 goto check_node_accept_bytes_match;
3933 check_node_accept_bytes_match:
3934 if (!cset->non_match)
3935 return match_len;
3936 else
3938 if (match_len > 0)
3939 return 0;
3940 else
3941 return (elem_len > char_len) ? elem_len : char_len;
3944 return 0;
3947 # ifdef _LIBC
3948 static unsigned int
3949 find_collation_sequence_value (mbs, mbs_len)
3950 const unsigned char *mbs;
3951 size_t mbs_len;
3953 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3954 if (nrules == 0)
3956 if (mbs_len == 1)
3958 /* No valid character. Match it as a single byte character. */
3959 const unsigned char *collseq = (const unsigned char *)
3960 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3961 return collseq[mbs[0]];
3963 return UINT_MAX;
3965 else
3967 int32_t idx;
3968 const unsigned char *extra = (const unsigned char *)
3969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3970 int32_t extrasize = (const unsigned char *)
3971 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3973 for (idx = 0; idx < extrasize;)
3975 int mbs_cnt, found = 0;
3976 int32_t elem_mbs_len;
3977 /* Skip the name of collating element name. */
3978 idx = idx + extra[idx] + 1;
3979 elem_mbs_len = extra[idx++];
3980 if (mbs_len == elem_mbs_len)
3982 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3983 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3984 break;
3985 if (mbs_cnt == elem_mbs_len)
3986 /* Found the entry. */
3987 found = 1;
3989 /* Skip the byte sequence of the collating element. */
3990 idx += elem_mbs_len;
3991 /* Adjust for the alignment. */
3992 idx = (idx + 3) & ~3;
3993 /* Skip the collation sequence value. */
3994 idx += sizeof (uint32_t);
3995 /* Skip the wide char sequence of the collating element. */
3996 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3997 /* If we found the entry, return the sequence value. */
3998 if (found)
3999 return *(uint32_t *) (extra + idx);
4000 /* Skip the collation sequence value. */
4001 idx += sizeof (uint32_t);
4003 return UINT_MAX;
4006 # endif /* _LIBC */
4007 #endif /* RE_ENABLE_I18N */
4009 /* Check whether the node accepts the byte which is IDX-th
4010 byte of the INPUT. */
4012 static int
4013 check_node_accept (mctx, node, idx)
4014 const re_match_context_t *mctx;
4015 const re_token_t *node;
4016 int idx;
4018 unsigned char ch;
4019 ch = re_string_byte_at (&mctx->input, idx);
4020 switch (node->type)
4022 case CHARACTER:
4023 if (node->opr.c != ch)
4024 return 0;
4025 break;
4027 case SIMPLE_BRACKET:
4028 if (!bitset_contain (node->opr.sbcset, ch))
4029 return 0;
4030 break;
4032 #ifdef RE_ENABLE_I18N
4033 case OP_UTF8_PERIOD:
4034 if (ch >= 0x80)
4035 return 0;
4036 /* FALLTHROUGH */
4037 #endif
4038 case OP_PERIOD:
4039 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4040 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4041 return 0;
4042 break;
4044 default:
4045 return 0;
4048 if (node->constraint)
4050 /* The node has constraints. Check whether the current context
4051 satisfies the constraints. */
4052 unsigned int context = re_string_context_at (&mctx->input, idx,
4053 mctx->eflags);
4054 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4055 return 0;
4058 return 1;
4061 /* Extend the buffers, if the buffers have run out. */
4063 static reg_errcode_t
4064 extend_buffers (mctx)
4065 re_match_context_t *mctx;
4067 reg_errcode_t ret;
4068 re_string_t *pstr = &mctx->input;
4070 /* Double the lengthes of the buffers. */
4071 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4072 if (BE (ret != REG_NOERROR, 0))
4073 return ret;
4075 if (mctx->state_log != NULL)
4077 /* And double the length of state_log. */
4078 /* XXX We have no indication of the size of this buffer. If this
4079 allocation fail we have no indication that the state_log array
4080 does not have the right size. */
4081 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4082 pstr->bufs_len + 1);
4083 if (BE (new_array == NULL, 0))
4084 return REG_ESPACE;
4085 mctx->state_log = new_array;
4088 /* Then reconstruct the buffers. */
4089 if (pstr->icase)
4091 #ifdef RE_ENABLE_I18N
4092 if (pstr->mb_cur_max > 1)
4094 ret = build_wcs_upper_buffer (pstr);
4095 if (BE (ret != REG_NOERROR, 0))
4096 return ret;
4098 else
4099 #endif /* RE_ENABLE_I18N */
4100 build_upper_buffer (pstr);
4102 else
4104 #ifdef RE_ENABLE_I18N
4105 if (pstr->mb_cur_max > 1)
4106 build_wcs_buffer (pstr);
4107 else
4108 #endif /* RE_ENABLE_I18N */
4110 if (pstr->trans != NULL)
4111 re_string_translate_buffer (pstr);
4114 return REG_NOERROR;
4118 /* Functions for matching context. */
4120 /* Initialize MCTX. */
4122 static reg_errcode_t
4123 match_ctx_init (mctx, eflags, n)
4124 re_match_context_t *mctx;
4125 int eflags, n;
4127 mctx->eflags = eflags;
4128 mctx->match_last = -1;
4129 if (n > 0)
4131 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4132 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4133 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4134 return REG_ESPACE;
4136 /* Already zero-ed by the caller.
4137 else
4138 mctx->bkref_ents = NULL;
4139 mctx->nbkref_ents = 0;
4140 mctx->nsub_tops = 0; */
4141 mctx->abkref_ents = n;
4142 mctx->max_mb_elem_len = 1;
4143 mctx->asub_tops = n;
4144 return REG_NOERROR;
4147 /* Clean the entries which depend on the current input in MCTX.
4148 This function must be invoked when the matcher changes the start index
4149 of the input, or changes the input string. */
4151 static void
4152 match_ctx_clean (mctx)
4153 re_match_context_t *mctx;
4155 int st_idx;
4156 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4158 int sl_idx;
4159 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4160 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4162 re_sub_match_last_t *last = top->lasts[sl_idx];
4163 re_free (last->path.array);
4164 re_free (last);
4166 re_free (top->lasts);
4167 if (top->path)
4169 re_free (top->path->array);
4170 re_free (top->path);
4172 free (top);
4175 mctx->nsub_tops = 0;
4176 mctx->nbkref_ents = 0;
4179 /* Free all the memory associated with MCTX. */
4181 static void
4182 match_ctx_free (mctx)
4183 re_match_context_t *mctx;
4185 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4186 match_ctx_clean (mctx);
4187 re_free (mctx->sub_tops);
4188 re_free (mctx->bkref_ents);
4191 /* Add a new backreference entry to MCTX.
4192 Note that we assume that caller never call this function with duplicate
4193 entry, and call with STR_IDX which isn't smaller than any existing entry.
4196 static reg_errcode_t
4197 match_ctx_add_entry (mctx, node, str_idx, from, to)
4198 re_match_context_t *mctx;
4199 int node, str_idx, from, to;
4201 if (mctx->nbkref_ents >= mctx->abkref_ents)
4203 struct re_backref_cache_entry* new_entry;
4204 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4205 mctx->abkref_ents * 2);
4206 if (BE (new_entry == NULL, 0))
4208 re_free (mctx->bkref_ents);
4209 return REG_ESPACE;
4211 mctx->bkref_ents = new_entry;
4212 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4213 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4214 mctx->abkref_ents *= 2;
4216 if (mctx->nbkref_ents > 0
4217 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4218 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4220 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4221 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4222 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4223 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4225 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4226 If bit N is clear, means that this entry won't epsilon-transition to
4227 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4228 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4229 such node.
4231 A backreference does not epsilon-transition unless it is empty, so set
4232 to all zeros if FROM != TO. */
4233 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4234 = (from == to ? ~0 : 0);
4236 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4237 if (mctx->max_mb_elem_len < to - from)
4238 mctx->max_mb_elem_len = to - from;
4239 return REG_NOERROR;
4242 /* Search for the first entry which has the same str_idx, or -1 if none is
4243 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4245 static int
4246 search_cur_bkref_entry (mctx, str_idx)
4247 re_match_context_t *mctx;
4248 int str_idx;
4250 int left, right, mid, last;
4251 last = right = mctx->nbkref_ents;
4252 for (left = 0; left < right;)
4254 mid = (left + right) / 2;
4255 if (mctx->bkref_ents[mid].str_idx < str_idx)
4256 left = mid + 1;
4257 else
4258 right = mid;
4260 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4261 return left;
4262 else
4263 return -1;
4266 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4267 at STR_IDX. */
4269 static reg_errcode_t
4270 match_ctx_add_subtop (mctx, node, str_idx)
4271 re_match_context_t *mctx;
4272 int node, str_idx;
4274 #ifdef DEBUG
4275 assert (mctx->sub_tops != NULL);
4276 assert (mctx->asub_tops > 0);
4277 #endif
4278 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4280 int new_asub_tops = mctx->asub_tops * 2;
4281 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4282 re_sub_match_top_t *,
4283 new_asub_tops);
4284 if (BE (new_array == NULL, 0))
4285 return REG_ESPACE;
4286 mctx->sub_tops = new_array;
4287 mctx->asub_tops = new_asub_tops;
4289 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4290 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4291 return REG_ESPACE;
4292 mctx->sub_tops[mctx->nsub_tops]->node = node;
4293 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4294 return REG_NOERROR;
4297 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4298 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4300 static re_sub_match_last_t *
4301 match_ctx_add_sublast (subtop, node, str_idx)
4302 re_sub_match_top_t *subtop;
4303 int node, str_idx;
4305 re_sub_match_last_t *new_entry;
4306 if (BE (subtop->nlasts == subtop->alasts, 0))
4308 int new_alasts = 2 * subtop->alasts + 1;
4309 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4310 re_sub_match_last_t *,
4311 new_alasts);
4312 if (BE (new_array == NULL, 0))
4313 return NULL;
4314 subtop->lasts = new_array;
4315 subtop->alasts = new_alasts;
4317 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4318 if (BE (new_entry != NULL, 1))
4320 subtop->lasts[subtop->nlasts] = new_entry;
4321 new_entry->node = node;
4322 new_entry->str_idx = str_idx;
4323 ++subtop->nlasts;
4325 return new_entry;
4328 static void
4329 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4330 re_sift_context_t *sctx;
4331 re_dfastate_t **sifted_sts, **limited_sts;
4332 int last_node, last_str_idx;
4334 sctx->sifted_states = sifted_sts;
4335 sctx->limited_states = limited_sts;
4336 sctx->last_node = last_node;
4337 sctx->last_str_idx = last_str_idx;
4338 re_node_set_init_empty (&sctx->limits);